sveltejs · khromov · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025
diff --git a/AGENTS.md b/AGENTS.md
@@ -21,7 +21,7 @@ bun run generate-report.ts
 bun run generate-report.ts results/result-2024-12-07-14-30-45.json
 
 # Run unit tests for lib modules
-bun run test:self
+bun test
 
 # Run TypeScript type checking
 bun tsc --noEmit
@@ -68,6 +68,9 @@ MCP integration is configured via the interactive CLI at runtime. Options:
 │   ├── report.ts               # Report generation orchestration
 │   ├── report-template.ts      # HTML report template generation
 │   ├── report-styles.ts        # CSS styles for HTML reports
+│   ├── token-cache.ts          # Token cache simulation for cost estimation
+│   ├── utils.ts                # Utility functions (sanitization, cost calculation, etc.)
+│   ├── utils.test.ts           # Unit tests for utility functions
 │   └── tools/
 │       ├── index.ts            # Tool exports
 │       ├── result-write.ts     # ResultWrite tool for final output
@@ -149,8 +152,75 @@ Key functions:
 
 - `extractPricingFromGatewayModel()`: Parse gateway model pricing
 - `buildPricingMap()`: Build lookup map from gateway models
+- `lookupPricingFromMap()`: Find pricing for a specific model
 - `calculateCost()`: Calculate total cost from token usage
 - `formatCost()` / `formatMTokCost()`: Format costs for display
+- `getModelPricingDisplay()`: Convert per-token costs to per-MTok for display
+
+### Token Cache Simulation
+
+The `lib/token-cache.ts` module simulates prompt caching behavior:
+
+**TokenCache Class:**
+
+- Models a growing prefix cache across multiple API calls
+- Tracks cache hits, cache writes, and output tokens
+- Calculates simulated costs using cache read/write rates
+- Default rates (used when pricing does not specify them): cache reads cost 10% of the input rate, cache writes 125%
+
+**Cache Behavior Model:**
+
+1. Each test runs in its own context (cache resets between tests)
+2. Step 1's input is written to cache (pays cache creation rate)
+3. Each subsequent step:
+   - The previous step's full input is read from the cache (pays cache read rate)
+   - New tokens extend the cache (pays cache creation rate)
+4. The cache prefix grows with each step
+
+**simulateCacheSavings()** (in `lib/utils.ts`):
+
+- Estimates cost savings with prompt caching enabled
+- Returns `simulatedCostWithCache`, `cacheHits`, and `cacheWriteTokens`
+- Results displayed in HTML report as "Cache Simulation" section
+- Shows potential savings compared to actual cost without caching
+
+### Utility Functions
+
+The `lib/utils.ts` module provides core utilities:
+
+- `sanitizeModelName()`: Convert model IDs to filesystem-safe names
+- `getTimestampedFilename()`: Generate timestamped filenames with optional model suffix
+- `isHttpUrl()`: Check if string is HTTP/HTTPS URL
+- `extractResultWriteContent()`: Extract component code from agent steps
+- `calculateTotalCost()`: Aggregate token usage and costs across all tests
+- `buildAgentPrompt()`: Build user message array from test definition
+- `simulateCacheSavings()`: Simulate cache savings using growing prefix model
+
+### Reference Verification
+
+The `lib/verify-references.ts` module verifies reference implementations:
+
+**Key Functions:**
+
+- `loadTestDefinitions()`: Discover test suites in `tests/` directory
+- `copyReferenceToComponent()`: Copy Reference.svelte to Component.svelte temporarily
+- `cleanupComponent()`: Remove temporary Component.svelte file
+- `runTest()`: Execute tests and collect detailed results
+- `printSummary()`: Display verification results summary
+- `verifyAllReferences()`: Main function that orchestrates the entire verification workflow
+
+**Workflow:**
+
+1. Discover all test suites with Reference.svelte
+2. For each test:
+   - Copy Reference.svelte → Component.svelte
+   - Run vitest against the test
+   - Collect pass/fail results
+   - Cleanup Component.svelte
+3. Print summary of all results
+4. Return exit code (0 for success, 1 for failures)
+
+Used by the `verify-references.ts` script, which is run via `bun run verify-tests`.
 
 ### Key Technologies
 
@@ -185,9 +255,10 @@ The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`:
    f. Test results are collected (pass/fail, error details)
    g. Output directory is cleaned up
 5. Results aggregated with pricing calculations
-6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
-7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
-8. Report automatically opens in default browser
+6. Cache simulation estimates potential savings
+7. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
+8. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
+9. Report automatically opens in default browser
 
 ### Output Files
 
@@ -227,7 +298,8 @@ All results are saved in the `results/` directory with timestamped filenames:
     "pricing": {
       "inputCostPerMTok": 3,
       "outputCostPerMTok": 15,
-      "cacheReadCostPerMTok": 0.3
+      "cacheReadCostPerMTok": 0.3,
+      "cacheCreationCostPerMTok": 3.75
     },
     "totalCost": {
       "inputCost": 0.003,
@@ -237,6 +309,11 @@ All results are saved in the `results/` directory with timestamped filenames:
       "inputTokens": 1000,
       "outputTokens": 1000,
       "cachedInputTokens": 1000
+    },
+    "cacheSimulation": {
+      "simulatedCostWithCache": 0.015,
+      "cacheHits": 2000,
+      "cacheWriteTokens": 1500
     }
   }
 }
@@ -251,8 +328,9 @@ Unit tests for library modules are in `lib/*.test.ts`:
 - `lib/output-test-runner.test.ts` - Output directory management
 - `lib/tools/result-write.test.ts` - ResultWrite tool behavior
 - `lib/tools/test-component.test.ts` - TestComponent tool behavior
+- `lib/utils.test.ts` - Utility functions, cost calculation, cache simulation
 
-Run unit tests with: `bun run test:self`
+Run unit tests with: `bun test`
 
 ## TypeScript Configuration
 
@@ -277,5 +355,10 @@ Run unit tests with: `bun run test:self`
 - All result files are saved with timestamps to preserve historical benchmarks
 - MCP integration can be configured via interactive CLI without code changes
 - MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
+- Cache simulation shows estimated savings if prompt caching were enabled
 - Exit code is 0 if all tests pass, 1 if any tests fail
 - Pricing is fetched from Vercel AI Gateway model metadata at runtime
+
+## Important notes
+
+Always run `bun tsc --noEmit` and `bun test` before completing work to make sure type checking and the unit tests pass.
diff --git a/index.ts b/index.ts
@@ -2,21 +2,16 @@ import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai";
 import { experimental_createMCPClient as createMCPClient } from "./node_modules/@ai-sdk/mcp/dist/index.mjs";
 import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs";
 import { writeFileSync, mkdirSync, existsSync } from "node:fs";
-import {
-  generateReport,
-  type SingleTestResult,
-} from "./lib/report.ts";
+import { generateReport, type SingleTestResult } from "./lib/report.ts";
 import {
   getTimestampedFilename,
   isHttpUrl,
   extractResultWriteContent,
   calculateTotalCost,
-} from "./lib/utils.ts";
-import {
-  discoverTests,
   buildAgentPrompt,
-  type TestDefinition,
-} from "./lib/test-discovery.ts";
+  simulateCacheSavings,
+} from "./lib/utils.ts";
+import { discoverTests, type TestDefinition } from "./lib/test-discovery.ts";
 import {
   setupOutputsDirectory,
   cleanupOutputsDirectory,
@@ -30,8 +25,6 @@ import {
   getModelPricingDisplay,
   formatCost,
   formatMTokCost,
-  type ModelPricingLookup,
-  type GatewayModel,
 } from "./lib/pricing.ts";
 import type { LanguageModel } from "ai";
 import {
@@ -48,9 +41,9 @@ import { gateway } from "ai";
 
 async function validateAndConfirmPricing(
   models: string[],
-  pricingMap: Map<string, ModelPricingLookup | null>,
+  pricingMap: ReturnType<typeof buildPricingMap>,
 ) {
-  const lookups = new Map<string, ModelPricingLookup | null>();
+  const lookups = new Map<string, ReturnType<typeof lookupPricingFromMap>>();
 
   for (const modelId of models) {
     const lookup = lookupPricingFromMap(modelId, pricingMap);
@@ -64,7 +57,15 @@ async function validateAndConfirmPricing(
     const pricingLines = models.map((modelId) => {
       const lookup = lookups.get(modelId)!;
       const display = getModelPricingDisplay(lookup.pricing);
-      return `${modelId}\n  → ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`;
+      const cacheReadText =
+        display.cacheReadCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read`
+          : "";
+      const cacheWriteText =
+        display.cacheCreationCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write`
+          : "";
+      return `${modelId}\n  → ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`;
     });
 
     note(pricingLines.join("\n\n"), "💰 Pricing Found");
@@ -96,8 +97,16 @@ async function validateAndConfirmPricing(
       for (const modelId of modelsWithPricing) {
         const lookup = lookups.get(modelId)!;
         const display = getModelPricingDisplay(lookup.pricing);
+        const cacheReadText =
+          display.cacheReadCostPerMTok !== undefined
+            ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read`
+            : "";
+        const cacheWriteText =
+          display.cacheCreationCostPerMTok !== undefined
+            ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write`
+            : "";
         lines.push(
-          `  ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in)`,
+          `  ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText})`,
         );
       }
     }
@@ -126,8 +135,7 @@ async function selectOptions() {
 
   const available_models = await gateway.getAvailableModels();
 
-  const gatewayModels = available_models.models as GatewayModel[];
-  const pricingMap = buildPricingMap(gatewayModels);
+  const pricingMap = buildPricingMap(available_models.models);
 
   const models = await multiselect({
     message: "Select model(s) to benchmark",
@@ -171,6 +179,7 @@ async function selectOptions() {
       { value: "http", label: "MCP over HTTP" },
       { value: "stdio", label: "MCP over StdIO" },
     ],
+    initialValue: "http",
   });
 
   if (isCancel(mcp_integration)) {
@@ -248,7 +257,7 @@ async function runSingleTest(
   console.log(`\n[${testIndex + 1}/${totalTests}] Running test: ${test.name}`);
   console.log("─".repeat(50));
 
-  const prompt = buildAgentPrompt(test);
+  const messages = buildAgentPrompt(test);
 
   try {
     const tools = {
@@ -301,7 +310,7 @@ async function runSingleTest(
     if (testComponentEnabled) {
       console.log("  📋 TestComponent tool is available");
     }
-    const result = await agent.generate({ prompt });
+    const result = await agent.generate({ messages });
 
     const resultWriteContent = extractResultWriteContent(result.steps);
 
@@ -385,9 +394,17 @@ async function main() {
     const lookup = pricing.lookups.get(modelId);
     if (pricing.enabled && lookup) {
       const display = getModelPricingDisplay(lookup.pricing);
+      const cacheReadText =
+        display.cacheReadCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read`
+          : "";
+      const cacheWriteText =
+        display.cacheCreationCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write`
+          : "";
       console.log(`   ${modelId}`);
       console.log(
-        `      💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`,
+        `      💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`,
       );
     } else {
       console.log(`   ${modelId}`);
@@ -456,8 +473,16 @@ async function main() {
 
     if (pricingLookup) {
       const display = getModelPricingDisplay(pricingLookup.pricing);
+      const cacheReadText =
+        display.cacheReadCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read`
+          : "";
+      const cacheWriteText =
+        display.cacheCreationCostPerMTok !== undefined
+          ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write`
+          : "";
       console.log(
-        `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`,
+        `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`,
       );
     }
 
@@ -514,6 +539,7 @@ async function main() {
 
     let totalCost = null;
     let pricingInfo = null;
+    let cacheSimulation = null;
 
     if (pricingLookup) {
       totalCost = calculateTotalCost(testResults, pricingLookup.pricing);
@@ -522,6 +548,7 @@ async function main() {
         inputCostPerMTok: pricingDisplay.inputCostPerMTok,
         outputCostPerMTok: pricingDisplay.outputCostPerMTok,
         cacheReadCostPerMTok: pricingDisplay.cacheReadCostPerMTok,
+        cacheCreationCostPerMTok: pricingDisplay.cacheCreationCostPerMTok,
       };
 
       console.log("\n💰 Cost Summary");
@@ -534,10 +561,45 @@ async function main() {
       );
       if (totalCost.cachedInputTokens > 0) {
         console.log(
-          `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()} (${formatCost(totalCost.cacheReadCost)})`,
+          `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()}`,
         );
       }
       console.log(`Total cost: ${formatCost(totalCost.totalCost)}`);
+
+      // Simulate cache savings
+      cacheSimulation = simulateCacheSavings(
+        testResults,
+        pricingLookup.pricing,
+      );
+      if (
+        cacheSimulation.cacheHits > 0 ||
+        cacheSimulation.cacheWriteTokens > 0
+      ) {
+        console.log("\n📊 Cache Simulation (estimated with prompt caching):");
+        console.log("─".repeat(50));
+        const totalCacheTokens =
+          cacheSimulation.cacheHits + cacheSimulation.cacheWriteTokens;
+        console.log(
+          `Cache reads: ${cacheSimulation.cacheHits.toLocaleString()} tokens`,
+        );
+        console.log(
+          `Cache writes: ${cacheSimulation.cacheWriteTokens.toLocaleString()} tokens`,
+        );
+        console.log(
+          `Total input tokens: ${totalCacheTokens.toLocaleString()} (reads + writes)`,
+        );
+        console.log(
+          `Estimated cost with cache: ${formatCost(cacheSimulation.simulatedCostWithCache)}`,
+        );
+        const savings =
+          totalCost.totalCost - cacheSimulation.simulatedCostWithCache;
+        const savingsPercent = (savings / totalCost.totalCost) * 100;
+        if (savings > 0) {
+          console.log(
+            `Potential savings: ${formatCost(savings)} (${savingsPercent.toFixed(1)}%)`,
+          );
+        }
+      }
     }
 
     const resultsDir = "results";
@@ -561,6 +623,7 @@ async function main() {
         pricingKey: pricingLookup?.matchedKey ?? null,
         pricing: pricingInfo,
         totalCost,
+        cacheSimulation,
       },
     };