diff --git a/AGENTS.md b/AGENTS.md index 3e7c825..3cbfbf8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,7 +21,7 @@ bun run generate-report.ts bun run generate-report.ts results/result-2024-12-07-14-30-45.json # Run unit tests for lib modules -bun run test:self +bun test # Run TypeScript type checking bun tsc --noEmit @@ -68,6 +68,9 @@ MCP integration is configured via the interactive CLI at runtime. Options: │ ├── report.ts # Report generation orchestration │ ├── report-template.ts # HTML report template generation │ ├── report-styles.ts # CSS styles for HTML reports +│ ├── token-cache.ts # Token cache simulation for cost estimation +│ ├── utils.ts # Utility functions (sanitization, cost calculation, etc.) +│ ├── utils.test.ts # Unit tests for utility functions │ └── tools/ │ ├── index.ts # Tool exports │ ├── result-write.ts # ResultWrite tool for final output @@ -149,8 +152,75 @@ Key functions: - `extractPricingFromGatewayModel()`: Parse gateway model pricing - `buildPricingMap()`: Build lookup map from gateway models +- `lookupPricingFromMap()`: Find pricing for a specific model - `calculateCost()`: Calculate total cost from token usage - `formatCost()` / `formatMTokCost()`: Format costs for display +- `getModelPricingDisplay()`: Convert per-token costs to per-MTok for display + +### Token Cache Simulation + +The `lib/token-cache.ts` module simulates prompt caching behavior: + +**TokenCache Class:** + +- Models a growing prefix cache across multiple API calls +- Tracks cache hits, cache writes, and output tokens +- Calculates simulated costs using cache read/write rates +- Default rates: 10% of the input rate for reads, 125% of the input rate for writes (if not specified in pricing) + +**Cache Behavior Model:** + +1. Each test runs in its own context (cache resets between tests) +2. Step 1's input is written to cache (pays cache creation rate) +3. Each subsequent step: + - Previous step's full input is cached (pays cache read rate) + - New tokens extend the cache (pays cache creation rate) +4. 
The cache prefix grows with each step + +**simulateCacheSavings()** (in `lib/utils.ts`): + +- Estimates cost savings with prompt caching enabled +- Returns `simulatedCostWithCache`, `cacheHits`, and `cacheWriteTokens` +- Results are displayed in the HTML report as an "Estimated cost with prompt cache" row (and in the console as "Cache Simulation") +- Shows potential savings compared to actual cost without caching + +### Utility Functions + +The `lib/utils.ts` module provides core utilities: + +- `sanitizeModelName()`: Convert model IDs to filesystem-safe names +- `getTimestampedFilename()`: Generate timestamped filenames with optional model suffix +- `isHttpUrl()`: Check whether a string is an HTTP/HTTPS URL +- `extractResultWriteContent()`: Extract component code from agent steps +- `calculateTotalCost()`: Aggregate token usage and costs across all tests +- `buildAgentPrompt()`: Build user message array from test definition +- `simulateCacheSavings()`: Simulate cache savings using a growing-prefix model + +### Reference Verification + +The `lib/verify-references.ts` module verifies reference implementations: + +**Key Functions:** + +- `loadTestDefinitions()`: Discover test suites in `tests/` directory +- `copyReferenceToComponent()`: Copy Reference.svelte to Component.svelte temporarily +- `cleanupComponent()`: Remove temporary Component.svelte file +- `runTest()`: Execute tests and collect detailed results +- `printSummary()`: Display verification results summary +- `verifyAllReferences()`: Main function that orchestrates the entire verification workflow + +**Workflow:** + +1. Discover all test suites with Reference.svelte +2. For each test: + - Copy Reference.svelte → Component.svelte + - Run vitest against the test + - Collect pass/fail results + - Clean up Component.svelte +3. Print summary of all results +4. Return exit code (0 for success, 1 for failures) + +Used by the `verify-references.ts` script, accessible via `bun run verify-tests`. 
### Key Technologies @@ -185,9 +255,10 @@ The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`: f. Test results are collected (pass/fail, error details) g. Output directory is cleaned up 5. Results aggregated with pricing calculations -6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json` -7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html` -8. Report automatically opens in default browser +6. Cache simulation estimates potential savings +7. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json` +8. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html` +9. Report automatically opens in default browser ### Output Files @@ -227,7 +298,8 @@ All results are saved in the `results/` directory with timestamped filenames: "pricing": { "inputCostPerMTok": 3, "outputCostPerMTok": 15, - "cacheReadCostPerMTok": 0.3 + "cacheReadCostPerMTok": 0.3, + "cacheCreationCostPerMTok": 3.75 }, "totalCost": { "inputCost": 0.003, @@ -237,6 +309,11 @@ All results are saved in the `results/` directory with timestamped filenames: "inputTokens": 1000, "outputTokens": 1000, "cachedInputTokens": 1000 + }, + "cacheSimulation": { + "simulatedCostWithCache": 0.015, + "cacheHits": 2000, + "cacheWriteTokens": 1500 } } } @@ -251,8 +328,9 @@ Unit tests for library modules are in `lib/*.test.ts`: - `lib/output-test-runner.test.ts` - Output directory management - `lib/tools/result-write.test.ts` - ResultWrite tool behavior - `lib/tools/test-component.test.ts` - TestComponent tool behavior +- `lib/utils.test.ts` - Utility functions, cost calculation, cache simulation -Run unit tests with: `bun run test:self` +Run unit tests with: `bun test` ## TypeScript Configuration @@ -277,5 +355,10 @@ Run unit tests with: `bun run test:self` - All result files are saved with timestamps to preserve historical benchmarks - MCP integration can be configured via interactive CLI without code changes - MCP status is clearly indicated in both the JSON 
metadata and HTML report with a visual badge +- Cache simulation shows estimated savings if prompt caching were enabled - Exit code is 0 if all tests pass, 1 if any tests fail - Pricing is fetched from Vercel AI Gateway model metadata at runtime + +## Important notes + +Always run `bun run tsc` and `bun test` before completing work to make sure the TypeScript types and tests work. diff --git a/index.ts b/index.ts index b4a4bb3..99c68b0 100644 --- a/index.ts +++ b/index.ts @@ -2,21 +2,16 @@ import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai"; import { experimental_createMCPClient as createMCPClient } from "./node_modules/@ai-sdk/mcp/dist/index.mjs"; import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs"; import { writeFileSync, mkdirSync, existsSync } from "node:fs"; -import { - generateReport, - type SingleTestResult, -} from "./lib/report.ts"; +import { generateReport, type SingleTestResult } from "./lib/report.ts"; import { getTimestampedFilename, isHttpUrl, extractResultWriteContent, calculateTotalCost, -} from "./lib/utils.ts"; -import { - discoverTests, buildAgentPrompt, - type TestDefinition, -} from "./lib/test-discovery.ts"; + simulateCacheSavings, +} from "./lib/utils.ts"; +import { discoverTests, type TestDefinition } from "./lib/test-discovery.ts"; import { setupOutputsDirectory, cleanupOutputsDirectory, @@ -30,8 +25,6 @@ import { getModelPricingDisplay, formatCost, formatMTokCost, - type ModelPricingLookup, - type GatewayModel, } from "./lib/pricing.ts"; import type { LanguageModel } from "ai"; import { @@ -48,9 +41,9 @@ import { gateway } from "ai"; async function validateAndConfirmPricing( models: string[], - pricingMap: Map, + pricingMap: ReturnType, ) { - const lookups = new Map(); + const lookups = new Map>(); for (const modelId of models) { const lookup = lookupPricingFromMap(modelId, pricingMap); @@ -64,7 +57,15 @@ async function validateAndConfirmPricing( 
const pricingLines = models.map((modelId) => { const lookup = lookups.get(modelId)!; const display = getModelPricingDisplay(lookup.pricing); - return `${modelId}\n → ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`; + const cacheReadText = + display.cacheReadCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read` + : ""; + const cacheWriteText = + display.cacheCreationCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write` + : ""; + return `${modelId}\n → ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`; }); note(pricingLines.join("\n\n"), "💰 Pricing Found"); @@ -96,8 +97,16 @@ async function validateAndConfirmPricing( for (const modelId of modelsWithPricing) { const lookup = lookups.get(modelId)!; const display = getModelPricingDisplay(lookup.pricing); + const cacheReadText = + display.cacheReadCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read` + : ""; + const cacheWriteText = + display.cacheCreationCostPerMTok !== undefined + ? 
`, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write` + : ""; lines.push( - ` ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in)`, + ` ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText})`, ); } } @@ -126,8 +135,7 @@ async function selectOptions() { const available_models = await gateway.getAvailableModels(); - const gatewayModels = available_models.models as GatewayModel[]; - const pricingMap = buildPricingMap(gatewayModels); + const pricingMap = buildPricingMap(available_models.models); const models = await multiselect({ message: "Select model(s) to benchmark", @@ -171,6 +179,7 @@ async function selectOptions() { { value: "http", label: "MCP over HTTP" }, { value: "stdio", label: "MCP over StdIO" }, ], + initialValue: "http", }); if (isCancel(mcp_integration)) { @@ -248,7 +257,7 @@ async function runSingleTest( console.log(`\n[${testIndex + 1}/${totalTests}] Running test: ${test.name}`); console.log("─".repeat(50)); - const prompt = buildAgentPrompt(test); + const messages = buildAgentPrompt(test); try { const tools = { @@ -301,7 +310,7 @@ async function runSingleTest( if (testComponentEnabled) { console.log(" 📋 TestComponent tool is available"); } - const result = await agent.generate({ prompt }); + const result = await agent.generate({ messages }); const resultWriteContent = extractResultWriteContent(result.steps); @@ -385,9 +394,17 @@ async function main() { const lookup = pricing.lookups.get(modelId); if (pricing.enabled && lookup) { const display = getModelPricingDisplay(lookup.pricing); + const cacheReadText = + display.cacheReadCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read` + : ""; + const cacheWriteText = + display.cacheCreationCostPerMTok !== undefined + ? 
`, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write` + : ""; console.log(` ${modelId}`); console.log( - ` 💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, + ` 💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`, ); } else { console.log(` ${modelId}`); @@ -456,8 +473,16 @@ async function main() { if (pricingLookup) { const display = getModelPricingDisplay(pricingLookup.pricing); + const cacheReadText = + display.cacheReadCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheReadCostPerMTok)}/MTok cache read` + : ""; + const cacheWriteText = + display.cacheCreationCostPerMTok !== undefined + ? `, ${formatMTokCost(display.cacheCreationCostPerMTok)}/MTok cache write` + : ""; console.log( - `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, + `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out${cacheReadText}${cacheWriteText}`, ); } @@ -514,6 +539,7 @@ async function main() { let totalCost = null; let pricingInfo = null; + let cacheSimulation = null; if (pricingLookup) { totalCost = calculateTotalCost(testResults, pricingLookup.pricing); @@ -522,6 +548,7 @@ async function main() { inputCostPerMTok: pricingDisplay.inputCostPerMTok, outputCostPerMTok: pricingDisplay.outputCostPerMTok, cacheReadCostPerMTok: pricingDisplay.cacheReadCostPerMTok, + cacheCreationCostPerMTok: pricingDisplay.cacheCreationCostPerMTok, }; console.log("\n💰 Cost Summary"); @@ -534,10 +561,45 @@ async function main() { ); if (totalCost.cachedInputTokens > 0) { console.log( - `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()} (${formatCost(totalCost.cacheReadCost)})`, + `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()}`, ); } console.log(`Total cost: 
${formatCost(totalCost.totalCost)}`); + + // Simulate cache savings + cacheSimulation = simulateCacheSavings( + testResults, + pricingLookup.pricing, + ); + if ( + cacheSimulation.cacheHits > 0 || + cacheSimulation.cacheWriteTokens > 0 + ) { + console.log("\n📊 Cache Simulation (estimated with prompt caching):"); + console.log("─".repeat(50)); + const totalCacheTokens = + cacheSimulation.cacheHits + cacheSimulation.cacheWriteTokens; + console.log( + `Cache reads: ${cacheSimulation.cacheHits.toLocaleString()} tokens`, + ); + console.log( + `Cache writes: ${cacheSimulation.cacheWriteTokens.toLocaleString()} tokens`, + ); + console.log( + `Total input tokens: ${totalCacheTokens.toLocaleString()} (reads + writes)`, + ); + console.log( + `Estimated cost with cache: ${formatCost(cacheSimulation.simulatedCostWithCache)}`, + ); + const savings = + totalCost.totalCost - cacheSimulation.simulatedCostWithCache; + const savingsPercent = (savings / totalCost.totalCost) * 100; + if (savings > 0) { + console.log( + `Potential savings: ${formatCost(savings)} (${savingsPercent.toFixed(1)}%)`, + ); + } + } } const resultsDir = "results"; @@ -561,6 +623,7 @@ async function main() { pricingKey: pricingLookup?.matchedKey ?? 
null, pricing: pricingInfo, totalCost, + cacheSimulation, }, }; diff --git a/lib/pricing.test.ts b/lib/pricing.test.ts index c5edfc8..151689c 100644 --- a/lib/pricing.test.ts +++ b/lib/pricing.test.ts @@ -7,13 +7,12 @@ import { formatCost, formatMTokCost, getModelPricingDisplay, - type ModelPricing, - type GatewayModel, } from "./pricing.ts"; +import type { GatewayLanguageModelEntry } from "@ai-sdk/gateway"; describe("extractPricingFromGatewayModel", () => { it("should extract pricing from a gateway model with all fields", () => { - const model: GatewayModel = { + const model: GatewayLanguageModelEntry = { id: "anthropic/claude-opus-4.5", name: "Claude Opus 4.5", pricing: { @@ -22,6 +21,11 @@ describe("extractPricingFromGatewayModel", () => { cachedInputTokens: "0.0000005", cacheCreationInputTokens: "0.00000625", }, + specification: { + specificationVersion: "v2", + provider: "anthropic", + modelId: "claude-opus-4.5", + }, modelType: "language", }; @@ -35,13 +39,18 @@ describe("extractPricingFromGatewayModel", () => { }); it("should extract pricing with only input and output", () => { - const model: GatewayModel = { + const model: GatewayLanguageModelEntry = { id: "openai/gpt-4o", name: "GPT-4o", pricing: { input: "0.000003", output: "0.000015", }, + specification: { + specificationVersion: "v2", + provider: "openai", + modelId: "gpt-4o", + }, modelType: "language", }; @@ -51,12 +60,18 @@ describe("extractPricingFromGatewayModel", () => { expect(pricing!.inputCostPerToken).toBe(0.000003); expect(pricing!.outputCostPerToken).toBe(0.000015); expect(pricing!.cacheReadInputTokenCost).toBeUndefined(); + expect(pricing!.cacheCreationInputTokenCost).toBeUndefined(); }); it("should return null for model without pricing", () => { - const model: GatewayModel = { + const model: GatewayLanguageModelEntry = { id: "local/model", name: "Local Model", + specification: { + specificationVersion: "v2", + provider: "local", + modelId: "model", + }, modelType: "language", }; @@ -65,12 
+80,17 @@ describe("extractPricingFromGatewayModel", () => { }); it("should throw error for model with empty pricing object", () => { - const model: GatewayModel = { + const model = { id: "local/model", name: "Local Model", - pricing: {}, + pricing: {} as any, + specification: { + specificationVersion: "v2", + provider: "local", + modelId: "model", + }, modelType: "language", - }; + } as GatewayLanguageModelEntry; expect(() => extractPricingFromGatewayModel(model)).toThrowError( /Invalid pricing/, @@ -78,13 +98,18 @@ describe("extractPricingFromGatewayModel", () => { }); it("should throw error for invalid pricing values", () => { - const model: GatewayModel = { + const model: GatewayLanguageModelEntry = { id: "test/model", name: "Test Model", pricing: { input: "invalid", output: "0.000015", }, + specification: { + specificationVersion: "v2", + provider: "test", + modelId: "model", + }, modelType: "language", }; @@ -96,22 +121,37 @@ describe("extractPricingFromGatewayModel", () => { describe("buildPricingMap", () => { it("should build a map from gateway models", () => { - const models: GatewayModel[] = [ + const models: GatewayLanguageModelEntry[] = [ { id: "anthropic/claude-sonnet-4", name: "Claude Sonnet 4", pricing: { input: "0.000003", output: "0.000015" }, + specification: { + specificationVersion: "v2", + provider: "anthropic", + modelId: "claude-sonnet-4", + }, modelType: "language", }, { id: "openai/gpt-4o", name: "GPT-4o", pricing: { input: "0.000005", output: "0.000015" }, + specification: { + specificationVersion: "v2", + provider: "openai", + modelId: "gpt-4o", + }, modelType: "language", }, { id: "local/model", name: "Local Model", + specification: { + specificationVersion: "v2", + provider: "local", + modelId: "model", + }, modelType: "language", }, ]; @@ -127,11 +167,16 @@ describe("buildPricingMap", () => { describe("lookupPricingFromMap", () => { it("should return pricing lookup for existing model", () => { - const models: GatewayModel[] = [ + const 
models: GatewayLanguageModelEntry[] = [ { id: "anthropic/claude-sonnet-4", name: "Claude Sonnet 4", pricing: { input: "0.000003", output: "0.000015" }, + specification: { + specificationVersion: "v2", + provider: "anthropic", + modelId: "claude-sonnet-4", + }, modelType: "language", }, ]; @@ -152,82 +197,40 @@ describe("lookupPricingFromMap", () => { }); describe("calculateCost", () => { - const basePricing: ModelPricing = { + const basePricing = { inputCostPerToken: 0.000003, // $3 per MTok outputCostPerToken: 0.000015, // $15 per MTok - }; + } satisfies NonNullable>; - const pricingWithCache: ModelPricing = { + const pricingWithCache = { ...basePricing, cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok (10% of input) - }; + } satisfies NonNullable>; describe("basic cost calculation", () => { - it("should calculate cost with no cached tokens", () => { - const result = calculateCost(basePricing, 1000, 500, 0); + it("should calculate cost correctly", () => { + const result = calculateCost(basePricing, 1000, 500); expect(result.inputTokens).toBe(1000); expect(result.outputTokens).toBe(500); - expect(result.cachedInputTokens).toBe(0); expect(result.inputCost).toBe(0.003); // 1000 * $3/MTok expect(result.outputCost).toBeCloseTo(0.0075); // 500 * $15/MTok - expect(result.cacheReadCost).toBe(0); expect(result.totalCost).toBe(0.0105); }); - - it("should default cachedInputTokens to 0", () => { - const result = calculateCost(basePricing, 1000, 500); - - expect(result.cachedInputTokens).toBe(0); - expect(result.inputCost).toBe(0.003); - }); }); - describe("cached token billing", () => { - it("should bill cached tokens at reduced rate", () => { - // 1000 input tokens, 800 are cached - const result = calculateCost(pricingWithCache, 1000, 500, 800); - - expect(result.inputTokens).toBe(1000); - expect(result.cachedInputTokens).toBe(800); - // Uncached: 200 tokens * $3/MTok = $0.0006 - expect(result.inputCost).toBeCloseTo(0.0006); - // Cached: 800 tokens * $0.30/MTok = 
$0.00024 - expect(result.cacheReadCost).toBeCloseTo(0.00024); - // Output: 500 * $15/MTok = $0.0075 - expect(result.outputCost).toBeCloseTo(0.0075); - expect(result.totalCost).toBeCloseTo(0.00834); - }); - - it("should treat cached tokens as free when no cache rate specified", () => { - // Using basePricing which has no cacheReadInputTokenCost - const result = calculateCost(basePricing, 1000, 500, 800); - - // Only 200 uncached tokens should be billed - expect(result.inputCost).toBeCloseTo(0.0006); - expect(result.cacheReadCost).toBe(0); - }); - - it("should handle all tokens being cached", () => { - const result = calculateCost(pricingWithCache, 1000, 500, 1000); - - expect(result.inputCost).toBe(0); - expect(result.cacheReadCost).toBe(0.0003); // 1000 * $0.30/MTok - }); - }); describe("edge cases", () => { it("should handle zero tokens", () => { - const result = calculateCost(basePricing, 0, 0, 0); + const result = calculateCost(basePricing, 0, 0); expect(result.inputCost).toBe(0); expect(result.outputCost).toBe(0); - expect(result.cacheReadCost).toBe(0); expect(result.totalCost).toBe(0); }); it("should handle large token counts", () => { - const result = calculateCost(basePricing, 1_000_000, 500_000, 0); + const result = calculateCost(basePricing, 1_000_000, 500_000); expect(result.inputCost).toBe(3); // 1M * $3/MTok expect(result.outputCost).toBe(7.5); // 500K * $15/MTok @@ -235,11 +238,11 @@ describe("calculateCost", () => { }); it("should handle pricing with zero costs", () => { - const freePricing: ModelPricing = { + const freePricing = { inputCostPerToken: 0, outputCostPerToken: 0, - }; - const result = calculateCost(freePricing, 1000, 500, 0); + } satisfies NonNullable>; + const result = calculateCost(freePricing, 1000, 500); expect(result.totalCost).toBe(0); }); @@ -291,37 +294,55 @@ describe("formatMTokCost", () => { describe("getModelPricingDisplay", () => { it("should convert per-token costs to per-MTok", () => { - const pricing: ModelPricing = { + 
const pricing = { inputCostPerToken: 0.000003, // $3 per MTok outputCostPerToken: 0.000015, // $15 per MTok - }; + } satisfies NonNullable>; const display = getModelPricingDisplay(pricing); expect(display.inputCostPerMTok).toBe(3); expect(display.outputCostPerMTok).toBe(15); expect(display.cacheReadCostPerMTok).toBeUndefined(); + expect(display.cacheCreationCostPerMTok).toBeUndefined(); }); it("should include cache read cost when available", () => { - const pricing: ModelPricing = { + const pricing = { inputCostPerToken: 0.000003, outputCostPerToken: 0.000015, cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok - }; + } satisfies NonNullable>; + + const display = getModelPricingDisplay(pricing); + + expect(display.inputCostPerMTok).toBe(3); + expect(display.outputCostPerMTok).toBe(15); + expect(display.cacheReadCostPerMTok).toBe(0.3); + expect(display.cacheCreationCostPerMTok).toBeUndefined(); + }); + + it("should include cache creation cost when available", () => { + const pricing = { + inputCostPerToken: 0.000003, + outputCostPerToken: 0.000015, + cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok + cacheCreationInputTokenCost: 0.00000375, // $3.75 per MTok + } satisfies NonNullable>; const display = getModelPricingDisplay(pricing); expect(display.inputCostPerMTok).toBe(3); expect(display.outputCostPerMTok).toBe(15); expect(display.cacheReadCostPerMTok).toBe(0.3); + expect(display.cacheCreationCostPerMTok).toBe(3.75); }); it("should handle zero costs", () => { - const pricing: ModelPricing = { + const pricing = { inputCostPerToken: 0, outputCostPerToken: 0, - }; + } satisfies NonNullable>; const display = getModelPricingDisplay(pricing); @@ -330,11 +351,11 @@ describe("getModelPricingDisplay", () => { }); it("should preserve explicit zero cost for cache read", () => { - const pricing: ModelPricing = { + const pricing = { inputCostPerToken: 0.000003, outputCostPerToken: 0.000015, cacheReadInputTokenCost: 0, - }; + } satisfies NonNullable>; const display = 
getModelPricingDisplay(pricing); diff --git a/lib/pricing.ts b/lib/pricing.ts index 46429fa..fbb314a 100644 --- a/lib/pricing.ts +++ b/lib/pricing.ts @@ -1,53 +1,7 @@ -export interface ModelPricing { - inputCostPerToken: number; - outputCostPerToken: number; - cacheReadInputTokenCost?: number; - cacheCreationInputTokenCost?: number; -} - -export interface CostCalculation { - inputCost: number; - outputCost: number; - cacheReadCost: number; - totalCost: number; - inputTokens: number; - outputTokens: number; - cachedInputTokens: number; -} - -export interface ModelPricingDisplay { - inputCostPerMTok: number; - outputCostPerMTok: number; - cacheReadCostPerMTok?: number; -} - -export interface ModelPricingLookup { - pricing: ModelPricing; - matchedKey: string; -} - -export interface GatewayPricing { - input?: string; - output?: string; - cachedInputTokens?: string; - cacheCreationInputTokens?: string; -} - -export interface GatewayModel { - id: string; - name: string; - description?: string; - pricing?: GatewayPricing; - specification?: { - specificationVersion: string; - provider: string; - modelId: string; - }; - modelType: string; -} +import type { GatewayLanguageModelEntry } from "@ai-sdk/gateway"; export function extractPricingFromGatewayModel( - model: GatewayModel, + model: GatewayLanguageModelEntry, ) { if (!model.pricing) { return null; @@ -64,9 +18,14 @@ export function extractPricingFromGatewayModel( ); } - const result: ModelPricing = { + const result = { inputCostPerToken: inputCost, outputCostPerToken: outputCost, + } as { + inputCostPerToken: number; + outputCostPerToken: number; + cacheReadInputTokenCost?: number; + cacheCreationInputTokenCost?: number; }; if (pricing.cachedInputTokens) { @@ -86,10 +45,14 @@ export function extractPricingFromGatewayModel( return result; } -export function buildPricingMap( - models: GatewayModel[], -) { - const map = new Map(); +export function buildPricingMap(models: GatewayLanguageModelEntry[]) { + const map = new Map< 
+ string, + { + pricing: NonNullable>; + matchedKey: string; + } | null + >(); for (const model of models) { const pricing = extractPricingFromGatewayModel(model); @@ -108,13 +71,13 @@ export function buildPricingMap( export function lookupPricingFromMap( modelId: string, - pricingMap: Map, + pricingMap: ReturnType, ) { return pricingMap.get(modelId) ?? null; } export function getModelPricingDisplay( - pricing: ModelPricing, + pricing: NonNullable>, ) { return { inputCostPerMTok: pricing.inputCostPerToken * 1_000_000, @@ -123,31 +86,27 @@ export function getModelPricingDisplay( pricing.cacheReadInputTokenCost !== undefined ? pricing.cacheReadInputTokenCost * 1_000_000 : undefined, + cacheCreationCostPerMTok: + pricing.cacheCreationInputTokenCost !== undefined + ? pricing.cacheCreationInputTokenCost * 1_000_000 + : undefined, }; } export function calculateCost( - pricing: ModelPricing, + pricing: NonNullable>, inputTokens: number, outputTokens: number, - cachedInputTokens: number = 0, ) { - const uncachedInputTokens = inputTokens - cachedInputTokens; - const inputCost = uncachedInputTokens * pricing.inputCostPerToken; - + const inputCost = inputTokens * pricing.inputCostPerToken; const outputCost = outputTokens * pricing.outputCostPerToken; - const cacheReadCost = - cachedInputTokens * (pricing.cacheReadInputTokenCost ?? 0); - return { inputCost, outputCost, - cacheReadCost, - totalCost: inputCost + outputCost + cacheReadCost, + totalCost: inputCost + outputCost, inputTokens, outputTokens, - cachedInputTokens, }; } diff --git a/lib/report-template.ts b/lib/report-template.ts index 2ecb63a..b5b0d42 100644 --- a/lib/report-template.ts +++ b/lib/report-template.ts @@ -98,9 +98,7 @@ function renderContentBlock(block: ContentBlock) { return ""; } -function renderVerificationResult( - verification: TestVerificationResult | null, -) { +function renderVerificationResult(verification: TestVerificationResult | null) { if (!verification) { return `
@@ -259,6 +257,16 @@ function renderPricingSection(data: MultiTestResultData) { ? `${escapeHtml(pricingKey)}` : ""; + const cacheReadText = + pricing.cacheReadCostPerMTok !== undefined + ? `·${formatMTokCost(pricing.cacheReadCostPerMTok)}/MTok cache read` + : ""; + + const cacheWriteText = + pricing.cacheCreationCostPerMTok !== undefined + ? `·${formatMTokCost(pricing.cacheCreationCostPerMTok)}/MTok cache write` + : ""; + pricingInfoHtml = `
Model Pricing: @@ -266,21 +274,33 @@ function renderPricingSection(data: MultiTestResultData) { ${formatMTokCost(pricing.inputCostPerMTok)}/MTok in · ${formatMTokCost(pricing.outputCostPerMTok)}/MTok out - ${pricing.cacheReadCostPerMTok !== undefined ? `·${formatMTokCost(pricing.cacheReadCostPerMTok)}/MTok cached` : ""} + ${cacheReadText} + ${cacheWriteText}
`; } let costBreakdownHtml = ""; if (totalCost) { - const uncachedInputTokens = - totalCost.inputTokens - totalCost.cachedInputTokens; + const cacheSimRow = + data.metadata.cacheSimulation && + pricing?.cacheReadCostPerMTok !== undefined && + (data.metadata.cacheSimulation.cacheHits > 0 || + data.metadata.cacheSimulation.cacheWriteTokens > 0) + ? ` +
+ Estimated cost with prompt cache: + ${data.metadata.cacheSimulation.cacheHits.toLocaleString()} reads + ${data.metadata.cacheSimulation.cacheWriteTokens.toLocaleString()} writes = ${(data.metadata.cacheSimulation.cacheHits + data.metadata.cacheSimulation.cacheWriteTokens).toLocaleString()} tokens + ${formatCost(data.metadata.cacheSimulation.simulatedCostWithCache)} +
+ ` + : ""; costBreakdownHtml = `
Input tokens: - ${uncachedInputTokens.toLocaleString()} + ${totalCost.inputTokens.toLocaleString()} ${formatCost(totalCost.inputCost)}
@@ -292,9 +312,9 @@ function renderPricingSection(data: MultiTestResultData) { totalCost.cachedInputTokens > 0 ? `
- Cached tokens: + Cached tokens (from usage): ${totalCost.cachedInputTokens.toLocaleString()} ⚡ - ${formatCost(totalCost.cacheReadCost)} + -
` : "" @@ -304,6 +324,7 @@ function renderPricingSection(data: MultiTestResultData) { ${formatCost(totalCost.totalCost)}
+ ${cacheSimRow}
`; } @@ -388,7 +409,7 @@ function getPricingStyles() { .cost-row { display: grid; - grid-template-columns: 120px 1fr auto; + grid-template-columns: 200px 1fr auto; gap: 8px; align-items: center; font-size: 13px; @@ -398,6 +419,14 @@ function getPricingStyles() { color: var(--text-muted); } + .cost-row.simulated { + margin-top: 8px; + padding-top: 8px; + border-top: 1px dashed var(--border); + color: var(--text-muted); + font-style: italic; + } + .cost-row.total { margin-top: 8px; padding-top: 8px; @@ -429,6 +458,10 @@ function getPricingStyles() { color: var(--success); font-size: 15px; } + + .cost-row.simulated .cost-value { + color: var(--mcp-enabled); + } `; } diff --git a/lib/report.ts b/lib/report.ts index c7e50bf..a2ff3c2 100644 --- a/lib/report.ts +++ b/lib/report.ts @@ -1,6 +1,7 @@ import { readFile, writeFile } from "node:fs/promises"; import type { TestVerificationResult } from "./output-test-runner.ts"; import { generateMultiTestHtml } from "./report-template.ts"; +import type { simulateCacheSavings } from "./utils.ts"; interface TextBlock { type: "text"; @@ -69,12 +70,12 @@ export interface PricingInfo { inputCostPerMTok: number; outputCostPerMTok: number; cacheReadCostPerMTok?: number; + cacheCreationCostPerMTok?: number; } export interface TotalCostInfo { inputCost: number; outputCost: number; - cacheReadCost: number; totalCost: number; inputTokens: number; outputTokens: number; @@ -90,6 +91,7 @@ interface Metadata { pricingKey?: string | null; pricing?: PricingInfo | null; totalCost?: TotalCostInfo | null; + cacheSimulation?: ReturnType | null; } export interface SingleTestResult { diff --git a/lib/test-discovery.ts b/lib/test-discovery.ts index 414292b..826f619 100644 --- a/lib/test-discovery.ts +++ b/lib/test-discovery.ts @@ -61,9 +61,3 @@ export function discoverTests() { return definitions; } - -export function buildAgentPrompt(test: TestDefinition) { - return `${test.prompt} - -IMPORTANT: When you have finished implementing the component, use 
/**
 * Running model of a growing prompt-prefix cache across consecutive API calls.
 *
 * The cache starts with an initial token count. Each `addMessage` call first
 * counts every token already held as one cache read, then extends the prefix
 * with the new tokens. Output tokens are tallied separately and never enter
 * the cached prefix.
 */
export class TokenCache {
  /** Fallback cache-read rate, as a fraction of the regular input rate. */
  private static readonly DEFAULT_READ_FACTOR = 0.1;
  /** Fallback cache-write rate, as a fraction of the regular input rate. */
  private static readonly DEFAULT_WRITE_FACTOR = 1.25;

  /** Tokens currently held in the cached prefix. */
  private currentTokens: number;
  /** Sum of prefix sizes served from cache over all `addMessage` calls. */
  private totalCachedTokens = 0;
  /** Sum of output tokens reported through `addMessage`. */
  private totalOutputTokens = 0;
  private readonly messages: Array<{ message: string; tokens: number }> = [];
  private readonly pricing: NonNullable<
    ReturnType<typeof extractPricingFromGatewayModel>
  > | null;

  /**
   * @param tokens Size of the initial prefix, in tokens.
   * @param pricing Per-token pricing; when omitted or `null`, every simulated
   *   cost is reported as zero.
   */
  constructor(
    tokens: number,
    pricing?: NonNullable<
      ReturnType<typeof extractPricingFromGatewayModel>
    > | null,
  ) {
    this.currentTokens = tokens;
    this.pricing = pricing ?? null;
  }

  /**
   * Records one more API call.
   *
   * @param message Label stored alongside the token count.
   * @param tokens New input tokens appended to the cached prefix.
   * @param outputTokens Output tokens produced by this call (default 0).
   */
  addMessage(message: string, tokens: number, outputTokens: number = 0): void {
    // The prefix accumulated so far is served from cache on this call...
    this.totalCachedTokens += this.currentTokens;

    // ...and only afterwards does the new message extend it.
    this.currentTokens += tokens;
    this.totalOutputTokens += outputTokens;
    this.messages.push({ message, tokens });
  }

  /** Returns the cumulative cache reads, the current prefix size and the number of recorded messages. */
  getCacheStats() {
    return {
      totalCachedTokens: this.totalCachedTokens,
      currentContextTokens: this.currentTokens,
      messageCount: this.messages.length,
    };
  }

  /**
   * Prices the recorded traffic.
   *
   * Cache reads are billed on the cumulative cached tokens, cache writes on
   * the final prefix size (every token in it was written exactly once), and
   * output tokens at the regular output rate.
   *
   * When the pricing omits a cache rate, it falls back to 10% (read) or 125%
   * (write) of the input rate. An explicit rate of `0` is honoured rather
   * than being treated as missing.
   *
   * @returns The cost breakdown; all zeros when no pricing was supplied.
   */
  calculateSimulatedCost(): {
    simulatedCost: number;
    cacheReadCost: number;
    cacheWriteCost: number;
    outputCost: number;
  } {
    const pricing = this.pricing;
    if (!pricing) {
      return {
        simulatedCost: 0,
        cacheReadCost: 0,
        cacheWriteCost: 0,
        outputCost: 0,
      };
    }

    // `??` (not a truthiness test) so that a legitimate rate of 0 is kept.
    const cacheReadRate =
      pricing.cacheReadInputTokenCost ??
      pricing.inputCostPerToken * TokenCache.DEFAULT_READ_FACTOR;
    const cacheWriteRate =
      pricing.cacheCreationInputTokenCost ??
      pricing.inputCostPerToken * TokenCache.DEFAULT_WRITE_FACTOR;

    const cacheReadCost = this.totalCachedTokens * cacheReadRate;
    const cacheWriteCost = this.currentTokens * cacheWriteRate;
    const outputCost = this.totalOutputTokens * pricing.outputCostPerToken;

    return {
      simulatedCost: cacheReadCost + cacheWriteCost + outputCost,
      cacheReadCost,
      cacheWriteCost,
      outputCost,
    };
  }
}
result = calculateTotalCost(tests, pricing); expect(result).toEqual({ - inputCost: 0.00057, + inputCost: 0.0006, outputCost: 0.0006, - cacheReadCost: 0.000003, - totalCost: 0.001173, + totalCost: 0.0012, inputTokens: 600, outputTokens: 300, cachedInputTokens: 30, }); }); }); + +describe("TokenCache", () => { + const pricing = { + inputCostPerToken: 1.0 / 1_000_000, + outputCostPerToken: 2.0 / 1_000_000, + cacheCreationInputTokenCost: 1.25 / 1_000_000, + cacheReadInputTokenCost: 0.1 / 1_000_000, + } satisfies NonNullable>; + + it("initializes with correct values", () => { + const cache = new TokenCache(100, pricing); + const stats = cache.getCacheStats(); + + expect(stats.totalCachedTokens).toBe(0); + expect(stats.currentContextTokens).toBe(100); + expect(stats.messageCount).toBe(0); + }); + + it("accumulates cached tokens correctly", () => { + const cache = new TokenCache(100, pricing); + + cache.addMessage("What is JavaScript?", 50); + let stats = cache.getCacheStats(); + expect(stats.totalCachedTokens).toBe(100); // 100 from initial + expect(stats.currentContextTokens).toBe(150); // 100 + 50 + expect(stats.messageCount).toBe(1); + + cache.addMessage("JavaScript is...", 200); + stats = cache.getCacheStats(); + expect(stats.totalCachedTokens).toBe(250); // 100 + 150 + expect(stats.currentContextTokens).toBe(350); // 150 + 200 + expect(stats.messageCount).toBe(2); + + cache.addMessage("Can you give an example?", 30); + stats = cache.getCacheStats(); + expect(stats.totalCachedTokens).toBe(600); // 100 + 150 + 350 + expect(stats.currentContextTokens).toBe(380); // 350 + 30 + expect(stats.messageCount).toBe(3); + }); + + it("tracks output tokens separately", () => { + const cache = new TokenCache(100, pricing); + + cache.addMessage("msg1", 50, 200); + cache.addMessage("msg2", 30, 150); + + const stats = cache.getCacheStats(); + expect(stats.totalCachedTokens).toBe(250); // 100 + 150 + expect(stats.currentContextTokens).toBe(180); // 100 + 50 + 30 + }); + + 
it("calculates cost with pricing", () => { + const cache = new TokenCache(100, pricing); + + cache.addMessage("msg1", 50, 200); + cache.addMessage("msg2", 100, 300); + + const cost = cache.calculateSimulatedCost(); + + // totalCachedTokens = 100 + 150 = 250 (tokens read from cache across calls) + // currentTokens = 250 (all tokens written to cache) + // totalOutputTokens = 200 + 300 = 500 + + // cacheReadCost = 250 * 0.1e-6 = 0.000025 + // cacheWriteCost = 250 * 1.25e-6 = 0.0003125 (cache write rate is 1.25x) + // outputCost = 500 * 2e-6 = 0.001 + // simulatedCost = 0.000025 + 0.0003125 + 0.001 = 0.0013375 + + expect(cost.cacheReadCost).toBeCloseTo(0.000025, 6); + expect(cost.cacheWriteCost).toBeCloseTo(0.0003125, 6); + expect(cost.outputCost).toBeCloseTo(0.001, 6); + expect(cost.simulatedCost).toBeCloseTo(0.0013375, 6); + }); + + it("calculates zero cost without pricing", () => { + const cache = new TokenCache(100); + + cache.addMessage("msg1", 50, 200); + + const cost = cache.calculateSimulatedCost(); + + expect(cost.cacheReadCost).toBe(0); + expect(cost.cacheWriteCost).toBe(0); + expect(cost.outputCost).toBe(0); + expect(cost.simulatedCost).toBe(0); + }); + + it("handles zero tokens", () => { + const cache = new TokenCache(0, pricing); + const stats = cache.getCacheStats(); + + expect(stats.totalCachedTokens).toBe(0); + expect(stats.currentContextTokens).toBe(0); + expect(stats.messageCount).toBe(0); + + const cost = cache.calculateSimulatedCost(); + expect(cost.simulatedCost).toBe(0); + }); +}); + +describe("simulateCacheSavings - growing prefix model", () => { + // Default pricing: input=$1/MTok, output=$2/MTok + // Default cache read: 10% of input = $0.10/MTok + // Default cache write: 125% of input = $1.25/MTok + const basicPricing = { + inputCostPerToken: 1.0 / 1_000_000, + outputCostPerToken: 2.0 / 1_000_000, + } satisfies NonNullable>; + + it("returns zeros for empty tests array", () => { + const tests: SingleTestResult[] = []; + const result = 
simulateCacheSavings(tests, basicPricing); + + expect(result).toEqual({ + simulatedCostWithCache: 0, + cacheHits: 0, + cacheWriteTokens: 0, + }); + }); + + it("handles single test with single step (no cache hits)", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 500, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Step 1: 1000 input tokens at cache write rate (1.25/MTok) + 500 output at $2/MTok + // Simulated cost = 1000 * 1.25e-6 + 500 * 2e-6 = 0.00125 + 0.001 = 0.00225 + expect(result.cacheHits).toBe(0); + expect(result.cacheWriteTokens).toBe(1000); + expect(result.simulatedCostWithCache).toBeCloseTo(0.00225, 6); + }); + + it("calculates savings for single test with multiple steps - growing prefix", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 200, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 1500, + outputTokens: 300, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 2000, + outputTokens: 400, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Growing prefix model: + // Step 1: 1000 tokens → write all to cache + // Cost: 1000 * 1.25e-6 + 200 * 2e-6 = 0.00125 + 0.0004 = 0.00165 + // Step 2: 1500 tokens → 1000 cached (read), 500 new (write) + // Cost: 1000 * 0.1e-6 + 500 * 1.25e-6 + 300 * 2e-6 = 0.0001 + 0.000625 + 0.0006 = 0.001325 + // Step 3: 2000 tokens → 1500 cached (read), 500 new (write) + // Cost: 1500 * 0.1e-6 + 500 * 1.25e-6 + 400 * 2e-6 = 0.00015 + 0.000625 + 0.0008 = 0.001575 + // Total simulated: 0.00165 + 0.001325 + 0.001575 = 0.00455 + + 
expect(result.cacheHits).toBe(1000 + 1500); // 1000 from step 2 + 1500 from step 3 + expect(result.cacheWriteTokens).toBe(1000 + 500 + 500); // 1000 step1 + 500 step2 + 500 step3 + expect(result.simulatedCostWithCache).toBeCloseTo(0.00455, 6); + }); + + it("aggregates across multiple tests with cache reset per test", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 500, + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 800, + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + ], + }, + { + testName: "test2", + prompt: "p2", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 600, + outputTokens: 200, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 900, + outputTokens: 200, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 1200, + outputTokens: 200, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Test 1: + // Step 1: 500 write, 100 output + // Step 2: 500 read, 300 write, 100 output + // Hits: 500, Writes: 500 + 300 = 800 + // + // Test 2: + // Step 1: 600 write, 200 output + // Step 2: 600 read, 300 write, 200 output + // Step 3: 900 read, 300 write, 200 output + // Hits: 600 + 900 = 1500, Writes: 600 + 300 + 300 = 1200 + + // Total: hits = 500 + 1500 = 2000, writes = 800 + 1200 = 2000 + + expect(result.cacheHits).toBe(2000); + expect(result.cacheWriteTokens).toBe(2000); + + // Calculate expected cost manually: + // Test 1 Step 1: 500 * 1.25e-6 + 100 * 2e-6 = 0.000625 + 0.0002 = 0.000825 + // Test 1 Step 2: 500 * 0.1e-6 + 300 * 1.25e-6 + 100 * 2e-6 = 0.00005 + 0.000375 + 0.0002 = 0.000625 + // Test 2 Step 1: 600 * 1.25e-6 + 200 * 2e-6 = 0.00075 + 0.0004 = 0.00115 + // Test 2 Step 2: 600 * 0.1e-6 + 300 * 1.25e-6 + 200 * 
2e-6 = 0.00006 + 0.000375 + 0.0004 = 0.000835 + // Test 2 Step 3: 900 * 0.1e-6 + 300 * 1.25e-6 + 200 * 2e-6 = 0.00009 + 0.000375 + 0.0004 = 0.000865 + // Total: 0.000825 + 0.000625 + 0.00115 + 0.000835 + 0.000865 = 0.0043 + + expect(result.simulatedCostWithCache).toBeCloseTo(0.0043, 6); + }); + + it("skips tests with empty steps array", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [], + }, + { + testName: "test2", + prompt: "p2", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 500, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Only test2 should be counted + expect(result.cacheHits).toBe(0); + expect(result.cacheWriteTokens).toBe(1000); + }); + + it("uses custom cache pricing when provided", () => { + const customPricing = { + inputCostPerToken: 1.0 / 1_000_000, + outputCostPerToken: 2.0 / 1_000_000, + cacheReadInputTokenCost: 0.05 / 1_000_000, // 5% instead of default 10% + cacheCreationInputTokenCost: 1.5 / 1_000_000, // 150% instead of default 125% + } satisfies NonNullable>; + + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 500, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 1500, + outputTokens: 500, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, customPricing); + + // Step 1: 1000 write at $1.50/MTok + 500 output at $2/MTok + // = 1000 * 1.5e-6 + 500 * 2e-6 = 0.0015 + 0.001 = 0.0025 + // Step 2: 1000 read at $0.05/MTok + 500 write at $1.50/MTok + 500 output at $2/MTok + // = 1000 * 0.05e-6 + 500 * 1.5e-6 + 500 * 2e-6 = 0.00005 + 0.00075 + 0.001 = 0.0018 + // Total: 0.0025 + 0.0018 = 
0.0043 + + expect(result.cacheHits).toBe(1000); + expect(result.cacheWriteTokens).toBe(1000 + 500); + expect(result.simulatedCostWithCache).toBeCloseTo(0.0043, 6); + }); + + it("handles input tokens decreasing between steps (edge case)", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 800, // Less than step 1 (unusual but possible) + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Step 1: 1000 write + // Step 2: 1000 read (previous step), 0 new write (800 - 1000 = -200 → clamped to 0) + // This tests the Math.max(0, newPortion) behavior + + expect(result.cacheHits).toBe(1000); // Still reads full previous prefix + expect(result.cacheWriteTokens).toBe(1000); // Only step 1 writes + }); + + it("handles zero actual cost edge case", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 0, + outputTokens: 0, + cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + expect(result.simulatedCostWithCache).toBe(0); + expect(result.cacheHits).toBe(0); + expect(result.cacheWriteTokens).toBe(0); + }); + + it("compares favorably to actual cost for multi-step tests", () => { + const tests: SingleTestResult[] = [ + { + testName: "test1", + prompt: "p1", + resultWriteContent: null, + verification: {} as any, + steps: [ + { + usage: { + inputTokens: 1000, + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 1200, + outputTokens: 100, + cachedInputTokens: 0, + }, + } as any, + { + usage: { + inputTokens: 1400, + outputTokens: 100, + 
cachedInputTokens: 0, + }, + } as any, + ], + }, + ]; + + const result = simulateCacheSavings(tests, basicPricing); + + // Actual cost (no caching): + // Input: (1000 + 1200 + 1400) * 1e-6 = 3600 * 1e-6 = 0.0036 + // Output: (100 + 100 + 100) * 2e-6 = 300 * 2e-6 = 0.0006 + // Total actual: 0.0042 + + const actualCost = 0.0042; + + // Simulated should be less than actual for multi-step scenarios + expect(result.simulatedCostWithCache).toBeLessThan(actualCost); + + // Calculate savings + const savings = actualCost - result.simulatedCostWithCache; + const savingsPercent = (savings / actualCost) * 100; + + // Should have meaningful savings (>10% for this scenario) + expect(savingsPercent).toBeGreaterThan(10); + }); +}); diff --git a/lib/utils.ts b/lib/utils.ts index e3cb01c..63c0efc 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -1,5 +1,8 @@ -import { calculateCost, type ModelPricing } from "./pricing.ts"; -import type { SingleTestResult, TotalCostInfo } from "./report.ts"; +import { calculateCost, extractPricingFromGatewayModel } from "./pricing.ts"; +import type { SingleTestResult } from "./report.ts"; +import type { ModelMessage } from "@ai-sdk/provider-utils"; +import type { TestDefinition } from "./test-discovery.ts"; +import { TokenCache } from "./token-cache.ts"; export function sanitizeModelName(modelName: string) { return modelName.replace(/[^a-zA-Z0-9.]/g, "-"); @@ -53,7 +56,7 @@ export function extractResultWriteContent(steps: unknown[]) { export function calculateTotalCost( tests: SingleTestResult[], - pricing: ModelPricing, + pricing: NonNullable>, ) { let totalInputTokens = 0; let totalOutputTokens = 0; @@ -71,16 +74,102 @@ export function calculateTotalCost( pricing, totalInputTokens, totalOutputTokens, - totalCachedInputTokens, ); return { inputCost: costResult.inputCost, outputCost: costResult.outputCost, - cacheReadCost: costResult.cacheReadCost, totalCost: costResult.totalCost, inputTokens: totalInputTokens, outputTokens: totalOutputTokens, 
/**
 * Simulates what a run would cost with prompt caching, using a growing
 * prefix model.
 *
 * Model:
 * - Each test has its own cache (nothing carries over between tests).
 * - Step 1 writes its whole input to the cache (cache creation rate).
 * - Every later step reads the prefix accumulated so far (cache read rate)
 *   and writes only the tokens beyond that prefix (cache creation rate).
 * - Output tokens are always billed at the regular output rate.
 *
 * Example, inputs 1000 → 1500 → 2000:
 *   Step 1: write 1000
 *   Step 2: read 1000, write 500
 *   Step 3: read 1500, write 500
 *
 * If a step's input is smaller than the accumulated prefix, the full prefix
 * is still counted as read and nothing is written (the new portion is
 * clamped at 0).
 *
 * Rates missing from `pricing` fall back to 10% (read) and 125% (write) of
 * the input rate.
 *
 * @param tests Test results whose steps carry `usage.inputTokens` and
 *   `usage.outputTokens`; tests without a first step are skipped.
 * @param pricing Per-token pricing for the model.
 * @returns Simulated total cost plus total tokens read from and written to
 *   the cache.
 */
export function simulateCacheSavings(
  tests: SingleTestResult[],
  pricing: NonNullable<ReturnType<typeof extractPricingFromGatewayModel>>,
): {
  simulatedCostWithCache: number;
  cacheHits: number;
  cacheWriteTokens: number;
} {
  const cacheReadRate =
    pricing.cacheReadInputTokenCost ?? pricing.inputCostPerToken * 0.1;
  const cacheWriteRate =
    pricing.cacheCreationInputTokenCost ?? pricing.inputCostPerToken * 1.25;
  const outputRate = pricing.outputCostPerToken;

  let totalCacheHits = 0; // tokens read from cache, all steps of all tests
  let totalCacheWriteTokens = 0; // tokens written to cache, step 1 included
  let simulatedCost = 0;

  for (const test of tests) {
    // Covers both an empty steps array and a missing first entry.
    const firstStep = test.steps[0];
    if (!firstStep) continue;

    // Step 1: the entire input is a cache write.
    const cache = new TokenCache(firstStep.usage.inputTokens, pricing);
    totalCacheWriteTokens += firstStep.usage.inputTokens;
    simulatedCost += firstStep.usage.inputTokens * cacheWriteRate;
    simulatedCost += firstStep.usage.outputTokens * outputRate;

    // Record step 1's output; the prefix itself does not grow here.
    cache.addMessage("step-0", 0, firstStep.usage.outputTokens);

    for (let i = 1; i < test.steps.length; i++) {
      const step = test.steps[i];
      if (!step) continue;

      // Everything accumulated so far is read; only the excess is written.
      const cachedPortion = cache.getCacheStats().currentContextTokens;
      const newTokens = Math.max(0, step.usage.inputTokens - cachedPortion);

      totalCacheHits += cachedPortion;
      totalCacheWriteTokens += newTokens;

      simulatedCost += cachedPortion * cacheReadRate;
      simulatedCost += newTokens * cacheWriteRate;
      simulatedCost += step.usage.outputTokens * outputRate;

      cache.addMessage(`step-${i}`, newTokens, step.usage.outputTokens);
    }
  }

  return {
    simulatedCostWithCache: simulatedCost,
    cacheHits: totalCacheHits,
    cacheWriteTokens: totalCacheWriteTokens,
  };
}