diff --git a/.bunversion b/.bunversion new file mode 100644 index 0000000..5cf6288 --- /dev/null +++ b/.bunversion @@ -0,0 +1 @@ +bun-v1.3.4 \ No newline at end of file diff --git a/.cocominify b/.cocominify index 3d0dbe4..0685a19 100644 --- a/.cocominify +++ b/.cocominify @@ -1 +1,2 @@ -tests/ \ No newline at end of file +tests/ +data/ \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..51a7fdf --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,31 @@ +name: Test + +on: + pull_request: + push: + branches: + - main + +jobs: + test: + name: Run Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Get Bun version + id: bun-version + run: echo "version=$(cat .bunversion | sed 's/bun-v//')" >> $GITHUB_OUTPUT + + - uses: oven-sh/setup-bun@v2 + with: + bun-version: ${{ steps.bun-version.outputs.version }} + + - name: Install dependencies + run: bun install + + - name: TypeScript type check + run: bun run tsc + + - name: Run tests + run: bun test diff --git a/AGENTS.md b/AGENTS.md index b38b6cb..3e7c825 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,6 +1,6 @@ ## Project Overview -AI SDK benchmarking tool built with Vercel AI SDK and Bun runtime. Tests AI agents with MCP (Model Context Protocol) server integration, specifically using the Svelte MCP server for agent benchmarks. Automatically discovers and runs all tests in the `tests/` directory and verifies generated components against test suites. +AI SDK benchmarking tool built with Vercel AI SDK and Bun runtime. Tests AI agents with MCP (Model Context Protocol) server integration using the Vercel AI Gateway. Automatically discovers and runs all tests in the `tests/` directory and verifies LLM-generated Svelte components against test suites. ## Development Commands @@ -8,125 +8,85 @@ AI SDK benchmarking tool built with Vercel AI SDK and Bun runtime. Tests AI agen # Install dependencies (runs patch-package automatically) bun install -# Run the main benchmark (discovers and runs all tests) -bun run index.ts +# Run the main benchmark (interactive CLI) +bun run start # Verify reference implementations against test suites bun run verify-tests -# Generate HTML report from most recent result +# Generate HTML reports from all result JSON files bun run generate-report.ts # Generate HTML report from specific result file bun run generate-report.ts results/result-2024-12-07-14-30-45.json +# Run unit tests for lib modules +bun run test:self + # Run TypeScript type checking bun tsc --noEmit -``` - -## Environment Variables - -### MODEL Configuration - -The `MODEL` environment variable determines which AI provider to use: - -**Anthropic Direct API:** -```bash -MODEL=anthropic/claude-haiku-4-5 -MODEL=anthropic/claude-sonnet-4 +# Format code with Prettier +bun run prettier ``` -**OpenAI Direct API:** - -```bash -MODEL=openai/gpt-5 -MODEL=openai/gpt-5-mini -MODEL=openai/gpt-4o -``` +## Environment Variables -**OpenRouter (300+ models):** +### Vercel AI Gateway -```bash -MODEL=openrouter/anthropic/claude-sonnet-4 -MODEL=openrouter/google/gemini-pro -MODEL=openrouter/meta-llama/llama-3.1-405b-instruct -``` +The benchmark uses the Vercel AI Gateway for model access. Configuration: -**LM Studio (Local models via OpenAI-compatible API):** +1. Link to a Vercel project with AI Gateway enabled: `bun run vercel:link` +2. 
Pull environment variables: `bun run vercel:env:pull` -```bash -MODEL=lmstudio/model-name -``` +Required environment variable: -LM Studio runs a local OpenAI-compatible API server on `http://localhost:1234/v1`. Make sure LM Studio is running with a model loaded before using this provider. +- `VERCEL_OIDC_TOKEN`: OIDC token for Vercel AI Gateway authentication ### MCP Server Configuration -The `MCP_SERVER_URL` environment variable controls MCP (Model Context Protocol) integration. The tool automatically detects whether to use HTTP or StdIO transport based on the value format. - -**HTTP MCP Servers (Remote):** +MCP integration is configured via the interactive CLI at runtime. Options: -```bash -# Enable MCP with Svelte server (default for this benchmark) -MCP_SERVER_URL=https://mcp.svelte.dev/mcp - -# Use a different HTTP MCP server -MCP_SERVER_URL=https://your-mcp-server.com/mcp -``` +- **No MCP Integration**: Agent runs with built-in tools only +- **MCP over HTTP**: Uses HTTP transport (default: `https://mcp.svelte.dev/mcp`) +- **MCP over StdIO**: Uses local command (default: `npx -y @sveltejs/mcp`) -**StdIO MCP Servers (Local):** - -For local MCP servers, simply provide the command string (any non-HTTP value): - -```bash -# Use the default Svelte MCP server via npx -MCP_SERVER_URL=npx -y @sveltejs/mcp +## Architecture -# Use a custom local MCP server -MCP_SERVER_URL=node path/to/your/mcp-server.js +### Directory Structure -# Use with Bun runtime -MCP_SERVER_URL=bun run src/mcp-server.ts --verbose ``` - -**Disable MCP:** - -```bash -# Disable MCP integration (run without external tools) -MCP_SERVER_URL= +├── index.ts # Main entry point with interactive CLI +├── lib/ +│ ├── pricing.ts # Cost calculation from gateway pricing +│ ├── pricing.test.ts # Unit tests for pricing module +│ ├── test-discovery.ts # Test suite discovery and prompt building +│ ├── test-discovery.test.ts # Unit tests for test discovery +│ ├── output-test-runner.ts # Vitest runner for component verification +│ ├── output-test-runner.test.ts # Unit tests for output runner +│ ├── verify-references.ts # Reference implementation verification +│ ├── report.ts # Report generation orchestration +│ ├── report-template.ts # HTML report template generation +│ ├── report-styles.ts # CSS styles for HTML reports +│ └── tools/ +│ ├── index.ts # Tool exports +│ ├── result-write.ts # ResultWrite tool for final output +│ ├── result-write.test.ts # Unit tests for ResultWrite tool +│ ├── test-component.ts # TestComponent tool for iterative testing +│ └── test-component.test.ts # Unit tests for TestComponent tool +├── tests/ # Benchmark test suites +│ └── {test-name}/ +│ ├── Reference.svelte # Reference implementation +│ ├── test.ts # Vitest test file +│ └── prompt.md # Agent prompt +├── results/ # Benchmark results (JSON + HTML) +├── outputs/ # Temporary directory for test verification +└── patches/ # Patches for dependencies ``` -**Behavior:** - -- If `MCP_SERVER_URL` starts with `http://` or `https://`: Uses HTTP transport with that URL -- If `MCP_SERVER_URL` is set but not an HTTP URL: Uses StdIO transport, treating the value as a command string -- If `MCP_SERVER_URL` is empty or not set: Agent runs without MCP tools (only built-in tools) -- MCP transport type (HTTP or StdIO) and configuration are documented in the result JSON and HTML report - -### Required API Keys - -- `ANTHROPIC_API_KEY`: Required when using `anthropic/*` models -- `OPENAI_API_KEY`: Required when using `openai/*` models (get at https://platform.openai.com/api-keys) 
-- `OPENROUTER_API_KEY`: Required when using `openrouter/*` models (get at https://openrouter.ai/keys)
-- No API key required for `lmstudio/*` models (runs locally)
-
-### Provider Routing
-
-The benchmark tool automatically routes to the correct provider based on the `MODEL` prefix:
-
-- `anthropic/*` → Direct Anthropic API
-- `openai/*` → Direct OpenAI API
-- `openrouter/*` → OpenRouter unified API
-- `lmstudio/*` → LM Studio local server (OpenAI-compatible)
-
-This allows switching models and providers without any code changes.
-
-## Architecture
-
 ### Test Suite Structure
 
-Test suites are organized in the `tests/` directory with the following structure:
+Benchmark test suites live in the `tests/` directory:
 
 ```
 tests/
@@ -138,35 +98,66 @@ tests/
 
 **Benchmark Workflow:**
 
-1. `index.ts` discovers all test suites in `tests/`
-2. For each test:
+1. `index.ts` presents an interactive CLI for model/MCP selection
+2. Discovers all test suites in `tests/`
+3. For each selected model and test:
    - Loads `prompt.md` and builds agent prompt
-   - Agent generates component code based on the prompt
+   - Agent generates component code using available tools
    - Agent calls `ResultWrite` tool with the component code
    - Component is written to `outputs/{test-name}/Component.svelte`
    - Test file is copied to `outputs/{test-name}/test.ts`
    - Vitest runs tests against the generated component
    - Results are collected (pass/fail, error messages)
    - Output directory is cleaned up
-3. All results are saved to a timestamped JSON file
-4. HTML report is generated with expandable sections for each test
+4. All results are saved to a timestamped JSON file
+5. An HTML report is generated with expandable sections for each test
+
+### Agent Tools
+
+**ResultWrite** (`lib/tools/result-write.ts`):
 
-**Reference Verification:**
+- Called when the agent completes the component implementation
+- Signals the agent to stop (via the `stopWhen` configuration)
+- Accepts a `content` parameter with the Svelte component code
 
-- Run `bun run verify-tests` to validate reference implementations
-- Each test file imports `Component.svelte` (not Reference.svelte directly)
-- Verification system temporarily copies Reference.svelte → Component.svelte
-- Tests use `@testing-library/svelte` for component testing
-- Tests use `data-testid` attributes for element selection
+**TestComponent** (`lib/tools/test-component.ts`):
+
+- Optional tool for iterative development
+- Runs the component against the test suite before final submission
+- Returns pass/fail status and detailed error messages
+- Enabled/disabled via the interactive CLI
+
+A sketch of the tool shape follows the Interactive CLI list below.
+
+### Interactive CLI
+
+The benchmark uses `@clack/prompts` for an interactive CLI that prompts for:
+
+1. **Model Selection**: Multi-select from the models available on the Vercel AI Gateway
+2. **MCP Integration**: Choose HTTP, StdIO, or no MCP
+3. **TestComponent Tool**: Enable/disable the iterative testing tool
+4. **Pricing Confirmation**: Review and confirm cost calculation settings
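+
+Both tools are plain AI SDK tool definitions. As a rough sketch of the ResultWrite shape (illustrative only — the real schema and return value live in `lib/tools/result-write.ts`):
+
+```ts
+import { tool } from "ai";
+import { z } from "zod";
+
+// Illustrative sketch; see lib/tools/result-write.ts for the actual tool.
+export const resultWriteTool = tool({
+  description:
+    "Write the final Svelte component code once the implementation is complete.",
+  inputSchema: z.object({
+    content: z.string().describe("Complete Svelte component source code"),
+  }),
+  // The agent's stopWhen configuration ends the run after this tool is
+  // called, so execute only needs to acknowledge the submission.
+  execute: async ({ content }) => ({ received: content.length > 0 }),
+});
+```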
+
+### Pricing System
+
+The pricing module (`lib/pricing.ts`) handles cost calculation:
+
+- Extracts pricing from Vercel AI Gateway model metadata
+- Calculates costs based on input/output/cached tokens
+- Supports cache-read billing at reduced rates
+- Displays costs in reports with per-million-token rates
+
+Key functions:
+
+- `extractPricingFromGatewayModel()`: Parse gateway model pricing
+- `buildPricingMap()`: Build a lookup map from gateway models
+- `calculateCost()`: Calculate the total cost from token usage
+- `formatCost()` / `formatMTokCost()`: Format costs for display
+
+A short usage sketch follows the Data Flow section below.
 
 ### Key Technologies
 
 - **Vercel AI SDK v5**: Agent framework with tool calling
-- **@ai-sdk/anthropic**: Anthropic provider for direct API access
-- **@ai-sdk/openai**: OpenAI provider for direct API access
-- **@ai-sdk/openai-compatible**: OpenAI-compatible provider for LM Studio and other local servers
-- **@openrouter/ai-sdk-provider**: OpenRouter provider for unified access to 300+ models
+- **Vercel AI Gateway**: Unified access to multiple AI providers
 - **@ai-sdk/mcp**: MCP client integration (with custom patch)
+- **@clack/prompts**: Interactive CLI prompts
 - **Bun Runtime**: JavaScript runtime (not Node.js)
 - **Vitest**: Test framework for component testing
 - **@testing-library/svelte**: Testing utilities for Svelte components
@@ -176,33 +167,34 @@ tests/
 
 The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`:
 
 - Patch location: `patches/@ai-sdk+mcp+0.0.11.patch`
-- Fixes: Handles missing event types in HTTP SSE responses by treating undefined events as "message" events
-- MCP server: Configurable via `MCP_SERVER_URL` environment variable
-- Default server: Svelte documentation server (`https://mcp.svelte.dev/mcp`)
-- Can be disabled by leaving `MCP_SERVER_URL` empty
+- Fixes: Handles missing event types in HTTP SSE responses
+- Supports both HTTP and StdIO transports
+- Configured via the interactive CLI at runtime
 
 ### Data Flow
 
-1. Test discovery scans `tests/` directory for valid test suites
-2. For each test:
-   a. Agent receives prompt with access to tools (built-in + optional MCP tools)
+1. Interactive CLI collects configuration (models, MCP, tools)
+2. Gateway provides available models and pricing
+3. Test discovery scans the `tests/` directory
+4. For each model and test:
+   a. Agent receives prompt with access to tools (built-in + optional MCP)
    b. Agent iterates through steps, calling tools as needed
-   c. Agent stops when `ResultWrite` tool is called with component code
+   c. Agent stops when the `ResultWrite` tool is called
    d. Component is written to `outputs/{test-name}/Component.svelte`
    e. Vitest runs test file against the generated component
    f. Test results are collected (pass/fail, error details)
    g. Output directory is cleaned up
-3. All results aggregated into multi-test result object
-4. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json` with metadata
-5. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
-6. Report automatically opens in default browser
+5. Results are aggregated with pricing calculations
+6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
+7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
+8. Report automatically opens in the default browser
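+
+As a rough usage sketch of the pricing module (token counts are illustrative; the function signatures come from `lib/pricing.ts`):
+
+```ts
+import {
+  buildPricingMap,
+  lookupPricingFromMap,
+  calculateCost,
+  formatCost,
+} from "./lib/pricing.ts";
+
+// Gateway models normally come from gateway.getAvailableModels();
+// this hard-coded entry is a stand-in for the same shape.
+const pricingMap = buildPricingMap([
+  {
+    id: "anthropic/claude-sonnet-4",
+    name: "Claude Sonnet 4",
+    pricing: { input: "0.000003", output: "0.000015", cachedInputTokens: "0.0000003" },
+    modelType: "language",
+  },
+]);
+
+const lookup = lookupPricingFromMap("anthropic/claude-sonnet-4", pricingMap);
+if (lookup) {
+  // 1,000 input tokens (800 of them cache reads) and 500 output tokens.
+  const cost = calculateCost(lookup.pricing, 1000, 500, 800);
+  console.log(formatCost(cost.totalCost)); // "$0.008340"
+}
+```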
 
 ### Output Files
 
 All results are saved in the `results/` directory with timestamped filenames:
 
-- **JSON files**: `result-2024-12-07-14-30-45.json` - Complete execution trace with all test results
-- **HTML files**: `result-2024-12-07-14-30-45.html` - Interactive visualization with expandable test sections
+- **JSON files**: `result-2024-12-07-14-30-45.json` - Complete execution trace
+- **HTML files**: `result-2024-12-07-14-30-45.html` - Interactive visualization
 
 **Multi-Test Result JSON Structure:**
 
@@ -223,25 +215,44 @@ All results are saved in the `results/` directory with timestamped filenames:
         "duration": 150,
         "failedTests": []
       }
-    },
-    ...
+    }
   ],
   "metadata": {
     "mcpEnabled": true,
     "mcpServerUrl": "https://mcp.svelte.dev/mcp",
+    "mcpTransportType": "HTTP",
     "timestamp": "2024-12-07T14:30:45.123Z",
-    "model": "anthropic/claude-sonnet-4"
+    "model": "anthropic/claude-sonnet-4",
+    "pricingKey": "anthropic/claude-sonnet-4",
+    "pricing": {
+      "inputCostPerMTok": 3,
+      "outputCostPerMTok": 15,
+      "cacheReadCostPerMTok": 0.3
+    },
+    "totalCost": {
+      "inputCost": 0.003,
+      "outputCost": 0.015,
+      "cacheReadCost": 0.0003,
+      "totalCost": 0.0183,
+      "inputTokens": 1000,
+      "outputTokens": 1000,
+      "cachedInputTokens": 1000
+    }
   }
 }
 ```
 
-This naming convention allows you to:
+## Unit Tests
+
+Unit tests for library modules are in `lib/*.test.ts`:
+
+- `lib/pricing.test.ts` - Pricing extraction, calculation, and formatting
+- `lib/test-discovery.test.ts` - Test suite discovery and prompt building
+- `lib/output-test-runner.test.ts` - Output directory management
+- `lib/tools/result-write.test.ts` - ResultWrite tool behavior
+- `lib/tools/test-component.test.ts` - TestComponent tool behavior
 
-- Run multiple benchmarks without overwriting previous results
-- Easily identify when each benchmark was run
-- Compare results across different runs
-- Track whether MCP was enabled for each run
-- See per-test verification status
+Run unit tests with: `bun run test:self`
 
 ## TypeScript Configuration
 
@@ -258,12 +269,13 @@
 
 - The MCP client import uses a direct path to the patched module: `./node_modules/@ai-sdk/mcp/dist/index.mjs`
 - Agent stops execution when the `ResultWrite` tool is called (configured via `stopWhen` option)
+- The agent also stops after a maximum of 10 steps (configured via `stepCountIs(10)`)
 - The `outputs/` directory is used temporarily for test verification and is cleaned up after each test
 - HTML reports include expandable sections for each test with full step details
 - Test verification results show pass/fail status and failed test details
 - Token usage includes cached token counts when available
 - All result files are saved with timestamps to preserve historical benchmarks
-- MCP integration can be toggled via `MCP_SERVER_URL` environment variable without code changes
+- MCP integration can be configured via the interactive CLI without code changes
 - MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
 - Exit code is 0 if all tests pass, 1 if any tests fail
-- LM Studio provider requires LM Studio to be running locally with a model loaded
+- Pricing is fetched from Vercel AI Gateway model metadata at runtime
diff --git a/GEMINI.md b/GEMINI.md
new file mode 100644
index 0000000..43c994c
--- /dev/null
+++ b/GEMINI.md
@@ -0,0 +1 @@
+@AGENTS.md
diff --git a/README.md b/README.md
index 2dbb337..91a33a7 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ AI SDK benchmarking tool that tests AI agents with MCP (Model Context Protocol)
 
 To install dependencies:
 
 ```bash
+./scripts/install.sh # installs the correct bun version
 bun install
 ```
diff --git a/generate-report.ts b/generate-report.ts
index 0dcd1b2..ffd28ae 100644
--- a/generate-report.ts
+++ b/generate-report.ts
@@ -1,14 +1,10 @@
 import { generateReport } from "./lib/report.ts";
 import { readdirSync } from "node:fs";
 
-/**
- * Get all result files from the results directory
- */
 function getAllResultFiles(): string[] {
   const resultsDir = "results";
   const files = readdirSync(resultsDir);
 
-  // Filter for result JSON files
   const resultFiles = files.filter(
     (file) => file.startsWith("result-") && file.endsWith(".json"),
   );
@@ -17,18 +13,15 @@ function getAllResultFiles(): string[] {
     throw new Error("No result files found in results/ directory");
   }
 
-  // Sort by filename (which includes timestamp) in descending order
   resultFiles.sort((a, b) => b.localeCompare(a));
 
   return resultFiles.map((file) => `${resultsDir}/${file}`);
 }
 
-// Get all result JSON files
 const resultFiles = getAllResultFiles();
 
 console.log(`Found ${resultFiles.length} result file(s) to process\n`);
 
-// Generate HTML report for each JSON file
 for (const jsonPath of resultFiles) {
   const htmlPath = jsonPath.replace(/\.json$/, ".html");
   console.log(`Generating report: ${jsonPath} -> ${htmlPath}`);
diff --git a/index.ts b/index.ts
index a1c0591..b4a4bb3 100644
--- a/index.ts
+++ b/index.ts
@@ -5,8 +5,13 @@ import { writeFileSync, mkdirSync, existsSync } from "node:fs";
 import {
   generateReport,
   type SingleTestResult,
-  type MultiTestResultData,
 } from "./lib/report.ts";
+import {
+  getTimestampedFilename,
+  isHttpUrl,
+  extractResultWriteContent,
+  calculateTotalCost,
+} from "./lib/utils.ts";
 import {
   discoverTests,
   buildAgentPrompt,
@@ -19,6 +24,15 @@ import {
   runTestVerification,
 } from "./lib/output-test-runner.ts";
 import { resultWriteTool, testComponentTool } from "./lib/tools/index.ts";
+import {
+  buildPricingMap,
+  lookupPricingFromMap,
+  getModelPricingDisplay,
+  formatCost,
+  formatMTokCost,
+  type ModelPricingLookup,
+  type GatewayModel,
+} from "./lib/pricing.ts";
 import type { LanguageModel } from "ai";
 import {
   intro,
@@ -28,16 +42,95 @@ import {
   text,
   select,
   confirm,
+  note,
 } from "@clack/prompts";
 import { gateway } from "ai";
 
+async function validateAndConfirmPricing(
+  models: string[],
+  pricingMap: Map<string, ModelPricingLookup | null>,
+) {
+  const lookups = new Map<string, ModelPricingLookup | null>();
+
+  for (const modelId of models) {
+    const lookup = lookupPricingFromMap(modelId, pricingMap);
+    lookups.set(modelId, lookup);
+  }
+
+  const modelsWithPricing = models.filter((m) => lookups.get(m) !== null);
+  const modelsWithoutPricing = models.filter((m) => lookups.get(m) === null);
+
+  if (modelsWithoutPricing.length === 0) {
+    const pricingLines = models.map((modelId) => {
+      const lookup = lookups.get(modelId)!;
+      const display = getModelPricingDisplay(lookup.pricing);
+      return `${modelId}\n  → ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`;
+    });
+
+    note(pricingLines.join("\n\n"), "💰 Pricing Found");
+
+    const usePricing = await confirm({
+      message: "Enable cost calculation?",
+      initialValue: true,
+    });
+
+    if (isCancel(usePricing)) {
+      cancel("Operation cancelled.");
+      process.exit(0);
+    }
+
+    return { enabled: usePricing, lookups };
+  } else {
+    const lines: string[] = [];
+
+    if (modelsWithoutPricing.length > 0) {
+      lines.push("No pricing found for:");
+      for (const modelId of modelsWithoutPricing) {
+        lines.push(`  ✗ ${modelId}`);
+      }
+    }
+
+    if (modelsWithPricing.length > 0) {
+      lines.push("");
+      lines.push("Pricing available for:");
+      for (const modelId of modelsWithPricing) {
+        const lookup = lookups.get(modelId)!;
+        const display = getModelPricingDisplay(lookup.pricing);
+        lines.push(
+          `  ✓ ${modelId} (${formatMTokCost(display.inputCostPerMTok)}/MTok in)`,
+        );
+      }
+    }
+
+    lines.push("");
+    lines.push("Cost calculation will be disabled.");
+
+    note(lines.join("\n"), "⚠️ Pricing Incomplete");
+
+    const proceed = await confirm({
+      message: "Continue without pricing?",
+      initialValue: true,
+    });
+
+    if (isCancel(proceed) || !proceed) {
+      cancel("Operation cancelled.");
+      process.exit(0);
+    }
+
+    return { enabled: false, lookups };
+  }
+}
+
 async function selectOptions() {
   intro("🚀 Svelte AI Bench");
 
   const available_models = await gateway.getAvailableModels();
 
+  const gatewayModels = available_models.models as GatewayModel[];
+  const pricingMap = buildPricingMap(gatewayModels);
+
   const models = await multiselect({
-    message: "Select a model to benchmark",
+    message: "Select model(s) to benchmark",
     options: [{ value: "custom", label: "Custom" }].concat(
       available_models.models.reduce<Array<{ value: string; label: string }>>(
         (arr, model) => {
@@ -67,6 +160,10 @@ async function selectOptions() {
     models.push(custom_model);
   }
 
+  const selectedModels = models.filter((model) => model !== "custom");
+
+  const pricing = await validateAndConfirmPricing(selectedModels, pricingMap);
+
   const mcp_integration = await select({
     message: "Which MCP integration to use?",
     options: [
@@ -123,31 +220,13 @@ async function selectOptions() {
   }
 
   return {
-    models: models.filter((model) => model !== "custom"),
+    models: selectedModels,
     mcp,
     testingTool,
+    pricing,
   };
 }
 
-/**
- * Generate a timestamped filename
- */
-function getTimestampedFilename(prefix: string, extension: string): string {
-  const now = new Date();
-  const year = now.getFullYear();
-  const month = String(now.getMonth() + 1).padStart(2, "0");
-  const day = String(now.getDate()).padStart(2, "0");
-  const hours = String(now.getHours()).padStart(2, "0");
-  const minutes = String(now.getMinutes()).padStart(2, "0");
-  const seconds = String(now.getSeconds()).padStart(2, "0");
-
-  return `${prefix}-${year}-${month}-${day}-${hours}-${minutes}-${seconds}.${extension}`;
-}
-
-/**
- * Parse a command string into command and args
- * Example: "npx -y @sveltejs/mcp" -> { command: "npx", args: ["-y", "@sveltejs/mcp"] }
- */
 function parseCommandString(commandString: string): {
   command: string;
   args: string[];
@@ -158,42 +237,6 @@ function parseCommandString(commandString: string): {
   return { command, args };
 }
 
-/**
- * Check if a string is an HTTP/HTTPS URL
- */
-function isHttpUrl(str: string): boolean {
-  return str.startsWith("http://") || str.startsWith("https://");
-}
-
-/**
- * Extract ResultWrite content from agent steps
- */
-function extractResultWriteContent(steps: unknown[]): string | null {
-  for (const step of steps) {
-    const s = step as {
-      content?: Array<{
-        type: string;
-        toolName?: string;
-        input?: { content: string };
-      }>;
-    };
-    if (s.content) {
-      for (const content of s.content) {
-        if (
-          content.type === "tool-call" &&
-          content.toolName === "ResultWrite"
-        ) {
-          return content.input?.content ??
null; - } - } - } - } - return null; -} - -/** - * Run a single test with the AI agent - */ async function runSingleTest( test: TestDefinition, model: LanguageModel, @@ -208,14 +251,12 @@ async function runSingleTest( const prompt = buildAgentPrompt(test); try { - // Build tools object with conditional tools const tools = { ResultWrite: resultWriteTool, ...(testComponentEnabled && { TestComponent: testComponentTool(test) }), ...(mcpClient ? await mcpClient.tools() : {}), }; - // Create agent for this test let stepCounter = 0; const agent = new Agent({ model, @@ -256,14 +297,12 @@ async function runSingleTest( }, }); - // Run the agent console.log(" ⏳ Running agent..."); if (testComponentEnabled) { console.log(" 📋 TestComponent tool is available"); } const result = await agent.generate({ prompt }); - // Extract the generated component code const resultWriteContent = extractResultWriteContent(result.steps); if (!resultWriteContent) { @@ -279,7 +318,6 @@ async function runSingleTest( console.log(" ✓ Component generated"); - // Run test verification console.log(" ⏳ Verifying against tests..."); const verification = await runTestVerification(test, resultWriteContent); @@ -298,7 +336,6 @@ async function runSingleTest( } } - // Clean up this test's output directory cleanupTestEnvironment(test.name); return { @@ -328,42 +365,55 @@ async function runSingleTest( } } -// Main execution async function main() { - const { models, mcp, testingTool } = await selectOptions(); - // Get MCP server URL/command from environment (optional) + const { models, mcp, testingTool, pricing } = await selectOptions(); + const mcpServerUrl = mcp; const mcpEnabled = !!mcp; - // Check if TestComponent tool is disabled const testComponentEnabled = testingTool; - // Determine MCP transport type const isHttpTransport = mcpServerUrl && isHttpUrl(mcpServerUrl); const mcpTransportType = isHttpTransport ? "HTTP" : "StdIO"; - console.log("╔════════════════════════════════════════════════════╗"); + console.log("\n╔════════════════════════════════════════════════════╗"); console.log("║ SvelteBench 2.0 - Multi-Test ║"); console.log("╚════════════════════════════════════════════════════╝"); - console.log(`Model: ${models.join(", ")}`); - console.log(`MCP Integration: ${mcpEnabled ? "Enabled" : "Disabled"}`); + + console.log("\n📋 Models:"); + for (const modelId of models) { + const lookup = pricing.lookups.get(modelId); + if (pricing.enabled && lookup) { + const display = getModelPricingDisplay(lookup.pricing); + console.log(` ${modelId}`); + console.log( + ` 💰 ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, + ); + } else { + console.log(` ${modelId}`); + } + } + + console.log(`\n💰 Pricing: ${pricing.enabled ? "Enabled" : "Disabled"}`); + + console.log(`🔌 MCP Integration: ${mcpEnabled ? "Enabled" : "Disabled"}`); if (mcpEnabled) { - console.log(`MCP Transport: ${mcpTransportType}`); + console.log(` Transport: ${mcpTransportType}`); if (isHttpTransport) { - console.log(`MCP Server URL: ${mcpServerUrl}`); + console.log(` URL: ${mcpServerUrl}`); } else { - console.log(`MCP StdIO Command: ${mcpServerUrl}`); + console.log(` Command: ${mcpServerUrl}`); } } + console.log( - `TestComponent Tool: ${testComponentEnabled ? "Enabled" : "Disabled"}`, + `🧪 TestComponent Tool: ${testComponentEnabled ? 
"Enabled" : "Disabled"}`, ); - // Discover all tests console.log("\n📁 Discovering tests..."); const tests = discoverTests(); console.log( - `Found ${tests.length} test(s): ${tests.map((t) => t.name).join(", ")}`, + ` Found ${tests.length} test(s): ${tests.map((t) => t.name).join(", ")}`, ); if (tests.length === 0) { @@ -371,14 +421,11 @@ async function main() { process.exit(1); } - // Set up outputs directory setupOutputsDirectory(); - // Conditionally create MCP client based on transport type - let mcpClient: Awaited> | null = null; + let mcpClient = null; if (mcpEnabled) { if (isHttpTransport) { - // HTTP transport mcpClient = await createMCPClient({ transport: { type: "http", @@ -386,7 +433,6 @@ async function main() { }, }); } else { - // StdIO transport - treat mcpServerUrl as command string const { command, args } = parseCommandString(mcpServerUrl!); mcpClient = await createMCPClient({ transport: new StdioMCPTransport({ @@ -399,9 +445,25 @@ async function main() { let totalFailed = 0; - for (const model of models) { - // Run all tests - const testResults: SingleTestResult[] = []; + for (const modelId of models) { + console.log("\n" + "═".repeat(50)); + console.log(`🤖 Running benchmark for model: ${modelId}`); + console.log("═".repeat(50)); + + const pricingLookup = pricing.enabled + ? (pricing.lookups.get(modelId) ?? null) + : null; + + if (pricingLookup) { + const display = getModelPricingDisplay(pricingLookup.pricing); + console.log( + `💰 Pricing: ${formatMTokCost(display.inputCostPerMTok)}/MTok in, ${formatMTokCost(display.outputCostPerMTok)}/MTok out`, + ); + } + + const model = gateway.languageModel(modelId); + + const testResults = []; const startTime = Date.now(); for (let i = 0; i < tests.length; i++) { @@ -420,10 +482,6 @@ async function main() { const totalDuration = Date.now() - startTime; - // Clean up outputs directory - cleanupOutputsDirectory(); - - // Print summary console.log("\n" + "═".repeat(50)); console.log("📊 Test Summary"); console.log("═".repeat(50)); @@ -454,39 +512,66 @@ async function main() { `Total: ${passed} passed, ${failed} failed, ${skipped} skipped (${(totalDuration / 1000).toFixed(1)}s)`, ); - // Ensure results directory exists + let totalCost = null; + let pricingInfo = null; + + if (pricingLookup) { + totalCost = calculateTotalCost(testResults, pricingLookup.pricing); + const pricingDisplay = getModelPricingDisplay(pricingLookup.pricing); + pricingInfo = { + inputCostPerMTok: pricingDisplay.inputCostPerMTok, + outputCostPerMTok: pricingDisplay.outputCostPerMTok, + cacheReadCostPerMTok: pricingDisplay.cacheReadCostPerMTok, + }; + + console.log("\n💰 Cost Summary"); + console.log("─".repeat(50)); + console.log( + `Input tokens: ${totalCost.inputTokens.toLocaleString()} (${formatCost(totalCost.inputCost)})`, + ); + console.log( + `Output tokens: ${totalCost.outputTokens.toLocaleString()} (${formatCost(totalCost.outputCost)})`, + ); + if (totalCost.cachedInputTokens > 0) { + console.log( + `Cached tokens: ${totalCost.cachedInputTokens.toLocaleString()} (${formatCost(totalCost.cacheReadCost)})`, + ); + } + console.log(`Total cost: ${formatCost(totalCost.totalCost)}`); + } + const resultsDir = "results"; if (!existsSync(resultsDir)) { mkdirSync(resultsDir, { recursive: true }); } - // Generate timestamped filenames - const jsonFilename = getTimestampedFilename("result", "json"); - const htmlFilename = getTimestampedFilename("result", "html"); + const jsonFilename = getTimestampedFilename("result", "json", modelId); + const htmlFilename = 
getTimestampedFilename("result", "html", modelId); const jsonPath = `${resultsDir}/${jsonFilename}`; const htmlPath = `${resultsDir}/${htmlFilename}`; - // Build the result data - const resultData: MultiTestResultData = { + const resultData = { tests: testResults, metadata: { mcpEnabled, mcpServerUrl: mcpEnabled ? mcpServerUrl! : null, mcpTransportType: mcpEnabled ? mcpTransportType : null, timestamp: new Date().toISOString(), - model, + model: modelId, + pricingKey: pricingLookup?.matchedKey ?? null, + pricing: pricingInfo, + totalCost, }, }; - // Save result JSON writeFileSync(jsonPath, JSON.stringify(resultData, null, 2)); console.log(`\n✓ Results saved to ${jsonPath}`); - // Generate HTML report await generateReport(jsonPath, htmlPath); } - // Exit with appropriate code + cleanupOutputsDirectory(); + process.exit(totalFailed > 0 ? 1 : 0); } diff --git a/lib/output-test-runner.ts b/lib/output-test-runner.ts index eedf615..59de519 100644 --- a/lib/output-test-runner.ts +++ b/lib/output-test-runner.ts @@ -27,79 +27,56 @@ export interface TestVerificationResult { failedTests?: FailedTest[]; } -/** - * Ensure the outputs directory exists and is clean - */ -export function setupOutputsDirectory(): void { +export function setupOutputsDirectory() { if (existsSync(OUTPUTS_DIR)) { rmSync(OUTPUTS_DIR, { recursive: true, force: true }); } mkdirSync(OUTPUTS_DIR, { recursive: true }); } -/** - * Clean up the outputs directory - */ -export function cleanupOutputsDirectory(): void { +export function cleanupOutputsDirectory() { if (existsSync(OUTPUTS_DIR)) { rmSync(OUTPUTS_DIR, { recursive: true, force: true }); } } -/** - * Prepare the outputs directory for a specific test - * - Creates a subdirectory for the test - * - Copies the test.ts file - * - Writes the LLM-generated component - */ export function prepareTestEnvironment( test: TestDefinition, componentCode: string, -): string { +) { const testDir = join(OUTPUTS_DIR, test.name); - // Create the test directory if (existsSync(testDir)) { rmSync(testDir, { recursive: true, force: true }); } mkdirSync(testDir, { recursive: true }); - // Write the LLM-generated component as Component.svelte const componentPath = join(testDir, "Component.svelte"); writeFileSync(componentPath, componentCode, "utf-8"); - // Copy the test file const testFilePath = join(testDir, "test.ts"); copyFileSync(test.testFile, testFilePath); return testDir; } -/** - * Clean up a specific test's output directory - */ -export function cleanupTestEnvironment(testName: string): void { +export function cleanupTestEnvironment(testName: string) { const testDir = join(OUTPUTS_DIR, testName); if (existsSync(testDir)) { rmSync(testDir, { recursive: true, force: true }); } } -/** - * Run vitest on the generated component and return the results - */ export async function runTestVerification( test: TestDefinition, componentCode: string, -): Promise { +) { const startTime = Date.now(); try { - // Prepare the test environment const testDir = prepareTestEnvironment(test, componentCode); const testFilePath = join(testDir, "test.ts"); - // Run vitest programmatically const vitest = await startVitest("test", [testFilePath], { watch: false, reporters: ["verbose"], @@ -120,10 +97,9 @@ export async function runTestVerification( await vitest.close(); const testModules = vitest.state.getTestModules(); - const failedTests: FailedTest[] = []; - const allErrors: string[] = []; + const failedTests = []; + const allErrors = []; - // Get unhandled errors const unhandledErrors = 
vitest.state.getUnhandledErrors(); for (const error of unhandledErrors) { const errorMessage = @@ -131,7 +107,6 @@ export async function runTestVerification( allErrors.push(errorMessage); } - // Calculate success/failure let passed = true; let numTests = 0; let numFailed = 0; @@ -154,7 +129,6 @@ export async function runTestVerification( passed = false; } - // Add module errors const moduleErrors = module.errors(); for (const error of moduleErrors) { if (error.message) { @@ -176,7 +150,6 @@ export async function runTestVerification( if (result.state === "failed") { numFailed++; - // Build full test name from ancestor titles const ancestorTitles: string[] = []; let parent = t.parent; while (parent && "name" in parent) { @@ -195,7 +168,6 @@ export async function runTestVerification( ? `${ancestorTitles.join(" > ")} > ${t.name}` : t.name; - // Collect error messages const errorMessages: string[] = []; if (result.errors) { for (const testError of result.errors) { diff --git a/lib/pricing.test.ts b/lib/pricing.test.ts new file mode 100644 index 0000000..c5edfc8 --- /dev/null +++ b/lib/pricing.test.ts @@ -0,0 +1,345 @@ +import { describe, it, expect } from "vitest"; +import { + extractPricingFromGatewayModel, + buildPricingMap, + lookupPricingFromMap, + calculateCost, + formatCost, + formatMTokCost, + getModelPricingDisplay, + type ModelPricing, + type GatewayModel, +} from "./pricing.ts"; + +describe("extractPricingFromGatewayModel", () => { + it("should extract pricing from a gateway model with all fields", () => { + const model: GatewayModel = { + id: "anthropic/claude-opus-4.5", + name: "Claude Opus 4.5", + pricing: { + input: "0.000005", + output: "0.000025", + cachedInputTokens: "0.0000005", + cacheCreationInputTokens: "0.00000625", + }, + modelType: "language", + }; + + const pricing = extractPricingFromGatewayModel(model); + + expect(pricing).not.toBeNull(); + expect(pricing!.inputCostPerToken).toBe(0.000005); + expect(pricing!.outputCostPerToken).toBe(0.000025); + expect(pricing!.cacheReadInputTokenCost).toBe(0.0000005); + expect(pricing!.cacheCreationInputTokenCost).toBe(0.00000625); + }); + + it("should extract pricing with only input and output", () => { + const model: GatewayModel = { + id: "openai/gpt-4o", + name: "GPT-4o", + pricing: { + input: "0.000003", + output: "0.000015", + }, + modelType: "language", + }; + + const pricing = extractPricingFromGatewayModel(model); + + expect(pricing).not.toBeNull(); + expect(pricing!.inputCostPerToken).toBe(0.000003); + expect(pricing!.outputCostPerToken).toBe(0.000015); + expect(pricing!.cacheReadInputTokenCost).toBeUndefined(); + }); + + it("should return null for model without pricing", () => { + const model: GatewayModel = { + id: "local/model", + name: "Local Model", + modelType: "language", + }; + + const pricing = extractPricingFromGatewayModel(model); + expect(pricing).toBeNull(); + }); + + it("should throw error for model with empty pricing object", () => { + const model: GatewayModel = { + id: "local/model", + name: "Local Model", + pricing: {}, + modelType: "language", + }; + + expect(() => extractPricingFromGatewayModel(model)).toThrowError( + /Invalid pricing/, + ); + }); + + it("should throw error for invalid pricing values", () => { + const model: GatewayModel = { + id: "test/model", + name: "Test Model", + pricing: { + input: "invalid", + output: "0.000015", + }, + modelType: "language", + }; + + expect(() => extractPricingFromGatewayModel(model)).toThrowError( + /Invalid pricing/, + ); + }); +}); + 
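+// Illustrative extra case (not part of the original patch): buildPricingMap
+// stores an explicit null for models without pricing, so callers can tell
+// "model exists but is unpriced" apart from "model unknown".
+describe("buildPricingMap null entries", () => {
+  it("maps a model without pricing to null rather than omitting it", () => {
+    const models: GatewayModel[] = [
+      { id: "local/model", name: "Local Model", modelType: "language" },
+    ];
+
+    const map = buildPricingMap(models);
+
+    expect(map.has("local/model")).toBe(true);
+    expect(map.get("local/model")).toBeNull();
+  });
+});
+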
+describe("buildPricingMap", () => { + it("should build a map from gateway models", () => { + const models: GatewayModel[] = [ + { + id: "anthropic/claude-sonnet-4", + name: "Claude Sonnet 4", + pricing: { input: "0.000003", output: "0.000015" }, + modelType: "language", + }, + { + id: "openai/gpt-4o", + name: "GPT-4o", + pricing: { input: "0.000005", output: "0.000015" }, + modelType: "language", + }, + { + id: "local/model", + name: "Local Model", + modelType: "language", + }, + ]; + + const map = buildPricingMap(models); + + expect(map.size).toBe(3); + expect(map.get("anthropic/claude-sonnet-4")).not.toBeNull(); + expect(map.get("openai/gpt-4o")).not.toBeNull(); + expect(map.get("local/model")).toBeNull(); + }); +}); + +describe("lookupPricingFromMap", () => { + it("should return pricing lookup for existing model", () => { + const models: GatewayModel[] = [ + { + id: "anthropic/claude-sonnet-4", + name: "Claude Sonnet 4", + pricing: { input: "0.000003", output: "0.000015" }, + modelType: "language", + }, + ]; + + const map = buildPricingMap(models); + const lookup = lookupPricingFromMap("anthropic/claude-sonnet-4", map); + + expect(lookup).not.toBeNull(); + expect(lookup!.matchedKey).toBe("anthropic/claude-sonnet-4"); + expect(lookup!.pricing.inputCostPerToken).toBe(0.000003); + }); + + it("should return null for non-existent model", () => { + const map = buildPricingMap([]); + const lookup = lookupPricingFromMap("non/existent", map); + expect(lookup).toBeNull(); + }); +}); + +describe("calculateCost", () => { + const basePricing: ModelPricing = { + inputCostPerToken: 0.000003, // $3 per MTok + outputCostPerToken: 0.000015, // $15 per MTok + }; + + const pricingWithCache: ModelPricing = { + ...basePricing, + cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok (10% of input) + }; + + describe("basic cost calculation", () => { + it("should calculate cost with no cached tokens", () => { + const result = calculateCost(basePricing, 1000, 500, 0); + + expect(result.inputTokens).toBe(1000); + expect(result.outputTokens).toBe(500); + expect(result.cachedInputTokens).toBe(0); + expect(result.inputCost).toBe(0.003); // 1000 * $3/MTok + expect(result.outputCost).toBeCloseTo(0.0075); // 500 * $15/MTok + expect(result.cacheReadCost).toBe(0); + expect(result.totalCost).toBe(0.0105); + }); + + it("should default cachedInputTokens to 0", () => { + const result = calculateCost(basePricing, 1000, 500); + + expect(result.cachedInputTokens).toBe(0); + expect(result.inputCost).toBe(0.003); + }); + }); + + describe("cached token billing", () => { + it("should bill cached tokens at reduced rate", () => { + // 1000 input tokens, 800 are cached + const result = calculateCost(pricingWithCache, 1000, 500, 800); + + expect(result.inputTokens).toBe(1000); + expect(result.cachedInputTokens).toBe(800); + // Uncached: 200 tokens * $3/MTok = $0.0006 + expect(result.inputCost).toBeCloseTo(0.0006); + // Cached: 800 tokens * $0.30/MTok = $0.00024 + expect(result.cacheReadCost).toBeCloseTo(0.00024); + // Output: 500 * $15/MTok = $0.0075 + expect(result.outputCost).toBeCloseTo(0.0075); + expect(result.totalCost).toBeCloseTo(0.00834); + }); + + it("should treat cached tokens as free when no cache rate specified", () => { + // Using basePricing which has no cacheReadInputTokenCost + const result = calculateCost(basePricing, 1000, 500, 800); + + // Only 200 uncached tokens should be billed + expect(result.inputCost).toBeCloseTo(0.0006); + expect(result.cacheReadCost).toBe(0); + }); + + it("should handle all tokens being 
cached", () => { + const result = calculateCost(pricingWithCache, 1000, 500, 1000); + + expect(result.inputCost).toBe(0); + expect(result.cacheReadCost).toBe(0.0003); // 1000 * $0.30/MTok + }); + }); + + describe("edge cases", () => { + it("should handle zero tokens", () => { + const result = calculateCost(basePricing, 0, 0, 0); + + expect(result.inputCost).toBe(0); + expect(result.outputCost).toBe(0); + expect(result.cacheReadCost).toBe(0); + expect(result.totalCost).toBe(0); + }); + + it("should handle large token counts", () => { + const result = calculateCost(basePricing, 1_000_000, 500_000, 0); + + expect(result.inputCost).toBe(3); // 1M * $3/MTok + expect(result.outputCost).toBe(7.5); // 500K * $15/MTok + expect(result.totalCost).toBe(10.5); + }); + + it("should handle pricing with zero costs", () => { + const freePricing: ModelPricing = { + inputCostPerToken: 0, + outputCostPerToken: 0, + }; + const result = calculateCost(freePricing, 1000, 500, 0); + + expect(result.totalCost).toBe(0); + }); + }); +}); + +describe("formatCost", () => { + it('should format zero as "$0.00"', () => { + expect(formatCost(0)).toBe("$0.00"); + }); + + it("should format very small costs with 6 decimal places", () => { + expect(formatCost(0.000123)).toBe("$0.000123"); + expect(formatCost(0.001)).toBe("$0.001000"); + expect(formatCost(0.0099)).toBe("$0.009900"); + }); + + it("should format small costs with 4 decimal places", () => { + expect(formatCost(0.01)).toBe("$0.0100"); + expect(formatCost(0.1234)).toBe("$0.1234"); + expect(formatCost(0.99)).toBe("$0.9900"); + }); + + it("should format costs >= $1 with 2 decimal places", () => { + expect(formatCost(1)).toBe("$1.00"); + expect(formatCost(1.234)).toBe("$1.23"); + expect(formatCost(10.5)).toBe("$10.50"); + expect(formatCost(100)).toBe("$100.00"); + }); +}); + +describe("formatMTokCost", () => { + it('should format zero as "$0"', () => { + expect(formatMTokCost(0)).toBe("$0"); + }); + + it("should format very small per-MTok costs with 4 decimal places", () => { + expect(formatMTokCost(0.001)).toBe("$0.0010"); + expect(formatMTokCost(0.0099)).toBe("$0.0099"); + }); + + it("should format per-MTok costs >= $0.01 with 2 decimal places", () => { + expect(formatMTokCost(0.01)).toBe("$0.01"); + expect(formatMTokCost(0.3)).toBe("$0.30"); + expect(formatMTokCost(3)).toBe("$3.00"); + expect(formatMTokCost(15)).toBe("$15.00"); + }); +}); + +describe("getModelPricingDisplay", () => { + it("should convert per-token costs to per-MTok", () => { + const pricing: ModelPricing = { + inputCostPerToken: 0.000003, // $3 per MTok + outputCostPerToken: 0.000015, // $15 per MTok + }; + + const display = getModelPricingDisplay(pricing); + + expect(display.inputCostPerMTok).toBe(3); + expect(display.outputCostPerMTok).toBe(15); + expect(display.cacheReadCostPerMTok).toBeUndefined(); + }); + + it("should include cache read cost when available", () => { + const pricing: ModelPricing = { + inputCostPerToken: 0.000003, + outputCostPerToken: 0.000015, + cacheReadInputTokenCost: 0.0000003, // $0.30 per MTok + }; + + const display = getModelPricingDisplay(pricing); + + expect(display.inputCostPerMTok).toBe(3); + expect(display.outputCostPerMTok).toBe(15); + expect(display.cacheReadCostPerMTok).toBe(0.3); + }); + + it("should handle zero costs", () => { + const pricing: ModelPricing = { + inputCostPerToken: 0, + outputCostPerToken: 0, + }; + + const display = getModelPricingDisplay(pricing); + + expect(display.inputCostPerMTok).toBe(0); + expect(display.outputCostPerMTok).toBe(0); + }); + 
+ it("should preserve explicit zero cost for cache read", () => { + const pricing: ModelPricing = { + inputCostPerToken: 0.000003, + outputCostPerToken: 0.000015, + cacheReadInputTokenCost: 0, + }; + + const display = getModelPricingDisplay(pricing); + + expect(display.inputCostPerMTok).toBe(3); + expect(display.outputCostPerMTok).toBe(15); + expect(display.cacheReadCostPerMTok).toBe(0); + }); +}); diff --git a/lib/pricing.ts b/lib/pricing.ts new file mode 100644 index 0000000..46429fa --- /dev/null +++ b/lib/pricing.ts @@ -0,0 +1,171 @@ +export interface ModelPricing { + inputCostPerToken: number; + outputCostPerToken: number; + cacheReadInputTokenCost?: number; + cacheCreationInputTokenCost?: number; +} + +export interface CostCalculation { + inputCost: number; + outputCost: number; + cacheReadCost: number; + totalCost: number; + inputTokens: number; + outputTokens: number; + cachedInputTokens: number; +} + +export interface ModelPricingDisplay { + inputCostPerMTok: number; + outputCostPerMTok: number; + cacheReadCostPerMTok?: number; +} + +export interface ModelPricingLookup { + pricing: ModelPricing; + matchedKey: string; +} + +export interface GatewayPricing { + input?: string; + output?: string; + cachedInputTokens?: string; + cacheCreationInputTokens?: string; +} + +export interface GatewayModel { + id: string; + name: string; + description?: string; + pricing?: GatewayPricing; + specification?: { + specificationVersion: string; + provider: string; + modelId: string; + }; + modelType: string; +} + +export function extractPricingFromGatewayModel( + model: GatewayModel, +) { + if (!model.pricing) { + return null; + } + + const { pricing } = model; + + const inputCost = pricing.input ? parseFloat(pricing.input) : NaN; + const outputCost = pricing.output ? parseFloat(pricing.output) : NaN; + + if (isNaN(inputCost) || isNaN(outputCost)) { + throw new Error( + `Invalid pricing for model ${model.id}: input and output pricing must be valid numbers.`, + ); + } + + const result: ModelPricing = { + inputCostPerToken: inputCost, + outputCostPerToken: outputCost, + }; + + if (pricing.cachedInputTokens) { + const cached = parseFloat(pricing.cachedInputTokens); + if (!isNaN(cached)) { + result.cacheReadInputTokenCost = cached; + } + } + + if (pricing.cacheCreationInputTokens) { + const creation = parseFloat(pricing.cacheCreationInputTokens); + if (!isNaN(creation)) { + result.cacheCreationInputTokenCost = creation; + } + } + + return result; +} + +export function buildPricingMap( + models: GatewayModel[], +) { + const map = new Map(); + + for (const model of models) { + const pricing = extractPricingFromGatewayModel(model); + if (pricing) { + map.set(model.id, { + pricing, + matchedKey: model.id, + }); + } else { + map.set(model.id, null); + } + } + + return map; +} + +export function lookupPricingFromMap( + modelId: string, + pricingMap: Map, +) { + return pricingMap.get(modelId) ?? null; +} + +export function getModelPricingDisplay( + pricing: ModelPricing, +) { + return { + inputCostPerMTok: pricing.inputCostPerToken * 1_000_000, + outputCostPerMTok: pricing.outputCostPerToken * 1_000_000, + cacheReadCostPerMTok: + pricing.cacheReadInputTokenCost !== undefined + ? 
pricing.cacheReadInputTokenCost * 1_000_000
+        : undefined,
+  };
+}
+
+export function calculateCost(
+  pricing: ModelPricing,
+  inputTokens: number,
+  outputTokens: number,
+  cachedInputTokens: number = 0,
+) {
+  const uncachedInputTokens = inputTokens - cachedInputTokens;
+  const inputCost = uncachedInputTokens * pricing.inputCostPerToken;
+
+  const outputCost = outputTokens * pricing.outputCostPerToken;
+
+  const cacheReadCost =
+    cachedInputTokens * (pricing.cacheReadInputTokenCost ?? 0);
+
+  return {
+    inputCost,
+    outputCost,
+    cacheReadCost,
+    totalCost: inputCost + outputCost + cacheReadCost,
+    inputTokens,
+    outputTokens,
+    cachedInputTokens,
+  };
+}
+
+export function formatCost(cost: number) {
+  if (cost === 0) return "$0.00";
+  if (cost < 0.01) {
+    return `$${cost.toFixed(6)}`;
+  }
+  if (cost < 1) {
+    return `$${cost.toFixed(4)}`;
+  }
+  return `$${cost.toFixed(2)}`;
+}
+
+export function formatMTokCost(costPerMTok: number) {
+  if (costPerMTok === 0) return "$0";
+  if (costPerMTok < 0.01) {
+    return `$${costPerMTok.toFixed(4)}`;
+  }
+  return `$${costPerMTok.toFixed(2)}`;
+}
diff --git a/lib/report-template.ts b/lib/report-template.ts
index ab3d932..2ecb63a 100644
--- a/lib/report-template.ts
+++ b/lib/report-template.ts
@@ -1,8 +1,8 @@
 import type { TestVerificationResult } from "./output-test-runner.ts";
 import type { MultiTestResultData, SingleTestResult } from "./report.ts";
 import { getReportStyles } from "./report-styles.ts";
+import { formatCost, formatMTokCost } from "./pricing.ts";
 
-// Type definitions for content blocks
 interface TextBlock {
   type: "text";
   text: string;
@@ -42,10 +42,7 @@ interface Step {
   [key: string]: unknown;
 }
 
-/**
- * Escape HTML special characters
- */
-function escapeHtml(text: string): string {
+function escapeHtml(text: string) {
   const map: Record<string, string> = {
     "&": "&amp;",
     "<": "&lt;",
@@ -60,10 +57,7 @@ function escapeHtml(text: string) {
   return result;
 }
 
-/**
- * Format timestamp to readable date
- */
-function formatTimestamp(timestamp: string): string {
+function formatTimestamp(timestamp: string) {
   const date = new Date(timestamp);
   return date.toLocaleString("en-US", {
     year: "numeric",
@@ -75,18 +69,12 @@
   });
 }
 
-/**
- * Get first N lines of code
- */
-function getFirstLines(code: string, numLines: number): string {
+function getFirstLines(code: string, numLines: number) {
   const lines = code.split("\n");
   return lines.slice(0, numLines).join("\n");
 }
 
-/**
- * Render a single content block based on its type
- */
-function renderContentBlock(block: ContentBlock): string {
+function renderContentBlock(block: ContentBlock) {
   if (block.type === "text") {
     return `
      <div class="text-block">${escapeHtml(block.text)}</div>
    `;
  } else if (block.type === "tool-call") {
@@ -110,12 +98,9 @@ function renderContentBlock(block: ContentBlock) {
   return "";
 }
 
-/**
- * Render verification result section
- */
 function renderVerificationResult(
   verification: TestVerificationResult | null,
-): string {
+) {
   if (!verification) {
     return `
 
@@ -159,10 +144,7 @@ function renderVerificationResult(
   `;
 }
 
-/**
- * Render steps for a single test
- */
-function renderSteps(steps: Step[]): string {
+function renderSteps(steps: Step[]) {
   return steps
     .map((step, index) => {
       const assistantContentHtml =
@@ -195,10 +177,7 @@ function renderSteps(steps: Step[]) {
     .join("\n");
 }
 
-/**
- * Render a single test's section
- */
-function renderTestSection(test: SingleTestResult, index: number): string {
+function renderTestSection(test: SingleTestResult, index: number) {
   const totalTokens = test.steps.reduce(
     (sum, step) => sum + step.usage.totalTokens,
     0,
@@ -218,7 +197,6 @@ function renderTestSection(test: SingleTestResult, index: number) {
   const stepsHtml = renderSteps(test.steps);
   const verificationHtml = renderVerificationResult(test.verification);
 
-  // Generate unique ID for this test's component code
   const componentId = `component-${test.testName.replace(/[^a-zA-Z0-9]/g, "-")}`;
 
   const resultWriteHtml = test.resultWriteContent
@@ -267,10 +245,194 @@ function renderTestSection(test: SingleTestResult, index: number) {
   `;
 }
 
-/**
- * Generate HTML report from multi-test result data
- */
-export function generateMultiTestHtml(data: MultiTestResultData): string {
+function renderPricingSection(data: MultiTestResultData) {
+  const { metadata } = data;
+  const { pricing, totalCost, pricingKey } = metadata;
+
+  if (!pricing && !totalCost) {
+    return "";
+  }
+
+  let pricingInfoHtml = "";
+  if (pricing) {
+    const pricingKeyDisplay = pricingKey
+      ? `<span class="pricing-key">${escapeHtml(pricingKey)}</span>`
+      : "";
+
+    pricingInfoHtml = `
+      <div class="pricing-rates">
+        <span class="rate-label">Model Pricing:</span>
+        ${pricingKeyDisplay}
+        <span class="rate-value">${formatMTokCost(pricing.inputCostPerMTok)}/MTok in</span>
+        <span class="rate-separator">·</span>
+        <span class="rate-value">${formatMTokCost(pricing.outputCostPerMTok)}/MTok out</span>
+        ${pricing.cacheReadCostPerMTok !== undefined ? `<span class="rate-separator">·</span> <span class="rate-value">${formatMTokCost(pricing.cacheReadCostPerMTok)}/MTok cached</span>` : ""}
+      </div>
+    `;
+  }
+
+  let costBreakdownHtml = "";
+  if (totalCost) {
+    const uncachedInputTokens =
+      totalCost.inputTokens - totalCost.cachedInputTokens;
+
+    costBreakdownHtml = `
+      <div class="cost-breakdown">
+        <div class="cost-row">
+          <span class="cost-label">Input tokens:</span>
+          <span class="cost-tokens">${uncachedInputTokens.toLocaleString()}</span>
+          <span class="cost-value">${formatCost(totalCost.inputCost)}</span>
+        </div>
+        <div class="cost-row">
+          <span class="cost-label">Output tokens:</span>
+          <span class="cost-tokens">${totalCost.outputTokens.toLocaleString()}</span>
+          <span class="cost-value">${formatCost(totalCost.outputCost)}</span>
+        </div>
+        ${
+          totalCost.cachedInputTokens > 0
+            ? `
+        <div class="cost-row cached">
+          <span class="cost-label">Cached tokens:</span>
+          <span class="cost-tokens">${totalCost.cachedInputTokens.toLocaleString()} ⚡</span>
+          <span class="cost-value">${formatCost(totalCost.cacheReadCost)}</span>
+        </div>
+        `
+            : ""
+        }
+        <div class="cost-row total">
+          <span class="cost-label">Total Cost:</span>
+          <span class="cost-tokens"></span>
+          <span class="cost-value">${formatCost(totalCost.totalCost)}</span>
+        </div>
+      </div>
+    `;
+  }
+
+  return `
+    <div class="pricing-section">
+      <div class="pricing-header">
+        <span class="pricing-icon">💰</span>
+        <span class="pricing-title">Cost Summary</span>
+      </div>
+      ${pricingInfoHtml}
+      ${costBreakdownHtml}
+    </div>
+  `;
+}
+
+function getPricingStyles() {
+  return `
+    .pricing-section {
+      background: var(--surface);
+      border: 1px solid var(--border);
+      border-radius: 4px;
+      padding: 12px;
+      margin-bottom: 12px;
+    }
+
+    .pricing-header {
+      display: flex;
+      align-items: center;
+      gap: 8px;
+      margin-bottom: 12px;
+      font-weight: 600;
+    }
+
+    .pricing-icon {
+      font-size: 16px;
+    }
+
+    .pricing-title {
+      font-size: 14px;
+    }
+
+    .pricing-rates {
+      display: flex;
+      align-items: center;
+      gap: 8px;
+      font-size: 12px;
+      color: var(--text-muted);
+      margin-bottom: 12px;
+      padding-bottom: 12px;
+      border-bottom: 1px solid var(--border);
+      flex-wrap: wrap;
+    }
+
+    .rate-label {
+      font-weight: 500;
+    }
+
+    .pricing-key {
+      font-family: 'JetBrains Mono', monospace;
+      background: var(--bg);
+      padding: 2px 6px;
+      border-radius: 3px;
+      border: 1px solid var(--border);
+      color: var(--text);
+      font-size: 11px;
+    }
+
+    .rate-value {
+      font-family: 'JetBrains Mono', monospace;
+    }
+
+    .rate-separator {
+      color: var(--border);
+    }
+
+    .cost-breakdown {
+      display: flex;
+      flex-direction: column;
+      gap: 6px;
+    }
+
+    .cost-row {
+      display: grid;
+      grid-template-columns: 120px 1fr auto;
+      gap: 8px;
+      align-items: center;
+      font-size: 13px;
+    }
+
+    .cost-row.cached {
+      color: var(--text-muted);
+    }
+
+    .cost-row.total {
+      margin-top: 8px;
+      padding-top: 8px;
+      border-top: 1px solid var(--border);
+      font-weight: 600;
+    }
+
+    .cost-label {
+      color: var(--text-muted);
+    }
+
+    .cost-row.total .cost-label {
+      color: var(--text);
+    }
+
+    .cost-tokens {
+      font-family: 'JetBrains Mono', monospace;
+      text-align: right;
+    }
+
+    .cost-value {
+      font-family: 'JetBrains Mono', monospace;
+      font-weight: 500;
+      text-align: right;
+      min-width: 80px;
+    }
+
+    .cost-row.total .cost-value {
+      color: var(--success);
+      font-size: 15px;
+    }
+  `;
+}
+
+export function generateMultiTestHtml(data: MultiTestResultData) {
   const metadata = data.metadata;
   const totalTests = data.tests.length;
   const passedTests = data.tests.filter((t) => t.verification?.passed).length;
@@ -299,6 +461,10 @@ export function generateMultiTestHtml(data: MultiTestResultData) {
     `
       : "";
 
+  const costDisplay = metadata.totalCost
+    ? `<span class="cost-badge">${formatCost(metadata.totalCost.totalCost)}</span>`
+    : "";
+
   const overallStatus =
     failedTests === 0 && skippedTests === 0
       ? "all-passed"
@@ -310,7 +476,22 @@ export function generateMultiTestHtml(data: MultiTestResultData) {
     .map((test, index) => renderTestSection(test, index))
     .join("\n");
 
-  const styles = getReportStyles();
+  const pricingHtml = renderPricingSection(data);
+
+  const styles =
+    getReportStyles() +
+    getPricingStyles() +
+    `
+    .cost-badge {
+      background: var(--success);
+      color: white;
+      font-size: 11px;
+      padding: 2px 6px;
+      border-radius: 3px;
+      font-weight: 500;
+      font-family: 'JetBrains Mono', monospace;
+    }
+  `;
 
   return `
@@ -324,7 +505,7 @@ export function generateMultiTestHtml(data: MultiTestResultData): string {
-      <h1>SvelteBench 2.0 ${mcpBadge}</h1>
+      <h1>SvelteBench 2.0 ${mcpBadge} ${costDisplay}</h1>
       <div class="subtitle">${escapeHtml(metadata.model)} · ${totalTests} tests · ${totalTokens.toLocaleString()} tokens · ${formatTimestamp(metadata.timestamp)}</div>
@@ -338,12 +519,14 @@ export function generateMultiTestHtml(data: MultiTestResultData): string { ${mcpNotice} + ${pricingHtml} + ${testsHtml} `; diff --git a/lib/report.ts b/lib/report.ts index 14e2aff..c7e50bf 100644 --- a/lib/report.ts +++ b/lib/report.ts @@ -2,7 +2,6 @@ import { readFile, writeFile } from "node:fs/promises"; import type { TestVerificationResult } from "./output-test-runner.ts"; import { generateMultiTestHtml } from "./report-template.ts"; -// Type definitions for result.json structure interface TextBlock { type: "text"; text: string; @@ -66,15 +65,33 @@ interface Step { [key: string]: unknown; } +export interface PricingInfo { + inputCostPerMTok: number; + outputCostPerMTok: number; + cacheReadCostPerMTok?: number; +} + +export interface TotalCostInfo { + inputCost: number; + outputCost: number; + cacheReadCost: number; + totalCost: number; + inputTokens: number; + outputTokens: number; + cachedInputTokens: number; +} + interface Metadata { mcpEnabled: boolean; mcpServerUrl: string | null; mcpTransportType?: string | null; timestamp: string; model: string; + pricingKey?: string | null; + pricing?: PricingInfo | null; + totalCost?: TotalCostInfo | null; } -// Single test result within a multi-test run export interface SingleTestResult { testName: string; prompt: string; @@ -83,45 +100,33 @@ export interface SingleTestResult { verification: TestVerificationResult | null; } -// Multi-test result data structure export interface MultiTestResultData { tests: SingleTestResult[]; metadata: Metadata; } -// Legacy single-test result data structure (for backward compatibility) interface LegacyResultData { steps: Step[]; resultWriteContent?: string | null; metadata?: Metadata; } -/** - * Generate HTML report from result.json file - * Supports both legacy single-test and new multi-test formats - * @param resultPath - Path to the result.json file - * @param outputPath - Path where the HTML report will be saved - * @param openBrowser - Whether to open the report in the default browser (default: true) - */ export async function generateReport( resultPath: string, outputPath: string, openBrowser = true, -): Promise { +) { try { - // Read and parse the result.json file const jsonContent = await readFile(resultPath, "utf-8"); const data = JSON.parse(jsonContent); - let html: string; + let html; - // Check if it's the new multi-test format if ("tests" in data && Array.isArray(data.tests)) { html = generateMultiTestHtml(data as MultiTestResultData); } else { - // Legacy format - convert to multi-test format for consistent rendering const legacyData = data as LegacyResultData; - const multiTestData: MultiTestResultData = { + const multiTestData = { tests: [ { testName: "Legacy Test", @@ -141,12 +146,10 @@ export async function generateReport( html = generateMultiTestHtml(multiTestData); } - // Write the HTML file await writeFile(outputPath, html, "utf-8"); console.log(`✓ Report generated successfully: ${outputPath}`); - // Open the report in the default browser if (openBrowser) { Bun.spawn(["open", outputPath]); } diff --git a/lib/test-discovery.ts b/lib/test-discovery.ts index 6329cde..414292b 100644 --- a/lib/test-discovery.ts +++ b/lib/test-discovery.ts @@ -11,12 +11,9 @@ export interface TestDefinition { prompt: string; } -/** - * Discover all test suites in the tests/ directory and load their prompts - */ -export function discoverTests(): TestDefinition[] { +export function discoverTests() { const testsDir = join(process.cwd(), "tests"); - const definitions: TestDefinition[] = []; + 
 
   try {
     const entries = readdirSync(testsDir);
@@ -31,13 +28,11 @@ export function discoverTests(): TestDefinition[] {
       const promptFile = join(entryPath, "prompt.md");
       const componentFile = join(entryPath, "Component.svelte");
 
-      // Validate that required files exist
       if (
         existsSync(referenceFile) &&
         existsSync(testFile) &&
         existsSync(promptFile)
       ) {
-        // Load the prompt content
         const prompt = readFileSync(promptFile, "utf-8");
 
         definitions.push({
@@ -50,7 +45,7 @@ export function discoverTests(): TestDefinition[] {
           prompt,
         });
       } else {
-        const missing: string[] = [];
+        const missing = [];
         if (!existsSync(referenceFile)) missing.push("Reference.svelte");
         if (!existsSync(testFile)) missing.push("test.ts");
         if (!existsSync(promptFile)) missing.push("prompt.md");
@@ -62,16 +57,12 @@
     console.error("Error discovering tests:", error);
   }
 
-  // Sort by name for consistent ordering
   definitions.sort((a, b) => a.name.localeCompare(b.name));
 
   return definitions;
 }
 
-/**
- * Build a prompt for the AI agent including the test requirements
- */
-export function buildAgentPrompt(test: TestDefinition): string {
+export function buildAgentPrompt(test: TestDefinition) {
   return `${test.prompt}
 
 IMPORTANT: When you have finished implementing the component, use the ResultWrite tool to output your final Svelte component code. Only output the component code itself, no explanations or markdown formatting.`;
diff --git a/lib/tools/test-component.ts b/lib/tools/test-component.ts
index 4b5c221..4f2d3dd 100644
--- a/lib/tools/test-component.ts
+++ b/lib/tools/test-component.ts
@@ -22,7 +22,6 @@ export function testComponentTool(test: TestDefinition) {
     try {
       const result = await runTestVerification(test, content);
 
-      // Clean up the test environment after running
       cleanupTestEnvironment(test.name);
 
       if (result.passed) {
@@ -54,7 +53,6 @@ export function testComponentTool(test: TestDefinition) {
         };
       }
     } catch (error) {
-      // Ensure cleanup even on error
       cleanupTestEnvironment(test.name);
       console.log(`[TestComponent] ✗ Error running tests`);
       return {
diff --git a/lib/utils.test.ts b/lib/utils.test.ts
new file mode 100644
index 0000000..c3ad8df
--- /dev/null
+++ b/lib/utils.test.ts
@@ -0,0 +1,192 @@
+import { describe, it, expect } from "vitest";
+import {
+  sanitizeModelName,
+  getTimestampedFilename,
+  calculateTotalCost,
+} from "./utils.ts";
+import type { ModelPricing } from "./pricing.ts";
+import type { SingleTestResult } from "./report.ts";
+
+describe("sanitizeModelName", () => {
+  it("replaces slashes with dashes", () => {
+    expect(sanitizeModelName("anthropic/claude-sonnet-4")).toBe(
+      "anthropic-claude-sonnet-4",
+    );
+  });
+
+  it("replaces special characters with dashes", () => {
+    expect(sanitizeModelName("model@version")).toBe("model-version");
+    expect(sanitizeModelName("model_name")).toBe("model-name");
+    expect(sanitizeModelName("model name")).toBe("model-name");
+  });
+
+  it("preserves dots", () => {
+    expect(sanitizeModelName("gpt-4.0")).toBe("gpt-4.0");
+    expect(sanitizeModelName("model.v1.2.3")).toBe("model.v1.2.3");
+  });
+
+  it("preserves alphanumeric characters", () => {
+    expect(sanitizeModelName("gpt4o")).toBe("gpt4o");
+    expect(sanitizeModelName("claude3")).toBe("claude3");
+  });
+
+  it("handles multiple consecutive special characters", () => {
+    expect(sanitizeModelName("model///name")).toBe("model---name");
+    expect(sanitizeModelName("model@#$name")).toBe("model---name");
+  });
+});
+
+describe("getTimestampedFilename", () => {
+  const fixedDate = new Date("2025-12-12T14:30:45Z");
+
+  it("generates filename without model name", () => {
+    const result = getTimestampedFilename(
+      "result",
+      "json",
+      undefined,
+      fixedDate,
+    );
+    expect(result).toBe("result-2025-12-12-14-30-45.json");
+  });
+
+  it("generates filename with simple model name", () => {
+    const result = getTimestampedFilename(
+      "result",
+      "json",
+      "gpt-4o",
+      fixedDate,
+    );
+    expect(result).toBe("result-2025-12-12-14-30-45-gpt-4o.json");
+  });
+
+  it("generates filename with model name containing slashes", () => {
+    const result = getTimestampedFilename(
+      "result",
+      "json",
+      "anthropic/claude-sonnet-4",
+      fixedDate,
+    );
+    expect(result).toBe(
+      "result-2025-12-12-14-30-45-anthropic-claude-sonnet-4.json",
+    );
+  });
+
+  it("generates filename with model name containing special characters", () => {
+    const result = getTimestampedFilename(
+      "result",
+      "html",
+      "model@v1.2.3",
+      fixedDate,
+    );
+    expect(result).toBe("result-2025-12-12-14-30-45-model-v1.2.3.html");
+  });
+
+  it("handles different file extensions", () => {
+    const result = getTimestampedFilename(
+      "output",
+      "txt",
+      "test-model",
+      fixedDate,
+    );
+    expect(result).toBe("output-2025-12-12-14-30-45-test-model.txt");
+  });
+
+  it("pads single-digit months and days", () => {
+    const earlyDate = new Date("2025-01-05T08:09:07Z");
+    const result = getTimestampedFilename(
+      "result",
+      "json",
+      undefined,
+      earlyDate,
+    );
+    expect(result).toBe("result-2025-01-05-08-09-07.json");
+  });
+});
+
+describe("calculateTotalCost", () => {
+  const pricing: ModelPricing = {
+    inputCostPerToken: 1.0 / 1_000_000,
+    outputCostPerToken: 2.0 / 1_000_000,
+    cacheReadInputTokenCost: 0.1 / 1_000_000,
+  };
+
+  it("calculates zero cost for empty results", () => {
+    const tests: SingleTestResult[] = [];
+    const result = calculateTotalCost(tests, pricing);
+
+    expect(result).toEqual({
+      inputCost: 0,
+      outputCost: 0,
+      cacheReadCost: 0,
+      totalCost: 0,
+      inputTokens: 0,
+      outputTokens: 0,
+      cachedInputTokens: 0,
+    });
+  });
+
+  it("aggregates usage from multiple steps and tests", () => {
+    const tests: SingleTestResult[] = [
+      {
+        testName: "test1",
+        prompt: "p1",
+        resultWriteContent: null,
+        verification: {} as any,
+        steps: [
+          {
+            usage: {
+              inputTokens: 100,
+              outputTokens: 50,
+              cachedInputTokens: 10,
+            },
+          } as any,
+          {
+            usage: {
+              inputTokens: 200,
+              outputTokens: 100,
+              cachedInputTokens: 0,
+            },
+          } as any,
+        ],
+      },
+      {
+        testName: "test2",
+        prompt: "p2",
+        resultWriteContent: null,
+        verification: {} as any,
+        steps: [
+          {
+            usage: {
+              inputTokens: 300,
+              outputTokens: 150,
+              cachedInputTokens: 20,
+            },
+          } as any,
+        ],
+      },
+    ];
+
+    // Total Input: 100 + 200 + 300 = 600
+    // Total Output: 50 + 100 + 150 = 300
+    // Total Cached: 10 + 0 + 20 = 30
+    // Uncached Input: 600 - 30 = 570
+
+    // Costs (per Token):
+    // Input: 570 * (1.0 / 1e6) = 0.00057
+    // Output: 300 * (2.0 / 1e6) = 0.0006
+    // Cache: 30 * (0.1 / 1e6) = 0.000003
+    // Total: 0.00057 + 0.0006 + 0.000003 = 0.001173
+
+    const result = calculateTotalCost(tests, pricing);
+
+    expect(result).toEqual({
+      inputCost: 0.00057,
+      outputCost: 0.0006,
+      cacheReadCost: 0.000003,
+      totalCost: 0.001173,
+      inputTokens: 600,
+      outputTokens: 300,
+      cachedInputTokens: 30,
+    });
+  });
+});
diff --git a/lib/utils.ts b/lib/utils.ts
new file mode 100644
index 0000000..e3cb01c
--- /dev/null
+++ b/lib/utils.ts
@@ -0,0 +1,86 @@
+import { calculateCost, type ModelPricing } from "./pricing.ts";
+import type { SingleTestResult, TotalCostInfo } from "./report.ts";
"./report.ts"; + +export function sanitizeModelName(modelName: string) { + return modelName.replace(/[^a-zA-Z0-9.]/g, "-"); +} + +export function getTimestampedFilename( + prefix: string, + extension: string, + modelName?: string, + now: Date = new Date(), +) { + const year = now.getUTCFullYear(); + const month = String(now.getUTCMonth() + 1).padStart(2, "0"); + const day = String(now.getUTCDate()).padStart(2, "0"); + const hours = String(now.getUTCHours()).padStart(2, "0"); + const minutes = String(now.getUTCMinutes()).padStart(2, "0"); + const seconds = String(now.getUTCSeconds()).padStart(2, "0"); + + const timestamp = `${year}-${month}-${day}-${hours}-${minutes}-${seconds}`; + const modelSuffix = modelName ? `-${sanitizeModelName(modelName)}` : ""; + + return `${prefix}-${timestamp}${modelSuffix}.${extension}`; +} + +export function isHttpUrl(str: string) { + return str.startsWith("http://") || str.startsWith("https://"); +} + +export function extractResultWriteContent(steps: unknown[]) { + for (const step of steps) { + const s = step as { + content?: Array<{ + type: string; + toolName?: string; + input?: { content: string }; + }>; + }; + if (s.content) { + for (const content of s.content) { + if ( + content.type === "tool-call" && + content.toolName === "ResultWrite" + ) { + return content.input?.content ?? null; + } + } + } + } + return null; +} + +export function calculateTotalCost( + tests: SingleTestResult[], + pricing: ModelPricing, +) { + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCachedInputTokens = 0; + + for (const test of tests) { + for (const step of test.steps) { + totalInputTokens += step.usage.inputTokens; + totalOutputTokens += step.usage.outputTokens; + totalCachedInputTokens += step.usage.cachedInputTokens ?? 
+    }
+  }
+
+  const costResult = calculateCost(
+    pricing,
+    totalInputTokens,
+    totalOutputTokens,
+    totalCachedInputTokens,
+  );
+
+  return {
+    inputCost: costResult.inputCost,
+    outputCost: costResult.outputCost,
+    cacheReadCost: costResult.cacheReadCost,
+    totalCost: costResult.totalCost,
+    inputTokens: totalInputTokens,
+    outputTokens: totalOutputTokens,
+    cachedInputTokens: totalCachedInputTokens,
+  };
+}
diff --git a/lib/verify-references.ts b/lib/verify-references.ts
index 87e767e..cbd9ee9 100644
--- a/lib/verify-references.ts
+++ b/lib/verify-references.ts
@@ -33,12 +33,9 @@ interface TestResult {
   failedTests?: FailedTest[];
 }
 
-/**
- * Load all test definitions from the tests/ directory
- */
-export function loadTestDefinitions(): TestDefinition[] {
+export function loadTestDefinitions() {
   const testsDir = join(process.cwd(), "tests");
-  const definitions: TestDefinition[] = [];
+  const definitions = [];
 
   try {
     const entries = readdirSync(testsDir);
@@ -53,7 +50,6 @@ export function loadTestDefinitions(): TestDefinition[] {
       const promptFile = join(entryPath, "prompt.md");
       const componentFile = join(entryPath, "Component.svelte");
 
-      // Validate that required files exist
       if (existsSync(referenceFile) && existsSync(testFile)) {
         definitions.push({
           name: entry,
@@ -77,17 +73,11 @@
   return definitions;
 }
 
-/**
- * Copy Reference.svelte to Component.svelte
- */
-export function copyReferenceToComponent(testDef: TestDefinition): void {
+export function copyReferenceToComponent(testDef: TestDefinition) {
   copyFileSync(testDef.referenceFile, testDef.componentFile);
 }
 
-/**
- * Clean up Component.svelte file
- */
-export function cleanupComponent(testDef: TestDefinition): void {
+export function cleanupComponent(testDef: TestDefinition) {
   if (existsSync(testDef.componentFile)) {
     try {
       unlinkSync(testDef.componentFile);
@@ -97,14 +87,10 @@
   }
 }
 
-/**
- * Run vitest on a specific test file and return the results
- */
-export async function runTest(testDef: TestDefinition): Promise<TestResult> {
+export async function runTest(testDef: TestDefinition) {
   const startTime = Date.now();
 
   try {
-    // Run vitest programmatically
     const vitest = await startVitest("test", [testDef.testFile], {
       watch: false,
       reporters: ["verbose"],
@@ -125,10 +111,9 @@
     await vitest.close();
 
     const testModules = vitest.state.getTestModules();
-    const failedTests: FailedTest[] = [];
-    const allErrors: string[] = [];
+    const failedTests = [];
+    const allErrors = [];
 
-    // Get unhandled errors
     const unhandledErrors = vitest.state.getUnhandledErrors();
     for (const error of unhandledErrors) {
       const errorMessage =
@@ -136,7 +121,6 @@
       allErrors.push(errorMessage);
     }
 
-    // Calculate success/failure
     let passed = true;
     let numTests = 0;
     let numFailed = 0;
@@ -159,7 +143,6 @@
       passed = false;
     }
 
-    // Add module errors
     const moduleErrors = module.errors();
     for (const error of moduleErrors) {
       if (error.message) {
@@ -181,7 +164,6 @@
       if (result.state === "failed") {
         numFailed++;
 
-        // Build full test name from ancestor titles
         const ancestorTitles: string[] = [];
         let parent = t.parent;
         while (parent && "name" in parent) {
@@ -200,7 +182,6 @@ export async function runTest(testDef: TestDefinition): Promise<TestResult> {
           ? `${ancestorTitles.join(" > ")} > ${t.name}`
           : t.name;
 
-        // Collect error messages
         const errorMessages: string[] = [];
         if (result.errors) {
           for (const testError of result.errors) {
@@ -254,10 +235,7 @@ export async function runTest(testDef: TestDefinition): Promise<TestResult> {
   }
 }
 
-/**
- * Print summary of test results
- */
-export function printSummary(results: TestResult[]): void {
+export function printSummary(results: TestResult[]) {
   console.log("\n=== Test Verification Summary ===\n");
 
   const totalSuites = results.length;
@@ -291,10 +269,7 @@
   }
 }
 
-/**
- * Main function to verify all reference implementations
- */
-export async function verifyAllReferences(): Promise<number> {
+export async function verifyAllReferences() {
   console.log("Discovering test suites...");
   const tests = loadTestDefinitions();
   console.log(`Found ${tests.length} test suite(s)\n`);
@@ -304,17 +279,15 @@
     return 1;
   }
 
-  const results: TestResult[] = [];
+  const results = [];
 
   for (const test of tests) {
     console.log(`Running tests/${test.name}...`);
 
     try {
-      // Copy Reference.svelte to Component.svelte
      copyReferenceToComponent(test);
      console.log(" ✓ Copied Reference.svelte → Component.svelte");
 
-      // Run the test
      const result = await runTest(test);
      results.push(result);
 
@@ -331,7 +304,6 @@
        console.log("\n Failed tests:");
        for (const failed of result.failedTests) {
          console.log(`✗ ${failed.fullName}`);
-          // Print error message with indentation
          const errorLines = failed.errorMessage.split("\n");
          for (const line of errorLines) {
            if (line.trim()) {
@@ -343,16 +315,13 @@
        }
      }
    } finally {
-      // Always cleanup Component.svelte
      cleanupComponent(test);
      console.log(" ✓ Cleaned up Component.svelte\n");
    }
  }
 
-  // Print summary
  printSummary(results);
 
-  // Return exit code
  const allPassed = results.every((r) => r.passed);
  return allPassed ? 0 : 1;
 }
diff --git a/scripts/install.sh b/scripts/install.sh
new file mode 100755
index 0000000..09a63c0
--- /dev/null
+++ b/scripts/install.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# Read the Bun version from .bunversion file
+if [ -f ".bunversion" ]; then
+  BUN_VERSION=$(cat .bunversion | tr -d '[:space:]')
+  echo "Installing Bun version: $BUN_VERSION"
+  curl -fsSL https://bun.com/install | bash -s "$BUN_VERSION"
+else
+  echo "Error: .bunversion file not found"
+  exit 1
+fi