diff --git a/.changeset/eval-scorer-cost-telemetry.md b/.changeset/eval-scorer-cost-telemetry.md new file mode 100644 index 000000000..9915238c1 --- /dev/null +++ b/.changeset/eval-scorer-cost-telemetry.md @@ -0,0 +1,11 @@ +--- +"@voltagent/core": patch +--- + +fix: emit LLM judge token and provider cost telemetry on eval scorer spans + +VoltAgent now records LLM judge model, token usage, cached tokens, reasoning tokens, +and provider-reported cost details on live eval scorer spans. + +This makes scorer-side usage visible in observability backends and enables downstream +cost aggregation to distinguish agent costs from eval scorer costs. diff --git a/packages/core/src/agent/eval.ts b/packages/core/src/agent/eval.ts index 0b0182e88..a82949cc6 100644 --- a/packages/core/src/agent/eval.ts +++ b/packages/core/src/agent/eval.ts @@ -56,6 +56,21 @@ interface ScoreMetrics { datasetMetadata?: ReturnType; } +interface JudgeTelemetry { + modelName?: string; + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + cachedTokens?: number; + reasoningTokens?: number; + providerCost?: { + cost?: number; + upstreamInferenceCost?: number; + upstreamInferenceInputCost?: number; + upstreamInferenceOutputCost?: number; + }; +} + async function resolveScorerDescriptors( config: AgentEvalConfig, host: AgentEvalHost, @@ -168,6 +183,46 @@ function createScorerSpanAttributes( if (metrics.datasetMetadata?.datasetItemHash) { attributes["eval.dataset.item_hash"] = metrics.datasetMetadata.datasetItemHash; } + const judgeTelemetry = extractJudgeTelemetry(metrics.combinedMetadata); + if (judgeTelemetry?.modelName) { + attributes["ai.model.name"] = judgeTelemetry.modelName; + const provider = judgeTelemetry.modelName.includes("/") + ? judgeTelemetry.modelName.split("/")[0] + : undefined; + if (provider) { + attributes["ai.model.provider"] = provider; + } + } + if (judgeTelemetry?.promptTokens !== undefined) { + attributes["usage.prompt_tokens"] = judgeTelemetry.promptTokens; + } + if (judgeTelemetry?.completionTokens !== undefined) { + attributes["usage.completion_tokens"] = judgeTelemetry.completionTokens; + } + if (judgeTelemetry?.totalTokens !== undefined) { + attributes["usage.total_tokens"] = judgeTelemetry.totalTokens; + } + if (judgeTelemetry?.cachedTokens !== undefined) { + attributes["usage.cached_tokens"] = judgeTelemetry.cachedTokens; + } + if (judgeTelemetry?.reasoningTokens !== undefined) { + attributes["usage.reasoning_tokens"] = judgeTelemetry.reasoningTokens; + } + if (judgeTelemetry?.providerCost?.cost !== undefined) { + attributes["usage.cost"] = judgeTelemetry.providerCost.cost; + } + if (judgeTelemetry?.providerCost?.upstreamInferenceCost !== undefined) { + attributes["usage.cost_details.upstream_inference_cost"] = + judgeTelemetry.providerCost.upstreamInferenceCost; + } + if (judgeTelemetry?.providerCost?.upstreamInferenceInputCost !== undefined) { + attributes["usage.cost_details.upstream_inference_input_cost"] = + judgeTelemetry.providerCost.upstreamInferenceInputCost; + } + if (judgeTelemetry?.providerCost?.upstreamInferenceOutputCost !== undefined) { + attributes["usage.cost_details.upstream_inference_output_cost"] = + judgeTelemetry.providerCost.upstreamInferenceOutputCost; + } if (storagePayload.userId) { attributes["user.id"] = storagePayload.userId; } @@ -1353,6 +1408,89 @@ function extractErrorMessage(error: unknown): string { } } +function extractJudgeTelemetry( + metadata: Record | null | undefined, +): JudgeTelemetry | undefined { + const record = isPlainRecord(metadata) ? (metadata as Record) : undefined; + if (!record) { + return undefined; + } + + const sources: Array | undefined> = []; + if (isPlainRecord(record.voltAgent)) { + sources.push(record.voltAgent as Record); + } + if (isPlainRecord(record.scorer)) { + sources.push(record.scorer as Record); + } + if (isPlainRecord(record.payload)) { + sources.push(record.payload as Record); + } + + for (const source of sources) { + const judge = isPlainRecord(source?.judge) + ? (source?.judge as Record) + : undefined; + if (!judge) { + continue; + } + + const usage = isPlainRecord(judge.usage) ? (judge.usage as Record) : undefined; + const providerCost = isPlainRecord(judge.providerCost) + ? (judge.providerCost as Record) + : undefined; + + const telemetry: JudgeTelemetry = { + modelName: readString(judge.model), + promptTokens: readNumber(usage?.promptTokens), + completionTokens: readNumber(usage?.completionTokens), + totalTokens: readNumber(usage?.totalTokens), + cachedTokens: readNumber(usage?.cachedInputTokens ?? usage?.cachedTokens), + reasoningTokens: readNumber(usage?.reasoningTokens), + providerCost: providerCost + ? { + cost: readNumber(providerCost.cost), + upstreamInferenceCost: readNumber(providerCost.upstreamInferenceCost), + upstreamInferenceInputCost: readNumber(providerCost.upstreamInferenceInputCost), + upstreamInferenceOutputCost: readNumber(providerCost.upstreamInferenceOutputCost), + } + : undefined, + }; + + if ( + telemetry.modelName || + telemetry.promptTokens !== undefined || + telemetry.completionTokens !== undefined || + telemetry.totalTokens !== undefined || + telemetry.cachedTokens !== undefined || + telemetry.reasoningTokens !== undefined || + telemetry.providerCost?.cost !== undefined || + telemetry.providerCost?.upstreamInferenceCost !== undefined || + telemetry.providerCost?.upstreamInferenceInputCost !== undefined || + telemetry.providerCost?.upstreamInferenceOutputCost !== undefined + ) { + return telemetry; + } + } + + return undefined; +} + +function readString(value: unknown): string | undefined { + return typeof value === "string" && value.length > 0 ? value : undefined; +} + +function readNumber(value: unknown): number | undefined { + if (typeof value === "number") { + return Number.isFinite(value) ? value : undefined; + } + if (typeof value === "string") { + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : undefined; + } + return undefined; +} + async function invokeEvalResultCallback( host: AgentEvalHost, config: AgentEvalScorerConfig, diff --git a/packages/core/src/eval/llm/create-judge-scorer.ts b/packages/core/src/eval/llm/create-judge-scorer.ts index 1aee60242..a3485f280 100644 --- a/packages/core/src/eval/llm/create-judge-scorer.ts +++ b/packages/core/src/eval/llm/create-judge-scorer.ts @@ -2,10 +2,19 @@ import { safeStringify } from "@voltagent/internal/utils"; import type { LanguageModel } from "ai"; import { generateText } from "ai"; +import { convertUsage } from "../../utils/usage-converter"; import type { LocalScorerDefinition } from "../runtime"; type DefaultPayload = Record; +type OpenRouterUsageCost = { + cost?: number; + isByok?: boolean; + upstreamInferenceCost?: number; + upstreamInferenceInputCost?: number; + upstreamInferenceOutputCost?: number; +}; + export interface LlmJudgeScorerParams extends Record { /** Optional criteria appended to the default judging instructions. */ criteria?: string; @@ -49,13 +58,16 @@ export function createLLMJudgeScorer; + providerCost?: OpenRouterUsageCost; +}): Record { + const judge: Record = { + model: input.judgeModel, + }; + + if (input.usage) { + judge.usage = input.usage; + } + + if (input.providerCost) { + judge.providerCost = input.providerCost; + } + + return judge; +} + +function extractOpenRouterUsageCost(providerMetadata: unknown): OpenRouterUsageCost | undefined { + if (!isRecord(providerMetadata)) { + return undefined; + } + + const openRouterMetadata = isRecord(providerMetadata.openrouter) + ? providerMetadata.openrouter + : undefined; + const usage = isRecord(openRouterMetadata?.usage) ? openRouterMetadata.usage : undefined; + const costDetails = isRecord(usage?.costDetails) ? usage.costDetails : undefined; + + if (!usage) { + return undefined; + } + + const result: OpenRouterUsageCost = {}; + const cost = toFiniteNumber(usage.cost); + if (cost !== undefined) { + result.cost = cost; + } + + if (typeof usage.isByok === "boolean") { + result.isByok = usage.isByok; + } + + const upstreamInferenceCost = toFiniteNumber(costDetails?.upstreamInferenceCost); + if (upstreamInferenceCost !== undefined) { + result.upstreamInferenceCost = upstreamInferenceCost; + } + + const upstreamInferenceInputCost = toFiniteNumber(costDetails?.upstreamInferenceInputCost); + if (upstreamInferenceInputCost !== undefined) { + result.upstreamInferenceInputCost = upstreamInferenceInputCost; + } + + const upstreamInferenceOutputCost = toFiniteNumber(costDetails?.upstreamInferenceOutputCost); + if (upstreamInferenceOutputCost !== undefined) { + result.upstreamInferenceOutputCost = upstreamInferenceOutputCost; + } + + return Object.keys(result).length > 0 ? result : undefined; +} + +function extractUsageFromError(error: unknown): ReturnType | undefined { + if (!isRecord(error)) { + return undefined; + } + + const usage = error.usage; + return usage ? convertUsage(usage as any) : undefined; +} + +function extractProviderCostFromError(error: unknown): OpenRouterUsageCost | undefined { + if (!isRecord(error)) { + return undefined; + } + + return extractOpenRouterUsageCost(error.providerMetadata); +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function toFiniteNumber(value: unknown): number | undefined { + if (typeof value === "number") { + return Number.isFinite(value) ? value : undefined; + } + + if (typeof value === "string") { + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : undefined; + } + + return undefined; +} + +function resolveJudgeModelId(model: LanguageModel): string { + if (typeof model === "string") { + return model; + } + + if ("modelId" in model && typeof model.modelId === "string" && model.modelId.length > 0) { + return model.modelId; + } + + return "unknown"; +}