Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .changeset/eval-scorer-cost-telemetry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
"@voltagent/core": patch
---

fix: emit LLM judge token and provider cost telemetry on eval scorer spans

VoltAgent now records LLM judge model, token usage, cached tokens, reasoning tokens,
and provider-reported cost details on live eval scorer spans.

This makes scorer-side usage visible in observability backends and enables downstream
cost aggregation to distinguish agent costs from eval scorer costs.
138 changes: 138 additions & 0 deletions packages/core/src/agent/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,21 @@ interface ScoreMetrics {
datasetMetadata?: ReturnType<typeof extractDatasetMetadataFromCombinedMetadata>;
}

/**
 * Telemetry about the LLM judge behind a scorer, as read from the scorer's
 * combined metadata by `extractJudgeTelemetry` and copied onto the scorer span
 * attributes. Every field is optional; absent fields are not emitted.
 */
interface JudgeTelemetry {
/** Judge model identifier; emitted as `ai.model.name`. */
modelName?: string;
/** Emitted as `usage.prompt_tokens`. */
promptTokens?: number;
/** Emitted as `usage.completion_tokens`. */
completionTokens?: number;
/** Emitted as `usage.total_tokens`. */
totalTokens?: number;
/** Read from `usage.cachedInputTokens`, falling back to `usage.cachedTokens`; emitted as `usage.cached_tokens`. */
cachedTokens?: number;
/** Emitted as `usage.reasoning_tokens`. */
reasoningTokens?: number;
/** Provider-reported cost figures; emitted under `usage.cost` and `usage.cost_details.*`. */
providerCost?: {
/** Emitted as `usage.cost`. */
cost?: number;
/** Emitted as `usage.cost_details.upstream_inference_cost`. */
upstreamInferenceCost?: number;
/** Emitted as `usage.cost_details.upstream_inference_input_cost`. */
upstreamInferenceInputCost?: number;
/** Emitted as `usage.cost_details.upstream_inference_output_cost`. */
upstreamInferenceOutputCost?: number;
};
}

async function resolveScorerDescriptors(
config: AgentEvalConfig,
host: AgentEvalHost,
Expand Down Expand Up @@ -168,6 +183,46 @@ function createScorerSpanAttributes(
if (metrics.datasetMetadata?.datasetItemHash) {
attributes["eval.dataset.item_hash"] = metrics.datasetMetadata.datasetItemHash;
}
const judgeTelemetry = extractJudgeTelemetry(metrics.combinedMetadata);
if (judgeTelemetry?.modelName) {
attributes["ai.model.name"] = judgeTelemetry.modelName;
const provider = judgeTelemetry.modelName.includes("/")
? judgeTelemetry.modelName.split("/")[0]
: undefined;
if (provider) {
attributes["ai.model.provider"] = provider;
}
}
if (judgeTelemetry?.promptTokens !== undefined) {
attributes["usage.prompt_tokens"] = judgeTelemetry.promptTokens;
}
if (judgeTelemetry?.completionTokens !== undefined) {
attributes["usage.completion_tokens"] = judgeTelemetry.completionTokens;
}
if (judgeTelemetry?.totalTokens !== undefined) {
attributes["usage.total_tokens"] = judgeTelemetry.totalTokens;
}
if (judgeTelemetry?.cachedTokens !== undefined) {
attributes["usage.cached_tokens"] = judgeTelemetry.cachedTokens;
}
if (judgeTelemetry?.reasoningTokens !== undefined) {
attributes["usage.reasoning_tokens"] = judgeTelemetry.reasoningTokens;
}
if (judgeTelemetry?.providerCost?.cost !== undefined) {
attributes["usage.cost"] = judgeTelemetry.providerCost.cost;
}
if (judgeTelemetry?.providerCost?.upstreamInferenceCost !== undefined) {
attributes["usage.cost_details.upstream_inference_cost"] =
judgeTelemetry.providerCost.upstreamInferenceCost;
}
if (judgeTelemetry?.providerCost?.upstreamInferenceInputCost !== undefined) {
attributes["usage.cost_details.upstream_inference_input_cost"] =
judgeTelemetry.providerCost.upstreamInferenceInputCost;
}
if (judgeTelemetry?.providerCost?.upstreamInferenceOutputCost !== undefined) {
attributes["usage.cost_details.upstream_inference_output_cost"] =
judgeTelemetry.providerCost.upstreamInferenceOutputCost;
}
if (storagePayload.userId) {
attributes["user.id"] = storagePayload.userId;
}
Expand Down Expand Up @@ -1353,6 +1408,89 @@ function extractErrorMessage(error: unknown): string {
}
}

/**
 * Locates LLM-judge telemetry inside a scorer's combined metadata.
 *
 * The `voltAgent`, `scorer` and `payload` entries of the metadata are probed in
 * that order. The first one whose `judge` record yields at least one usable
 * value (model name, token count, or provider cost figure) wins.
 *
 * @param metadata - Combined scorer metadata; may be null or undefined.
 * @returns The normalized telemetry, or `undefined` when no source carries any.
 */
function extractJudgeTelemetry(
  metadata: Record<string, unknown> | null | undefined,
): JudgeTelemetry | undefined {
  if (!isPlainRecord(metadata)) {
    return undefined;
  }
  const root = metadata as Record<string, unknown>;

  // Probe order matters: the first container with usable judge data is returned.
  const candidates: unknown[] = [root.voltAgent, root.scorer, root.payload];

  for (const candidate of candidates) {
    if (!isPlainRecord(candidate)) {
      continue;
    }
    const judgeValue = (candidate as Record<string, unknown>).judge;
    if (!isPlainRecord(judgeValue)) {
      continue;
    }
    const judge = judgeValue as Record<string, unknown>;

    // A missing or malformed `usage` behaves like an empty record: every read is undefined.
    const tokenUsage: Record<string, unknown> = isPlainRecord(judge.usage)
      ? (judge.usage as Record<string, unknown>)
      : {};

    let providerCost: JudgeTelemetry["providerCost"];
    if (isPlainRecord(judge.providerCost)) {
      const rawCost = judge.providerCost as Record<string, unknown>;
      providerCost = {
        cost: readNumber(rawCost.cost),
        upstreamInferenceCost: readNumber(rawCost.upstreamInferenceCost),
        upstreamInferenceInputCost: readNumber(rawCost.upstreamInferenceInputCost),
        upstreamInferenceOutputCost: readNumber(rawCost.upstreamInferenceOutputCost),
      };
    }

    const telemetry: JudgeTelemetry = {
      modelName: readString(judge.model),
      promptTokens: readNumber(tokenUsage.promptTokens),
      completionTokens: readNumber(tokenUsage.completionTokens),
      totalTokens: readNumber(tokenUsage.totalTokens),
      cachedTokens: readNumber(tokenUsage.cachedInputTokens ?? tokenUsage.cachedTokens),
      reasoningTokens: readNumber(tokenUsage.reasoningTokens),
      providerCost,
    };

    const numericFields = [
      telemetry.promptTokens,
      telemetry.completionTokens,
      telemetry.totalTokens,
      telemetry.cachedTokens,
      telemetry.reasoningTokens,
      telemetry.providerCost?.cost,
      telemetry.providerCost?.upstreamInferenceCost,
      telemetry.providerCost?.upstreamInferenceInputCost,
      telemetry.providerCost?.upstreamInferenceOutputCost,
    ];

    // Skip sources whose judge record carries nothing usable and try the next one.
    if (telemetry.modelName || numericFields.some((field) => field !== undefined)) {
      return telemetry;
    }
  }

  return undefined;
}

/**
 * Narrows an unknown value to a non-empty string.
 *
 * @param value - Any value.
 * @returns `value` itself when it is a string with at least one character, otherwise `undefined`.
 */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string" || value.length === 0) {
    return undefined;
  }
  return value;
}

/**
 * Coerces an unknown value into a finite number.
 *
 * Accepts finite numbers as-is and numeric strings (e.g. `"12"`, `"0.5"`).
 * Blank strings are rejected explicitly: `Number("")` and `Number("   ")` both
 * evaluate to `0`, which would otherwise report a missing value as a real
 * zero token count or zero cost.
 *
 * @param value - Any value.
 * @returns The finite number, or `undefined` when the value is not numeric.
 */
function readNumber(value: unknown): number | undefined {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : undefined;
  }
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length === 0) {
      // Guard against Number("") === 0.
      return undefined;
    }
    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : undefined;
  }
  return undefined;
}

async function invokeEvalResultCallback(
host: AgentEvalHost,
config: AgentEvalScorerConfig,
Expand Down
138 changes: 137 additions & 1 deletion packages/core/src/eval/llm/create-judge-scorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,19 @@ import { safeStringify } from "@voltagent/internal/utils";
import type { LanguageModel } from "ai";
import { generateText } from "ai";

import { convertUsage } from "../../utils/usage-converter";
import type { LocalScorerDefinition } from "../runtime";

type DefaultPayload = Record<string, unknown>;

/**
 * Cost figures extracted from `providerMetadata.openrouter.usage` by
 * `extractOpenRouterUsageCost`. Each field is set only when the corresponding
 * metadata value is present and valid.
 */
type OpenRouterUsageCost = {
/** Read from `usage.cost`. */
cost?: number;
/** Read from `usage.isByok` when it is a boolean. NOTE(review): presumably "bring your own key" — confirm against provider docs. */
isByok?: boolean;
/** Read from `usage.costDetails.upstreamInferenceCost`. */
upstreamInferenceCost?: number;
/** Read from `usage.costDetails.upstreamInferenceInputCost`. */
upstreamInferenceInputCost?: number;
/** Read from `usage.costDetails.upstreamInferenceOutputCost`. */
upstreamInferenceOutputCost?: number;
};

export interface LlmJudgeScorerParams extends Record<string, unknown> {
/** Optional criteria appended to the default judging instructions. */
criteria?: string;
Expand Down Expand Up @@ -49,13 +58,16 @@ export function createLLMJudgeScorer<Payload extends DefaultPayload = DefaultPay
const criteria = params.criteria ? params.criteria.trim() : "";

const prompt = buildPrompt({ instructions, criteria, question, answer });
const judgeModel = resolveJudgeModelId(model);

try {
const { text } = await generateText({
const { text, usage, providerMetadata } = await generateText({
model,
prompt,
maxOutputTokens,
});
const normalizedUsage = convertUsage(usage);
const providerCost = extractOpenRouterUsageCost(providerMetadata);

const parsed = parseJudgeResponse(text);
if (!parsed) {
Expand All @@ -66,6 +78,11 @@ export function createLLMJudgeScorer<Payload extends DefaultPayload = DefaultPay
raw: text.trim(),
voltAgent: {
scorer: scorerId,
judge: buildJudgeTelemetry({
judgeModel,
usage: normalizedUsage,
providerCost,
}),
},
},
error: new Error("Judge response was not valid JSON"),
Expand All @@ -80,6 +97,11 @@ export function createLLMJudgeScorer<Payload extends DefaultPayload = DefaultPay
raw: text.trim(),
voltAgent: {
scorer: scorerId,
judge: buildJudgeTelemetry({
judgeModel,
usage: normalizedUsage,
providerCost,
}),
},
},
};
Expand All @@ -90,6 +112,11 @@ export function createLLMJudgeScorer<Payload extends DefaultPayload = DefaultPay
metadata: {
voltAgent: {
scorer: scorerId,
judge: buildJudgeTelemetry({
judgeModel,
usage: extractUsageFromError(error),
providerCost: extractProviderCostFromError(error),
}),
},
},
error,
Expand Down Expand Up @@ -178,3 +205,112 @@ function stringify(value: unknown): string {
return String(value);
}
}

/**
 * Assembles the `judge` metadata record attached to a scorer result.
 *
 * The model id is always included; `usage` and `providerCost` are added only
 * when they are truthy, so absent data produces no key at all.
 *
 * @param input - Judge model id plus optional normalized usage and provider cost.
 * @returns A record with `model` and, when available, `usage` and `providerCost`.
 */
function buildJudgeTelemetry(input: {
  judgeModel: string;
  usage?: ReturnType<typeof convertUsage>;
  providerCost?: OpenRouterUsageCost;
}): Record<string, unknown> {
  const { judgeModel, usage, providerCost } = input;

  return {
    model: judgeModel,
    ...(usage ? { usage } : {}),
    ...(providerCost ? { providerCost } : {}),
  };
}

function extractOpenRouterUsageCost(providerMetadata: unknown): OpenRouterUsageCost | undefined {
if (!isRecord(providerMetadata)) {
return undefined;
}

const openRouterMetadata = isRecord(providerMetadata.openrouter)
? providerMetadata.openrouter
: undefined;
const usage = isRecord(openRouterMetadata?.usage) ? openRouterMetadata.usage : undefined;
const costDetails = isRecord(usage?.costDetails) ? usage.costDetails : undefined;
Copy link
Copy Markdown
Contributor

@cubic-dev-ai cubic-dev-ai bot Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: OpenRouter judge telemetry parsing is incomplete and can miss provider cost fields when metadata uses snake_case keys.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At packages/core/src/eval/llm/create-judge-scorer.ts, line 238:

<comment>OpenRouter judge telemetry parsing is incomplete and can miss provider cost fields when metadata uses snake_case keys.</comment>

<file context>
@@ -178,3 +205,112 @@ function stringify(value: unknown): string {
+    ? providerMetadata.openrouter
+    : undefined;
+  const usage = isRecord(openRouterMetadata?.usage) ? openRouterMetadata.usage : undefined;
+  const costDetails = isRecord(usage?.costDetails) ? usage.costDetails : undefined;
+
+  if (!usage) {
</file context>
Fix with Cubic


if (!usage) {
return undefined;
}

const result: OpenRouterUsageCost = {};
const cost = toFiniteNumber(usage.cost);
if (cost !== undefined) {
result.cost = cost;
}

if (typeof usage.isByok === "boolean") {
result.isByok = usage.isByok;
}

const upstreamInferenceCost = toFiniteNumber(costDetails?.upstreamInferenceCost);
if (upstreamInferenceCost !== undefined) {
result.upstreamInferenceCost = upstreamInferenceCost;
}

const upstreamInferenceInputCost = toFiniteNumber(costDetails?.upstreamInferenceInputCost);
if (upstreamInferenceInputCost !== undefined) {
result.upstreamInferenceInputCost = upstreamInferenceInputCost;
}

const upstreamInferenceOutputCost = toFiniteNumber(costDetails?.upstreamInferenceOutputCost);
if (upstreamInferenceOutputCost !== undefined) {
result.upstreamInferenceOutputCost = upstreamInferenceOutputCost;
}

return Object.keys(result).length > 0 ? result : undefined;
}

/**
 * Recovers token usage from a thrown error, when the error object carries a
 * truthy `usage` property.
 *
 * @param error - The caught value; anything that is not a record is ignored.
 * @returns The usage normalized by `convertUsage`, or `undefined` when unavailable.
 */
function extractUsageFromError(error: unknown): ReturnType<typeof convertUsage> | undefined {
  const rawUsage = isRecord(error) ? error.usage : undefined;
  if (!rawUsage) {
    return undefined;
  }

  // The error payload is untyped, so the cast defers shape handling to convertUsage.
  return convertUsage(rawUsage as any);
}

/**
 * Recovers OpenRouter cost details from a thrown error by inspecting its
 * `providerMetadata` property.
 *
 * @param error - The caught value; anything that is not a record is ignored.
 * @returns The extracted cost details, or `undefined` when none are present.
 */
function extractProviderCostFromError(error: unknown): OpenRouterUsageCost | undefined {
  return isRecord(error) ? extractOpenRouterUsageCost(error.providerMetadata) : undefined;
}

/**
 * Type guard for non-null, non-array objects.
 *
 * @param value - Any value.
 * @returns `true` when `value` has type "object", is not `null`, and is not an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}

/**
 * Coerces an unknown value into a finite number.
 *
 * Accepts finite numbers as-is and numeric strings. Blank strings are rejected
 * explicitly: `Number("")` and `Number("   ")` both evaluate to `0`, which
 * would otherwise turn a missing cost value into a reported cost of zero.
 *
 * @param value - Any value.
 * @returns The finite number, or `undefined` when the value is not numeric.
 */
function toFiniteNumber(value: unknown): number | undefined {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : undefined;
  }

  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length === 0) {
      // Guard against Number("") === 0.
      return undefined;
    }
    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : undefined;
  }

  return undefined;
}

/**
 * Derives a printable identifier for the judge model.
 *
 * @param model - Either a model id string or a model object.
 * @returns The string itself, the object's non-empty string `modelId`, or
 *   `"unknown"` when neither is available.
 */
function resolveJudgeModelId(model: LanguageModel): string {
  if (typeof model === "string") {
    return model;
  }

  if (!("modelId" in model)) {
    return "unknown";
  }

  const candidate = model.modelId;
  return typeof candidate === "string" && candidate.length > 0 ? candidate : "unknown";
}
Loading