/**
 * Responses API — MCP tools + reasoning + structured output via the
 * Anthropic Messages API (`client.beta.messages.create`).
 *
 * The `mcp-client-2025-11-20` beta header is sent only when at least one MCP
 * server is attached; no interleaved-thinking beta header is sent.
 *
 * @param request - Prompt plus optional MCP servers, reasoning effort,
 *   structured-output schema and max-token cap.
 * @returns The concatenated text of every `text` content block, plus token
 *   usage (counts default to 0 when the response carries no usage).
 */
async respond(request: ResponseRequest): Promise<ResponseResult> {
  const DEFAULT_MAX_TOKENS = 8192;
  // Smallest thinking budget worth sending; also used as the minimum room
  // left for the visible answer when the budget has to be clamped.
  const MIN_THINKING_BUDGET = 1024;
  // Tokens reserved for the answer on top of the thinking budget when the
  // caller did not specify a cap.
  const ANSWER_RESERVE = 4096;

  const client = await this.getClient();

  const mcpServers = (request.mcpServers ?? []).map((mcp) => {
    // HTTP header names are case-insensitive, and the shared MCP config
    // carries a full header value (e.g. "Bearer abc") while
    // `authorization_token` is the bare token. Omit the field entirely when
    // no Authorization header was supplied instead of sending `undefined`.
    const authHeader = Object.entries(mcp.headers ?? {}).find(
      ([key]) => key.toLowerCase() === "authorization",
    );
    const token = authHeader?.[1].replace(/^Bearer\s+/i, "").trim();

    return {
      type: "url",
      url: mcp.server_url,
      name: mcp.server_label,
      ...(token ? { authorization_token: token } : {}),
      // NOTE(review): confirm `tool_configuration` is still the accepted
      // shape under the mcp-client-2025-11-20 beta — TODO verify against
      // the MCP connector docs.
      ...(mcp.allowed_tools
        ? { tool_configuration: { allowed_tools: mcp.allowed_tools } }
        : {}),
    };
  });

  // Effort → thinking budget; anything other than high/medium maps to 4000.
  let thinkingBudget: number | undefined;
  if (request.reasoningEffort) {
    thinkingBudget =
      request.reasoningEffort === "high"
        ? 16000
        : request.reasoningEffort === "medium"
          ? 8000
          : 4000;
  }

  // `budget_tokens` must be strictly smaller than `max_tokens`, otherwise the
  // API rejects the request. Without an explicit cap, grow the default so the
  // budget fits; with an explicit cap, respect it and shrink (or drop) the
  // thinking budget instead.
  let maxTokens = request.maxTokens ?? DEFAULT_MAX_TOKENS;
  if (thinkingBudget !== undefined) {
    if (request.maxTokens === undefined) {
      maxTokens = Math.max(DEFAULT_MAX_TOKENS, thinkingBudget + ANSWER_RESERVE);
    } else if (thinkingBudget >= maxTokens) {
      const clamped = maxTokens - MIN_THINKING_BUDGET;
      thinkingBudget = clamped >= MIN_THINKING_BUDGET ? clamped : undefined;
    }
  }

  const betas: string[] = [];
  if (mcpServers.length) betas.push("mcp-client-2025-11-20");
  // interleaved-thinking-2025-05-14 is deprecated on Claude 4.x — adaptive thinking handles it automatically

  const payload: Record<string, unknown> = {
    model: this.model,
    max_tokens: maxTokens,
    messages: [{ role: "user", content: request.prompt }],
    ...(mcpServers.length ? { mcp_servers: mcpServers } : {}),
    ...(thinkingBudget !== undefined
      ? { thinking: { type: "enabled", budget_tokens: thinkingBudget } }
      : {}),
    // NOTE(review): the nested `json_schema: { name, schema }` object looks
    // like the OpenAI shape — confirm the Messages API accepts it here.
    ...(request.outputSchema
      ? {
          output_config: {
            format: {
              type: "json_schema",
              json_schema: {
                name: request.outputSchema.name,
                schema: request.outputSchema.schema,
              },
            },
          },
        }
      : {}),
  };

  const response = await client.beta.messages.create(payload as any, {
    headers: betas.length ? { "anthropic-beta": betas.join(",") } : {},
  });

  // Only `text` blocks contribute to the result; thinking and tool blocks
  // are skipped.
  let text = "";
  for (const block of response.content ?? []) {
    if ((block as any).type === "text") {
      text += (block as any).text;
    }
  }

  const inputTokens: number = (response.usage as any)?.input_tokens ?? 0;
  const outputTokens: number = (response.usage as any)?.output_tokens ?? 0;

  return {
    text,
    usage: {
      prompt_tokens: inputTokens,
      completion_tokens: outputTokens,
      total_tokens: inputTokens + outputTokens,
    },
  };
}
string; + /** Token usage */ + usage?: { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + }; +} + /** * Base LLM adapter interface */ @@ -95,6 +140,12 @@ export interface LLMAdapter { * Non-streaming chat completion (for debugging/comparison) */ complete?(request: ChatCompletionRequest): Promise; + + /** + * Responses API — MCP tools + reasoning + structured output. + * OpenAI: uses /v1/responses. Anthropic: uses /v1/messages with beta headers. + */ + respond?(request: ResponseRequest): Promise; } /** @@ -754,11 +805,23 @@ export function messageToOpenAIContent( const attachments = message.metadata?.attachments; const content = message.content ?? ""; - // If no image attachments, return simple string - if (!hasImageAttachments(message)) { + // Check for audio parts in content array + const hasAudio = + Array.isArray(message.content) && + (message.content as Array<{ type: string }>).some( + (p) => p.type === "input_audio", + ); + + // If no image attachments and no audio parts, return simple string + if (!hasImageAttachments(message) && !hasAudio) { return content; } + // If content is already an array of parts (e.g. audio + text), pass through directly + if (Array.isArray(message.content)) { + return message.content as unknown as OpenAIContentBlock[]; + } + // Build content blocks array const blocks: OpenAIContentBlock[] = []; diff --git a/packages/llm-sdk/src/adapters/google.ts b/packages/llm-sdk/src/adapters/google.ts index 373a93f..c563788 100644 --- a/packages/llm-sdk/src/adapters/google.ts +++ b/packages/llm-sdk/src/adapters/google.ts @@ -174,8 +174,34 @@ function messageToGeminiContent(msg: Message): GeminiContent | null { return { role: "user", parts }; } - // Add text content - if (msg.content) { + // Handle content as array of parts (e.g. 
/**
 * Responses API — MCP tools + reasoning + structured output via OpenAI
 * `/v1/responses` (`client.responses.create`).
 *
 * The prompt is sent as a single `developer` message and the request is made
 * with `store: false`. MCP servers default to `require_approval: "never"`.
 *
 * @param request - Prompt plus optional MCP servers, reasoning effort and
 *   structured-output schema.
 * @returns The concatenation of every `output_text` part across all
 *   `message` output items (empty string when there is none), and token
 *   usage when the response reports it.
 */
async respond(request: ResponseRequest): Promise<ResponseResult> {
  const client = await this.getClient();

  const tools: Array<Record<string, unknown>> = (
    request.mcpServers ?? []
  ).map((mcp) => ({
    type: "mcp",
    server_label: mcp.server_label,
    server_url: mcp.server_url,
    ...(mcp.headers ? { headers: mcp.headers } : {}),
    ...(mcp.allowed_tools ? { allowed_tools: mcp.allowed_tools } : {}),
    require_approval: mcp.require_approval ?? "never",
  }));

  const payload: Record<string, unknown> = {
    model: this.model,
    input: [
      {
        role: "developer",
        content: [{ type: "input_text", text: request.prompt }],
      },
    ],
    ...(tools.length ? { tools } : {}),
    ...(request.reasoningEffort
      ? { reasoning: { effort: request.reasoningEffort, summary: "auto" } }
      : {}),
    ...(request.outputSchema
      ? {
          text: {
            format: {
              type: "json_schema",
              name: request.outputSchema.name,
              schema: request.outputSchema.schema,
              strict: true,
            },
          },
        }
      : {}),
    store: false,
  };

  const response = await client.responses.create(payload);

  const output: Array<{
    type: string;
    content?: Array<{ type: string; text?: string }>;
  }> = response.output ?? [];

  // A response can hold several `message` items (e.g. around MCP tool calls)
  // and each message several `output_text` parts. Join them all in order so
  // no generated text is dropped.
  const text = output
    .filter((item) => item.type === "message")
    .flatMap((item) => item.content ?? [])
    .filter((part) => part.type === "output_text")
    .map((part) => part.text ?? "")
    .join("");

  let usage: ResponseResult["usage"];
  if (response.usage) {
    const promptTokens: number = response.usage.input_tokens ?? 0;
    const completionTokens: number = response.usage.output_tokens ?? 0;
    usage = {
      prompt_tokens: promptTokens,
      completion_tokens: completionTokens,
      // Prefer the API-reported total; fall back to the sum when absent.
      total_tokens:
        response.usage.total_tokens ?? promptTokens + completionTokens,
    };
  }

  return { text, usage };
}
/**
 * Responses API — MCP tools + reasoning + structured output.
 *
 * Resolves the adapter from the runtime config — either `config.adapter`
 * directly, or `config.provider.languageModel(config.model)` — and forwards
 * the request to that adapter's `respond()` method.
 *
 * @param request - The response request to forward unchanged.
 * @returns Whatever the resolved adapter's `respond()` resolves to.
 * @throws Error when no adapter can be resolved or the resolved adapter does
 *   not implement `respond()`.
 *
 * @example
 * ```typescript
 * const result = await runtime.response({
 *   prompt: "Generate FAQs from the knowledge base",
 *   mcpServers: [{ type: "mcp", server_label: "kb", server_url: "https://..." }],
 *   reasoningEffort: "high",
 *   outputSchema: { name: "faqs", schema: { ... } },
 * });
 * console.log(result.text);
 * ```
 */
async response(request: ResponseRequest): Promise<ResponseResult> {
  const config = this.config;

  // Two config shapes are supported: one carrying an adapter instance, one
  // carrying a provider from which the adapter is built for the model.
  const adapter: LLMAdapter | undefined =
    "adapter" in config
      ? config.adapter
      : "provider" in config
        ? (config.provider as any).languageModel?.(config.model)
        : undefined;

  if (adapter && typeof adapter.respond === "function") {
    return adapter.respond(request);
  }

  throw new Error(
    `[llm-sdk] runtime.response() is not supported by the current adapter (${adapter?.provider ?? "unknown"}). ` +
      "Only OpenAI and Anthropic adapters implement respond().",
  );
}