From 6fb3bb2b861d93dcec79c73ea2abadc2971ad206 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 19:03:09 +0100 Subject: [PATCH 01/34] feat(voice): pluggable voice backend with Gemini Live & Qwen Realtime Rebased from Overbaker/hapi#401 onto current main. Adds a pluggable voice backend architecture that extends the existing ElevenLabs integration: - Gemini 2.5 Live (gemini-live): Google real-time audio via WebSocket with full function calling (messageCodingAgent, processPermissionRequest) - Qwen Realtime (qwen-realtime): Alibaba DashScope via hub WebSocket proxy (browser cannot set Authorization header directly) - VoiceBackendSession: dynamic backend selector with React.lazy loading, gates voice button until backend module is registered - Hub WS proxies: JWT-authenticated /api/voice/gemini-ws and /api/voice/qwen-ws endpoints in Bun.serve, with message queueing during upstream connect to prevent dropped setup frames - AudioWorklet pipeline: inline Blob URL recorder, 24 kHz PCM player, serial tool call execution, AudioContext created in user gesture for mobile - Backend discovery: GET /voice/backend + POST /voice/gemini-token / POST /voice/qwen-token hub routes; frontend auto-detects active backend Merge notes: - Rebased 135 upstream commits cleanly; HappyComposer keeps upstream's configurable enter-behavior setting (supersedes hard-coded Ctrl+Enter) - Converted gemini test files from bun:test to vitest (web package uses vitest) - All 221 hub tests and 636 web tests pass; TypeScript clean --- hub/src/socket/server.ts | 1 + hub/src/web/routes/voice.test.ts | 154 ++++++- hub/src/web/routes/voice.ts | 63 ++- hub/src/web/server.ts | 205 ++++++++- shared/src/voice.ts | 104 ++++- web/src/api/client.ts | 33 ++ web/src/api/voice.ts | 64 +++ web/src/components/SessionChat.tsx | 12 +- web/src/realtime/GeminiLiveVoiceSession.tsx | 409 +++++++++++++++++ web/src/realtime/QwenVoiceSession.tsx | 417 ++++++++++++++++++ 
web/src/realtime/RealtimeVoiceSession.tsx | 3 + web/src/realtime/VoiceBackendSession.tsx | 63 +++ web/src/realtime/gemini/audioPlayer.ts | 75 ++++ web/src/realtime/gemini/audioRecorder.ts | 139 ++++++ .../realtime/gemini/pcm-recorder.worklet.ts | 35 ++ web/src/realtime/gemini/pcmUtils.test.ts | 60 +++ web/src/realtime/gemini/pcmUtils.ts | 39 ++ web/src/realtime/gemini/toolAdapter.test.ts | 28 ++ web/src/realtime/gemini/toolAdapter.ts | 76 ++++ web/src/realtime/index.ts | 5 +- web/tsconfig.json | 3 +- 21 files changed, 1971 insertions(+), 17 deletions(-) create mode 100644 web/src/realtime/GeminiLiveVoiceSession.tsx create mode 100644 web/src/realtime/QwenVoiceSession.tsx create mode 100644 web/src/realtime/VoiceBackendSession.tsx create mode 100644 web/src/realtime/gemini/audioPlayer.ts create mode 100644 web/src/realtime/gemini/audioRecorder.ts create mode 100644 web/src/realtime/gemini/pcm-recorder.worklet.ts create mode 100644 web/src/realtime/gemini/pcmUtils.test.ts create mode 100644 web/src/realtime/gemini/pcmUtils.ts create mode 100644 web/src/realtime/gemini/toolAdapter.test.ts create mode 100644 web/src/realtime/gemini/toolAdapter.ts diff --git a/hub/src/socket/server.ts b/hub/src/socket/server.ts index af7533e5c8..6db62990f7 100644 --- a/hub/src/socket/server.ts +++ b/hub/src/socket/server.ts @@ -67,6 +67,7 @@ export function createSocketServer(deps: SocketServerDeps): { const engine = new Engine({ path: '/socket.io/', cors: corsOptions, + maxHttpBufferSize: 55 * 1024 * 1024, // 55MB to match upload limit allowRequest: async (req) => { const origin = req.headers.get('origin') if (!origin || allowAllOrigins || corsOrigins.includes(origin)) { diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts index 15249c7239..9cc3f42327 100644 --- a/hub/src/web/routes/voice.test.ts +++ b/hub/src/web/routes/voice.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, mock } from 'bun:test' +import { describe, expect, it, mock, test, afterEach 
} from 'bun:test' import { Hono } from 'hono' import { SignJWT } from 'jose' import type { WebAppEnv } from '../middleware/auth' @@ -188,3 +188,155 @@ describe('POST /api/voice/token', () => { else delete process.env.ELEVENLABS_AGENT_ID }) }) + +describe('GET /api/voice/backend', () => { + const originalEnv = process.env.VOICE_BACKEND + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env.VOICE_BACKEND + } else { + process.env.VOICE_BACKEND = originalEnv + } + }) + + test('returns elevenlabs by default', async () => { + delete process.env.VOICE_BACKEND + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) + + test('returns gemini-live when configured', async () => { + process.env.VOICE_BACKEND = 'gemini-live' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('gemini-live') + }) + + test('returns qwen-realtime when configured', async () => { + process.env.VOICE_BACKEND = 'qwen-realtime' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('qwen-realtime') + }) + + test('falls back to elevenlabs for unknown values', async () => { + process.env.VOICE_BACKEND = 'unknown-backend' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) +}) + +describe('POST 
/api/voice/gemini-token', () => { + const origGemini = process.env.GEMINI_API_KEY + const origGoogle = process.env.GOOGLE_API_KEY + + afterEach(() => { + if (origGemini === undefined) delete process.env.GEMINI_API_KEY + else process.env.GEMINI_API_KEY = origGemini + if (origGoogle === undefined) delete process.env.GOOGLE_API_KEY + else process.env.GOOGLE_API_KEY = origGoogle + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.GEMINI_API_KEY + delete process.env.GOOGLE_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns proxied wsUrl when GEMINI_API_KEY is set', async () => { + process.env.GEMINI_API_KEY = 'test-gemini-key' + delete process.env.GOOGLE_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') + }) + + test('falls back to GOOGLE_API_KEY', async () => { + delete process.env.GEMINI_API_KEY + process.env.GOOGLE_API_KEY = 'test-google-key' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') + }) +}) + +describe('POST 
/api/voice/qwen-token', () => { + const origDash = process.env.DASHSCOPE_API_KEY + const origQwen = process.env.QWEN_API_KEY + + afterEach(() => { + if (origDash === undefined) delete process.env.DASHSCOPE_API_KEY + else process.env.DASHSCOPE_API_KEY = origDash + if (origQwen === undefined) delete process.env.QWEN_API_KEY + else process.env.QWEN_API_KEY = origQwen + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.DASHSCOPE_API_KEY + delete process.env.QWEN_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns wsUrl when DASHSCOPE_API_KEY is set (no raw key exposed)', async () => { + process.env.DASHSCOPE_API_KEY = 'test-dash-key' + delete process.env.QWEN_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') + }) + + test('falls back to QWEN_API_KEY', async () => { + delete process.env.DASHSCOPE_API_KEY + process.env.QWEN_API_KEY = 'test-qwen-key' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') + }) +}) diff --git a/hub/src/web/routes/voice.ts 
b/hub/src/web/routes/voice.ts index 091b9c2ac5..e8920dbb36 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -4,8 +4,10 @@ import type { WebAppEnv } from '../middleware/auth' import { ELEVENLABS_API_BASE, VOICE_AGENT_NAME, - buildVoiceAgentConfig + buildVoiceAgentConfig, + DEFAULT_VOICE_BACKEND } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), @@ -166,6 +168,65 @@ async function getOrCreateAgentIdForVoice(apiKey: string, voiceId?: string): Pro export function createVoiceRoutes(): Hono { const app = new Hono() + // Return the configured voice backend type + app.get('/voice/backend', (c) => { + const raw = process.env.VOICE_BACKEND + const backend: VoiceBackendType = + raw === 'gemini-live' ? 'gemini-live' + : raw === 'qwen-realtime' ? 'qwen-realtime' + : DEFAULT_VOICE_BACKEND + return c.json({ backend }) + }) + + // Get Gemini API key for Gemini Live voice sessions + // Gemini Live API does not support ephemeral tokens, so we proxy the key. + // The key is short-lived in the browser session and never persisted client-side. + app.post('/voice/gemini-token', async (c) => { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)' + }, 400) + } + + // Use server-side WS proxy to avoid region restrictions. + // The proxy at /api/voice/gemini-ws handles the API key server-side. + // Derive wsUrl from the request origin so remote browsers connect back to the hub, + // not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy). 
+ const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin + const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws' + + return c.json({ + allowed: true, + apiKey: 'proxied', // Dummy — key is handled server-side + wsUrl: wsProxyUrl, // Always proxy — env WS URLs are upstream-only (server-side) + baseUrl: process.env.GEMINI_API_BASE || undefined + }) + }) + + // Check Qwen (DashScope) availability for Qwen Realtime voice sessions + // The actual API key is never sent to the browser — it stays server-side in the WS proxy. + app.post('/voice/qwen-token', async (c) => { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)' + }, 400) + } + + const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin + const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws' + + return c.json({ + allowed: true, + wsUrl: wsProxyUrl // Always proxy — env WS URLs are upstream-only (server-side) + }) + }) + // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { const requestId = crypto.randomUUID() diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index e18f3ddc62..9a4643c924 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -21,13 +21,125 @@ import { createPushRoutes } from './routes/push' import { createVoiceRoutes } from './routes/voice' import type { SSEManager } from '../sse/sseManager' import type { VisibilityTracker } from '../visibility/visibilityTracker' -import type { Server as BunServer } from 'bun' +import type { Server as BunServer, ServerWebSocket } from 'bun' import type { Server as SocketEngine } from '@socket.io/bun-engine' +import { jwtVerify } from 'jose' import type { WebSocketData } from '@socket.io/bun-engine' 
import { loadEmbeddedAssetMap, type EmbeddedWebAsset } from './embeddedAssets' import { isBunCompiled } from '../utils/bunCompiled' import type { Store } from '../store' +// Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions +function createGeminiProxyWebSocketHandler() { + const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' + const upstreamMap = new WeakMap, WebSocket>() + const pendingMap = new WeakMap, Array>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { _geminiProxy: boolean; apiKey: string } + const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` + const pending: Array = [] + pendingMap.set(clientWs, pending) + + const upstream = new WebSocket(upstreamUrl) + upstreamMap.set(clientWs, upstream) + + upstream.onopen = () => { + // Flush any messages queued while upstream was connecting (e.g. setup frame) + for (const queued of pending.splice(0)) { + upstream.send(typeof queued === 'string' ? queued : queued) + } + pendingMap.delete(clientWs) + } + upstream.onmessage = (event) => { + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer)) + } + } catch { /* client gone */ } + } + upstream.onerror = () => { + pendingMap.delete(clientWs) + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + pendingMap.delete(clientWs) + try { clientWs.close(event.code, event.reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + const upstream = upstreamMap.get(clientWs) + if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(typeof message === 'string' ? 
message : message) + } else if (upstream?.readyState === WebSocket.CONNECTING) { + // Queue messages until upstream opens (critical for the setup frame) + const pending = pendingMap.get(clientWs) + if (pending) pending.push(message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + const upstream = upstreamMap.get(clientWs) + pendingMap.delete(clientWs) + if (upstream) { + try { upstream.close(code, reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} + +// Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header) +function createQwenProxyWebSocketHandler() { + const QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' + const upstreamMap = new WeakMap, WebSocket>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { apiKey: string; model: string } + const upstreamUrl = `${process.env.QWEN_REALTIME_WS_URL || QWEN_WS_BASE}?model=${encodeURIComponent(data.model)}` + + const upstream = new WebSocket(upstreamUrl, { + headers: { 'Authorization': `Bearer ${data.apiKey}` } + } as unknown as string[]) + + upstreamMap.set(clientWs, upstream) + + upstream.onopen = () => { + // Connection ready — upstream will send session.created + } + upstream.onmessage = (event) => { + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer)) + } + } catch { /* client gone */ } + } + upstream.onerror = () => { + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + try { clientWs.close(event.code, event.reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + const upstream = upstreamMap.get(clientWs) + if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(typeof message === 'string' ? 
message : message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + const upstream = upstreamMap.get(clientWs) + if (upstream) { + try { upstream.close(code, reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} + function findWebappDistDir(): { distDir: string; indexHtmlPath: string } { const candidates = [ join(process.cwd(), '..', 'web', 'dist'), @@ -232,17 +344,100 @@ export async function startWebServer(options: { const configuration = getConfiguration() const socketHandler = options.socketEngine.handler() - const server = Bun.serve({ + // Wrap socket.io websocket handler to also support Gemini/Qwen proxy connections + const originalWsHandler = socketHandler.websocket + const geminiProxyHandler = createGeminiProxyWebSocketHandler() + const qwenProxyHandler = createQwenProxyWebSocketHandler() + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const server = (Bun.serve as any)({ hostname: configuration.listenHost, port: configuration.listenPort, idleTimeout: Math.max(30, socketHandler.idleTimeout), maxRequestBodySize: Math.max(socketHandler.maxRequestBodySize, 68 * 1024 * 1024), - websocket: socketHandler.websocket, - fetch: (req, server) => { + websocket: { + ...originalWsHandler, + open(ws: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.open(wsAny) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.open(wsAny) + } else { + originalWsHandler.open?.(ws as never) + } + }, + message(ws: unknown, message: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.message(wsAny, message as string) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.message(wsAny, message as string) + } else { + originalWsHandler.message?.(ws as never, message as never) + } + }, + close(ws: unknown, code: 
number, reason: string) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.close(wsAny, code, reason) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.close(wsAny, code, reason) + } else { + originalWsHandler.close?.(ws as never, code as never, reason as never) + } + } + }, + fetch: async (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { const url = new URL(req.url) if (url.pathname.startsWith('/socket.io/')) { - return socketHandler.fetch(req, server) + return socketHandler.fetch(req, server as never) + } + + // Voice WebSocket proxies — require JWT auth via query param + // (browser WebSocket API cannot set custom headers) + if (url.pathname === '/api/voice/gemini-ws' || url.pathname === '/api/voice/qwen-ws') { + const token = url.searchParams.get('token') + if (!token) { + return new Response('Missing authorization token', { status: 401 }) + } + try { + await jwtVerify(token, options.jwtSecret, { algorithms: ['HS256'] }) + } catch { + return new Response('Invalid token', { status: 401 }) + } } + + // Gemini Live WebSocket proxy + if (url.pathname === '/api/voice/gemini-ws') { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return new Response('Gemini API key not configured', { status: 400 }) + } + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { + data: { _geminiProxy: true, apiKey } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response + } + // Qwen Realtime WebSocket proxy + if (url.pathname === '/api/voice/qwen-ws') { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + const model = url.searchParams.get('model') || 'qwen3.5-omni-plus-realtime' + if (!apiKey) { + return new Response('DashScope API key not 
configured', { status: 400 }) + } + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { + data: { _qwenProxy: true, apiKey, model } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response + } + return app.fetch(req) } }) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 2843d84eb4..51cd681112 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -8,7 +8,11 @@ export const ELEVENLABS_API_BASE = 'https://api.elevenlabs.io/v1' export const VOICE_AGENT_NAME = 'Hapi Voice Assistant' -export const VOICE_SYSTEM_PROMPT = `# Identity +export const VOICE_SYSTEM_PROMPT = `# CRITICAL RULE - Tool Usage + +You MUST call the messageCodingAgent tool for ANY request related to coding, files, development, debugging, or tasks for the agent. Do NOT respond verbally to these requests — call the tool FIRST, then briefly confirm. This is your most important behavior. + +# Identity You are Hapi Voice Assistant. You bridge voice communication between users and their AI coding agents in the Hapi ecosystem. @@ -136,9 +140,28 @@ For builds, tests, or large file operations: - Treat garbled input as phonetic hints and ask for clarification - Correct yourself immediately if you realize you made an error - Keep conversations forward-moving with fresh insights -- Assume a technical software developer audience` +- Assume a technical software developer audience + +# First Interaction + +When the user speaks to you for the first time, begin your response with a brief greeting before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` + +/** + * Additional language block appended to VOICE_SYSTEM_PROMPT for Gemini/Qwen + * backends (which don't have a separate language field like ElevenLabs). 
+ */ +export const VOICE_CHINESE_LANGUAGE_BLOCK = ` + +# Language -export const VOICE_FIRST_MESSAGE = "Hey! Hapi here." +IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. +- Greet users in Chinese +- Summarize technical content in Chinese +- Use English only for proper nouns, tool names, and code identifiers +- Keep the same warm, concise conversational style in Chinese` + +/** ElevenLabs first message — language controlled by ElevenLabs language field */ +export const VOICE_FIRST_MESSAGE = "Hey! Hapi here — what can I help you with?" export const VOICE_TOOLS = [ { @@ -261,3 +284,78 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { } } } + +export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' + +export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' +export const QWEN_REALTIME_VOICE = 'Mia' + +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' + +export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' + +export interface VoiceToolDefinition { + name: string + description: string + parameters: { + type: 'object' + required: string[] + properties: Record + } +} + +type VoiceToolSource = Pick<(typeof VOICE_TOOLS)[number], 'name' | 'description' | 'parameters'> + +function cloneVoiceToolDefinition(tool: VoiceToolSource): VoiceToolDefinition { + const properties: VoiceToolDefinition['parameters']['properties'] = {} + + for (const [key, value] of Object.entries(tool.parameters.properties)) { + properties[key] = { + type: value.type, + description: value.description + } + } + + return { + name: tool.name, + description: tool.description, + parameters: { + type: 'object', + required: [...tool.parameters.required], + properties + } + } +} + +export const VOICE_TOOL_DEFINITIONS: VoiceToolDefinition[] = VOICE_TOOLS.map(cloneVoiceToolDefinition) + +export type GeminiLiveFunctionDeclaration = VoiceToolDefinition + +export interface GeminiLiveConfig { + model: string + 
systemInstruction: string + tools: Array<{ + functionDeclarations: GeminiLiveFunctionDeclaration[] + }> + responseModalities: ['AUDIO'] +} + +export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclaration[] { + return VOICE_TOOLS.map(cloneVoiceToolDefinition) +} + +export function buildGeminiLiveConfig(): GeminiLiveConfig { + return { + model: GEMINI_LIVE_MODEL, + systemInstruction: VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK, + tools: [ + { + functionDeclarations: buildGeminiLiveFunctionDeclarations() + } + ], + responseModalities: ['AUDIO'] + } +} diff --git a/web/src/api/client.ts b/web/src/api/client.ts index d8dfa1bb93..0e78837722 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -567,4 +567,37 @@ export class ApiClient { body: JSON.stringify(event) }) } + + /** Return the current auth token (for WebSocket query-param auth). */ + getAuthToken(): string | null { + return this.getToken ? this.getToken() : this.token + } + + async fetchVoiceBackend(): Promise<{ backend: string }> { + return await this.request('/api/voice/backend') + } + + async fetchQwenToken(): Promise<{ + allowed: boolean + wsUrl?: string + error?: string + }> { + return await this.request('/api/voice/qwen-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } + + async fetchGeminiToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string + }> { + return await this.request('/api/voice/gemini-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } } diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index c5e8f27a4d..c6df29e16c 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -15,6 +15,7 @@ import { VOICE_AGENT_NAME, buildVoiceAgentConfig } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' export interface VoiceTokenResponse { allowed: boolean @@ -177,3 +178,66 @@ export async function createOrUpdateHapiAgent(apiKey: string): 
Promise { + try { + return await api.fetchQwenToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} + +export interface VoiceBackendResponse { + backend: VoiceBackendType +} + +export interface GeminiTokenResponse { + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string +} + +/** + * Discover which voice backend the hub is configured to use. + */ +export async function fetchVoiceBackend(api: ApiClient): Promise { + try { + const result = await api.fetchVoiceBackend() + const backend = result.backend === 'gemini-live' ? 'gemini-live' + : result.backend === 'qwen-realtime' ? 'qwen-realtime' + : 'elevenlabs' + return { backend } as VoiceBackendResponse + } catch { + return { backend: 'elevenlabs' } + } +} + +/** + * Fetch a Gemini API key from the hub for Gemini Live voice sessions. + */ +export async function fetchGeminiToken(api: ApiClient): Promise { + try { + return await api.fetchGeminiToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? 
error.message : 'Network error' + } + } +} diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx index 7cd5fb2c2a..39926684b9 100644 --- a/web/src/components/SessionChat.tsx +++ b/web/src/components/SessionChat.tsx @@ -35,7 +35,7 @@ import { useCodexModels } from '@/hooks/queries/useCodexModels' import { useCursorModels } from '@/hooks/queries/useCursorModels' import { useOpencodeModels } from '@/hooks/queries/useOpencodeModels' import { useVoiceOptional } from '@/lib/voice-context' -import { RealtimeVoiceSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' +import { VoiceBackendSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' import { isRemoteTerminalSupported } from '@/utils/terminalSupport' /** @@ -207,6 +207,7 @@ export function SessionChat(props: { // Voice assistant integration const voice = useVoiceOptional() + const [voiceBackendReady, setVoiceBackendReady] = useState(false) // Register session store for voice client tools useEffect(() => { @@ -673,18 +674,19 @@ export function SessionChat(props: { autocompleteSuggestions={props.autocompleteSuggestions} voiceStatus={voice?.status} voiceMicMuted={voice?.micMuted} - onVoiceToggle={voice ? handleVoiceToggle : undefined} - onVoiceMicToggle={voice ? handleVoiceMicToggle : undefined} + onVoiceToggle={voice && voiceBackendReady ? handleVoiceToggle : undefined} + onVoiceMicToggle={voice && voiceBackendReady ? 
handleVoiceMicToggle : undefined} /> - {/* Voice session component - renders nothing but initializes ElevenLabs */} + {/* Voice session component - renders nothing but initializes voice backend */} {voice && ( - )} diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx new file mode 100644 index 0000000000..a0461e092f --- /dev/null +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -0,0 +1,409 @@ +import { useEffect, useRef, useCallback } from 'react' +import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' +import { registerSessionStore } from './realtimeClientTools' +import { fetchGeminiToken } from '@/api/voice' +import { GeminiAudioRecorder } from './gemini/audioRecorder' +import { GeminiAudioPlayer } from './gemini/audioPlayer' +import { handleGeminiFunctionCalls } from './gemini/toolAdapter' +import { buildGeminiLiveConfig } from '@hapi/protocol/voice' +import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' +import type { ApiClient } from '@/api/client' +import type { Session } from '@/types/api' +import type { GeminiFunctionCall } from './gemini/toolAdapter' + +const DEBUG = import.meta.env.DEV + +// Default Gemini Live WebSocket API endpoint (Google direct) +const DEFAULT_GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' + +interface GeminiLiveState { + ws: WebSocket | null + recorder: GeminiAudioRecorder | null + player: GeminiAudioPlayer | null + playbackContext: AudioContext | null + statusCallback: StatusCallback | null + apiKey: string | null + wsBaseUrl: string | null + modelSpeaking: boolean + micMuted: boolean +} + +const state: GeminiLiveState = { + ws: null, + recorder: null, + player: null, + playbackContext: null, + statusCallback: null, + apiKey: null, + wsBaseUrl: null, + modelSpeaking: false, + micMuted: false +} + +function cleanup() { + if 
/**
 * VoiceSession implementation backed by the Gemini Live bidirectional
 * WebSocket API, reached either directly (API key in the query string) or
 * through a proxy URL returned by the hub (auth token in the query string).
 *
 * Live resources (socket, recorder, player, playback context) are kept in the
 * module-level `state` object so that cleanup() and the React component can
 * reach them.
 */
class GeminiLiveVoiceSessionImpl implements VoiceSession {
    private api: ApiClient

    constructor(api: ApiClient) {
        this.api = api
    }

    /**
     * Opens a session: creates the playback context, fetches the token,
     * checks microphone permission, connects the socket and sends `setup`.
     *
     * @param config - session options; `initialContext`, when set, is sent as
     *   a first client message once setup completes.
     * @returns a promise that resolves when the server acknowledges setup.
     * @throws when the token is unavailable, microphone access is denied, or
     *   the socket fails/closes before setup completes.
     */
    async startSession(config: VoiceSessionConfig): Promise<void> {
        cleanup()
        // A previous session may have ended mid-utterance; a stale `true` would
        // make sendAudioChunk() drop every microphone chunk of the new session.
        state.modelSpeaking = false
        state.statusCallback?.('connecting')

        // Create playback AudioContext immediately while still inside the user
        // gesture (click/tap). Mobile browsers require this for autoplay policy.
        // Store in state so cleanup() can close it on failure or stop.
        state.playbackContext = new AudioContext({ sampleRate: 24000 })
        await state.playbackContext.resume()

        // Get API key from hub
        console.log('[GeminiLive] Fetching token...')
        const tokenResp = await fetchGeminiToken(this.api)
        console.log('[GeminiLive] Token response:', { allowed: tokenResp.allowed, hasKey: !!tokenResp.apiKey, error: tokenResp.error })
        if (!tokenResp.allowed || !tokenResp.apiKey) {
            const msg = tokenResp.error ?? 'Gemini API key not available'
            console.error('[GeminiLive] Token failed:', msg)
            // Release the playback context created above instead of leaking it.
            cleanup()
            state.statusCallback?.('error', msg)
            throw new Error(msg)
        }
        state.apiKey = tokenResp.apiKey
        state.wsBaseUrl = tokenResp.wsUrl || null

        // Request microphone (permission probe only; the recorder opens its own stream)
        console.log('[GeminiLive] Requesting microphone...')
        let permissionStream: MediaStream | null = null
        try {
            permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true })
            console.log('[GeminiLive] Microphone granted')
        } catch (error) {
            console.error('[GeminiLive] Microphone denied:', error)
            cleanup()
            state.statusCallback?.('error', 'Microphone permission denied')
            throw error
        } finally {
            permissionStream?.getTracks().forEach((t) => t.stop())
        }

        // Connect WebSocket — use proxy URL if provided (avoids region restrictions)
        const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE
        const isProxy = !!state.wsBaseUrl
        const authToken = this.api.getAuthToken() || ''
        const wsUrl = isProxy
            ? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}`
            : `${wsBase}?key=${encodeURIComponent(state.apiKey)}`
        console.log('[GeminiLive] Connecting WebSocket to:', wsBase, isProxy ? '(proxied)' : '(direct)')
        const ws = new WebSocket(wsUrl)
        state.ws = ws

        return new Promise<void>((resolve, reject) => {
            let setupDone = false

            ws.onopen = () => {
                if (DEBUG) console.log('[GeminiLive] WebSocket connected, sending setup')

                const liveConfig = buildGeminiLiveConfig()
                const setupMessage = {
                    setup: {
                        model: `models/${liveConfig.model}`,
                        generationConfig: {
                            responseModalities: ['AUDIO'],
                            speechConfig: {
                                voiceConfig: {
                                    prebuiltVoiceConfig: { voiceName: 'Aoede' }
                                }
                            }
                        },
                        systemInstruction: {
                            parts: [{ text: liveConfig.systemInstruction }]
                        },
                        tools: liveConfig.tools.map((t) => ({
                            functionDeclarations: t.functionDeclarations.map((fd) => ({
                                name: fd.name,
                                description: fd.description,
                                parameters: fd.parameters
                            }))
                        }))
                    }
                }

                ws.send(JSON.stringify(setupMessage))
            }

            ws.onmessage = async (event) => {
                let data: Record<string, unknown>
                try {
                    if (event.data instanceof Blob) {
                        const text = await event.data.text()
                        data = JSON.parse(text) as Record<string, unknown>
                    } else {
                        data = JSON.parse(event.data as string) as Record<string, unknown>
                    }
                } catch {
                    if (DEBUG) console.warn('[GeminiLive] Failed to parse message')
                    return
                }

                // Log non-audio message types for debugging. Key list is only
                // built when it is actually printed (not for every audio frame).
                if (!data.serverContent) {
                    console.log('[GeminiLive] Message:', Object.keys(data).join(', '), JSON.stringify(data).slice(0, 200))
                }

                // Setup complete
                if (data.setupComplete && !setupDone) {
                    setupDone = true
                    if (DEBUG) console.log('[GeminiLive] Setup complete')
                    state.statusCallback?.('connected')

                    // Start audio capture
                    startAudioCapture(state.playbackContext!)

                    // Send initial context if available (no clientContent greeting — it breaks tool calls)
                    if (config.initialContext) {
                        sendClientContent(`[Context] ${config.initialContext}`)
                    }

                    resolve()
                    return
                }

                // Server content (audio / text / turn complete)
                const serverContent = data.serverContent as {
                    modelTurn?: { parts?: Array<{ inlineData?: { data: string; mimeType: string }; text?: string }> }
                    turnComplete?: boolean
                } | undefined

                if (serverContent) {
                    if (serverContent.modelTurn?.parts) {
                        // Model is generating — mute mic to prevent barge-in from noise
                        if (!state.modelSpeaking) {
                            state.modelSpeaking = true
                            state.recorder?.setMuted(true)
                        }
                        for (const part of serverContent.modelTurn.parts) {
                            if (part.inlineData?.data) {
                                state.player?.enqueue(part.inlineData.data)
                            }
                            if (part.text) {
                                console.log('[GeminiLive] Text:', part.text)
                            }
                        }
                    }
                    if (serverContent.turnComplete) {
                        console.log('[GeminiLive] Turn complete')
                        // Model done — restore the user's own mute choice rather
                        // than unconditionally unmuting a mic the user muted.
                        state.modelSpeaking = false
                        state.recorder?.setMuted(state.micMuted)
                    }
                }

                // Tool calls
                const toolCall = data.toolCall as {
                    functionCalls?: Array<{ name: string; args: Record<string, unknown>; id: string }>
                } | undefined

                if (toolCall?.functionCalls && toolCall.functionCalls.length > 0) {
                    console.log('[GeminiLive] Tool calls:', toolCall.functionCalls.map((c) => c.name))

                    const responses = await handleGeminiFunctionCalls(
                        toolCall.functionCalls as GeminiFunctionCall[]
                    )

                    // Send tool responses back
                    if (state.ws?.readyState === WebSocket.OPEN) {
                        state.ws.send(JSON.stringify({
                            toolResponse: {
                                functionResponses: responses.map((r) => ({
                                    id: r.id,
                                    name: r.name,
                                    response: r.response
                                }))
                            }
                        }))
                    }
                }
            }

            ws.onerror = (event) => {
                console.error('[GeminiLive] WebSocket error:', event)
                if (!setupDone) {
                    // Only report through the shared callback if this socket is
                    // still the active one; always settle our own promise.
                    if (state.ws === ws) {
                        state.statusCallback?.('error', 'WebSocket connection failed')
                    }
                    reject(new Error('WebSocket connection failed'))
                }
            }

            ws.onclose = (event) => {
                if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason)
                // cleanup() clears state.ws before calling close(), so a mismatch
                // means this socket was already torn down by endSession() or by a
                // newer startSession(). In that case the shared state belongs to
                // someone else and must not be touched from here.
                const isCurrent = state.ws === ws
                if (isCurrent) {
                    cleanup()
                    resetRealtimeSessionState()
                }
                if (!setupDone) {
                    const message = event.reason || 'WebSocket closed before setup completed'
                    if (isCurrent) state.statusCallback?.('error', message)
                    reject(new Error(message))
                    return
                }
                if (isCurrent) state.statusCallback?.('disconnected')
            }
        })
    }

    /** Tears down all session resources and reports `disconnected`. */
    async endSession(): Promise<void> {
        cleanup()
        resetRealtimeSessionState()
        state.statusCallback?.('disconnected')
    }

    /** Sends `message` as a completed user turn (no-op when the socket is not open). */
    sendTextMessage(message: string): void {
        sendClientContent(message)
    }

    /** Sends `update` as a user turn prefixed with `[System Context Update]`. */
    sendContextualUpdate(update: string): void {
        // Send as a system-like context message
        sendClientContent(`[System Context Update] ${update}`)
    }
}
/**
 * Headless React component that wires the Gemini Live backend into the app:
 * it publishes the status callback and mic-mute flag into the module state,
 * registers the session store used by the voice client tools, and registers
 * a GeminiLiveVoiceSessionImpl as the active voice session.
 *
 * Renders nothing. `onRegistered` fires once, after the session implementation
 * has been registered successfully.
 */
export function GeminiLiveVoiceSession({
    api,
    micMuted = false,
    onStatusChange,
    onRegistered,
    getSession,
    sendMessage,
    approvePermission,
    denyPermission
}: GeminiLiveVoiceSessionProps) {
    const hasRegistered = useRef(false)

    // Store status callback
    useEffect(() => {
        state.statusCallback = onStatusChange || null
        return () => { state.statusCallback = null }
    }, [onStatusChange])

    // Register session store for client tools (only once every accessor is provided)
    useEffect(() => {
        if (getSession && sendMessage && approvePermission && denyPermission) {
            registerSessionStore({
                // NOTE(review): the type argument of Record was lost in the reviewed
                // copy; `Record<string, unknown>` is assumed — confirm against
                // registerSessionStore's parameter type.
                getSession: (sessionId: string) =>
                    getSession(sessionId) as { agentState?: { requests?: Record<string, unknown> } } | null,
                sendMessage,
                approvePermission,
                denyPermission
            })
        }
    }, [getSession, sendMessage, approvePermission, denyPermission])

    // Register voice session once; the ref guards against re-registration on re-render
    useEffect(() => {
        if (!hasRegistered.current) {
            try {
                registerVoiceSession(new GeminiLiveVoiceSessionImpl(api))
                hasRegistered.current = true
                onRegistered?.()
            } catch (error) {
                console.error('[GeminiLive] Failed to register voice session:', error)
            }
        }
    }, [api]) // eslint-disable-line react-hooks/exhaustive-deps

    // Sync mic mute state — also persist to module state so startAudioCapture can apply it
    useEffect(() => {
        state.micMuted = micMuted
        if (state.recorder) {
            state.recorder.setMuted(micMuted)
        }
    }, [micMuted])

    // Cleanup on unmount
    useEffect(() => {
        return () => {
            cleanup()
        }
    }, [])

    return null
}
/**
 * VoiceSession implementation for Qwen Realtime, connected through the hub's
 * WebSocket proxy (the browser WebSocket API cannot set the Authorization
 * header DashScope requires).
 *
 * Live resources are kept in the module-level `state` object so that
 * cleanup() and the React component can reach them.
 */
class QwenVoiceSessionImpl implements VoiceSession {
    private api: ApiClient

    constructor(api: ApiClient) {
        this.api = api
    }

    /**
     * Opens a session: creates the playback context, checks availability and
     * microphone permission, connects to the proxy, and configures the session
     * once the server announces `session.created`.
     *
     * @param config - session options; `initialContext`, when set, is appended
     *   to the system instructions.
     * @returns a promise that resolves on `session.updated`.
     * @throws when Qwen is unavailable, microphone access is denied, the server
     *   reports an error during setup, or the socket fails/closes before setup.
     */
    async startSession(config: VoiceSessionConfig): Promise<void> {
        cleanup()
        state.statusCallback?.('connecting')

        // Create playback AudioContext immediately while still inside the user
        // gesture (click/tap). Mobile browsers require this for autoplay policy.
        // Store in state so cleanup() can close it on failure or stop.
        state.playbackContext = new AudioContext({ sampleRate: 24000 })
        await state.playbackContext.resume()

        // Check Qwen availability (hub no longer sends the raw API key)
        const tokenResp = await fetchQwenToken(this.api)
        if (!tokenResp.allowed) {
            const msg = tokenResp.error ?? 'DashScope API key not available'
            // Release the playback context created above instead of leaking it.
            cleanup()
            state.statusCallback?.('error', msg)
            throw new Error(msg)
        }
        state.apiKey = null // key stays server-side
        state.wsBaseUrl = tokenResp.wsUrl || null

        // Request microphone (permission probe only; the recorder opens its own stream)
        let permissionStream: MediaStream | null = null
        try {
            permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true })
        } catch (error) {
            cleanup()
            state.statusCallback?.('error', 'Microphone permission denied')
            throw error
        } finally {
            permissionStream?.getTracks().forEach((t) => t.stop())
        }

        // Connect via Hub WebSocket proxy (DashScope requires Authorization header,
        // which browser WebSocket API doesn't support)
        const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
        const defaultProxyUrl = `${protocol}//${window.location.host}/api/voice/qwen-ws`
        const proxyUrl = state.wsBaseUrl || defaultProxyUrl
        const model = QWEN_REALTIME_MODEL
        const authToken = this.api.getAuthToken() || ''
        const separator = proxyUrl.includes('?') ? '&' : '?'
        const wsUrl = `${proxyUrl}${separator}model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}`
        const ws = new WebSocket(wsUrl)
        state.ws = ws

        return new Promise<void>((resolve, reject) => {
            let sessionReady = false

            ws.onopen = () => {
                if (DEBUG) console.log('[Qwen] WebSocket connected')
            }

            ws.onmessage = async (event) => {
                let data: Record<string, unknown>
                try {
                    data = JSON.parse(event.data as string) as Record<string, unknown>
                } catch {
                    if (DEBUG) console.warn('[Qwen] Failed to parse message')
                    return
                }

                const eventType = data.type as string

                // Session created - send configuration
                if (eventType === 'session.created' && !sessionReady) {
                    if (DEBUG) console.log('[Qwen] Session created')

                    // Build tools config
                    const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({
                        type: 'function' as const,
                        name: td.name,
                        description: td.description,
                        parameters: td.parameters
                    }))

                    // Send session.update with full configuration
                    const basePrompt = VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK
                    const instructions = config.initialContext
                        ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}`
                        : basePrompt

                    sendEvent('session.update', {
                        session: {
                            modalities: ['text', 'audio'],
                            voice: QWEN_REALTIME_VOICE,
                            input_audio_format: 'pcm',
                            output_audio_format: 'pcm',
                            instructions,
                            temperature: 0.7,
                            turn_detection: {
                                type: 'server_vad',
                                threshold: 0.5,
                                silence_duration_ms: 800,
                                prefix_padding_ms: 300
                            },
                            tools,
                            tool_choice: 'auto'
                        }
                    })
                    return
                }

                // Session updated - ready to go
                if (eventType === 'session.updated') {
                    sessionReady = true
                    if (DEBUG) console.log('[Qwen] Session configured')
                    state.statusCallback?.('connected')
                    startAudioCapture(state.playbackContext!)
                    resolve()
                    return
                }

                // Audio output streaming
                if (eventType === 'response.audio.delta') {
                    const delta = data.delta as string
                    if (delta) {
                        state.player?.enqueue(delta)
                    }
                    return
                }

                // Text transcript (for debug)
                if (eventType === 'response.audio_transcript.delta' && DEBUG) {
                    console.log('[Qwen] Transcript:', data.delta)
                    return
                }

                // Function call complete
                if (eventType === 'response.function_call_arguments.done') {
                    const callId = data.call_id as string
                    const fnName = data.name as string
                    const argsStr = data.arguments as string

                    if (DEBUG) console.log('[Qwen] Tool call:', fnName, argsStr)

                    // Malformed arguments are deliberately tolerated: the tool is
                    // invoked with an empty argument object.
                    let args: Record<string, unknown> = {}
                    try { args = JSON.parse(argsStr) } catch { /* empty */ }

                    // Execute the tool
                    const handler = fnName === 'messageCodingAgent'
                        ? realtimeClientTools.messageCodingAgent
                        : fnName === 'processPermissionRequest'
                            ? realtimeClientTools.processPermissionRequest
                            : null

                    // Always produce an output for the call: a handler that throws
                    // would otherwise leave the model waiting for a result forever
                    // and surface as an unhandled rejection of this async handler.
                    let result: unknown
                    try {
                        result = handler
                            ? await handler(args)
                            : `error (unknown tool: ${fnName})`
                    } catch (error) {
                        console.error('[Qwen] Tool call failed:', fnName, error)
                        result = `error (${error instanceof Error ? error.message : String(error)})`
                    }

                    // Send function result back
                    sendEvent('conversation.item.create', {
                        item: {
                            type: 'function_call_output',
                            call_id: callId,
                            output: typeof result === 'string' ? result : JSON.stringify(result)
                        }
                    })
                    // Trigger model to continue
                    sendEvent('response.create')
                    return
                }

                // VAD: user started speaking - barge-in
                if (eventType === 'input_audio_buffer.speech_started') {
                    if (state.player?.isPlaying()) {
                        state.player.clearQueue()
                    }
                    return
                }

                // Response done
                if (eventType === 'response.done' && DEBUG) {
                    const resp = data.response as Record<string, unknown> | undefined
                    const usage = resp?.usage as Record<string, unknown> | undefined
                    if (usage) console.log('[Qwen] Usage:', usage)
                    return
                }

                // Error
                if (eventType === 'error') {
                    const err = data.error as { message?: string } | undefined
                    const message = err?.message || 'Realtime session setup failed'
                    console.error('[Qwen] Server error:', message)
                    state.statusCallback?.('error', message)
                    if (!sessionReady) {
                        reject(new Error(message))
                        ws.close()
                    }
                    return
                }
            }

            ws.onerror = (event) => {
                console.error('[Qwen] WebSocket error:', event)
                if (!sessionReady) {
                    // Only report through the shared callback if this socket is
                    // still the active one; always settle our own promise.
                    if (state.ws === ws) {
                        state.statusCallback?.('error', 'WebSocket connection failed')
                    }
                    reject(new Error('WebSocket connection failed'))
                }
            }

            ws.onclose = (event) => {
                if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason)
                // cleanup() clears state.ws before calling close(), so a mismatch
                // means this socket was already torn down by endSession() or by a
                // newer startSession(). In that case the shared state belongs to
                // someone else and must not be touched from here.
                const isCurrent = state.ws === ws
                if (isCurrent) {
                    cleanup()
                    resetRealtimeSessionState()
                }
                if (!sessionReady) {
                    const message = event.reason || 'WebSocket closed before setup completed'
                    if (isCurrent) state.statusCallback?.('error', message)
                    reject(new Error(message))
                    return
                }
                if (isCurrent) state.statusCallback?.('disconnected')
            }
        })
    }

    /** Tears down all session resources and reports `disconnected`. */
    async endSession(): Promise<void> {
        cleanup()
        resetRealtimeSessionState()
        state.statusCallback?.('disconnected')
    }

    /** Adds `message` as a user item and asks the model to respond. */
    sendTextMessage(message: string): void {
        // Send text as a user message via conversation.item.create
        sendEvent('conversation.item.create', {
            item: {
                type: 'message',
                role: 'user',
                content: [{ type: 'input_text', text: message }]
            }
        })
        sendEvent('response.create')
    }

    /** Adds `update` as a user item without requesting a response. */
    sendContextualUpdate(update: string): void {
        // Send context as a system-like user message
        sendEvent('conversation.item.create', {
            item: {
                type: 'message',
                role: 'user',
                content: [{ type: 'input_text', text: `[System Context Update] ${update}` }]
            }
        })
    }
}
registerVoiceSession(new QwenVoiceSessionImpl(api)) + hasRegistered.current = true + onRegistered?.() + } catch (error) { + console.error('[Qwen] Failed to register voice session:', error) + } + } + }, [api]) // eslint-disable-line react-hooks/exhaustive-deps + + // Sync mic mute state — also persist to module state so startAudioCapture can apply it + useEffect(() => { + state.micMuted = micMuted + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + useEffect(() => { + return () => { cleanup() } + }, []) + + return null +} diff --git a/web/src/realtime/RealtimeVoiceSession.tsx b/web/src/realtime/RealtimeVoiceSession.tsx index 428d1c1715..89e438b749 100644 --- a/web/src/realtime/RealtimeVoiceSession.tsx +++ b/web/src/realtime/RealtimeVoiceSession.tsx @@ -136,6 +136,7 @@ export interface RealtimeVoiceSessionProps { api: ApiClient micMuted?: boolean onStatusChange?: StatusCallback + onRegistered?: () => void getSession?: (sessionId: string) => Session | null sendMessage?: (sessionId: string, message: string) => void approvePermission?: (sessionId: string, requestId: string) => Promise @@ -146,6 +147,7 @@ export function RealtimeVoiceSession({ api, micMuted: micMutedProp = false, onStatusChange, + onRegistered, getSession, sendMessage, approvePermission, @@ -241,6 +243,7 @@ export function RealtimeVoiceSession({ try { registerVoiceSession(new RealtimeVoiceSessionImpl(api)) hasRegistered.current = true + onRegistered?.() } catch (error) { console.error('[Voice] Failed to register voice session:', error) } diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx new file mode 100644 index 0000000000..b990d07bd7 --- /dev/null +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -0,0 +1,63 @@ +import { lazy, Suspense, useCallback, useEffect, useState } from 'react' +import { RealtimeVoiceSession } from './RealtimeVoiceSession' +import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +import 
/**
 * Gapless player for streamed base64-encoded 16-bit PCM audio.
 *
 * Each chunk becomes an AudioBufferSourceNode scheduled to start exactly when
 * the previously scheduled chunk ends, so consecutive chunks play back to back.
 */
export class GeminiAudioPlayer {
  private audioContext: AudioContext;
  private ownsContext: boolean;
  private sampleRate: number;
  private lastEndTime: number = 0;
  private activeSources: AudioBufferSourceNode[] = [];

  /**
   * @param audioContext - context to play through. When omitted the player
   *   creates its own and closes it in dispose(); a supplied context is never
   *   closed by the player.
   * @param sampleRate - sample rate of the incoming PCM in Hz (default 24000).
   */
  constructor(audioContext?: AudioContext, sampleRate: number = 24000) {
    this.sampleRate = sampleRate;
    if (audioContext) {
      this.audioContext = audioContext;
      this.ownsContext = false;
    } else {
      this.audioContext = new AudioContext({ sampleRate });
      this.ownsContext = true;
    }
    this.lastEndTime = this.audioContext.currentTime;
  }

  /**
   * Decodes one base64 PCM16 chunk and schedules it after everything already
   * queued. Empty chunks are ignored.
   */
  enqueue(base64Pcm: string): void {
    if (this.audioContext.state === 'suspended') {
      // resume() returns a promise that can reject; handle it so a failure is
      // logged rather than becoming an unhandled rejection.
      void this.audioContext.resume().catch((e: unknown) => {
        console.warn('[GeminiAudioPlayer] Failed to resume audio context', e);
      });
    }

    const arrayBuffer = base64ToArrayBuffer(base64Pcm);
    const float32Data = pcm16ToFloat32(arrayBuffer);

    if (float32Data.length === 0) return;

    const audioBuffer = this.audioContext.createBuffer(1, float32Data.length, this.sampleRate);
    audioBuffer.copyToChannel(new Float32Array(float32Data), 0);

    const source = this.audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(this.audioContext.destination);

    // Never schedule in the past: if the queue has drained, start now.
    const startTime = Math.max(this.audioContext.currentTime, this.lastEndTime);

    source.onended = () => {
      const index = this.activeSources.indexOf(source);
      if (index > -1) {
        this.activeSources.splice(index, 1);
      }
    };

    source.start(startTime);
    this.activeSources.push(source);

    this.lastEndTime = startTime + audioBuffer.duration;
  }

  /** Stops and disconnects every scheduled chunk and resets the schedule to "now". */
  clearQueue(): void {
    this.activeSources.forEach(source => {
      try {
        source.stop();
      } catch (e) {
        // Ignore if already stopped
      }
      source.disconnect();
    });
    this.activeSources = [];
    this.lastEndTime = this.audioContext.currentTime;
  }

  /** True while the end of the last scheduled chunk is still in the future. */
  isPlaying(): boolean {
    return this.lastEndTime > this.audioContext.currentTime;
  }

  /** Clears the queue and closes the context if this player created it. */
  dispose(): void {
    this.clearQueue();
    if (this.ownsContext && this.audioContext.state !== 'closed') {
      // close() returns a promise; a rejection here is not actionable.
      void this.audioContext.close().catch(() => {});
    }
  }
}
// Inline worklet source to avoid Vite bundling issues with ?url imports.
// AudioWorklet.addModule() requires a URL to valid JS, so we create a Blob URL.
const WORKLET_SOURCE = `
class PcmRecorderProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.buffer = new Float32Array(4096);
    this.idx = 0;
  }
  process(inputs) {
    const input = inputs[0];
    if (input && input.length > 0) {
      const channel = input[0];
      for (let i = 0; i < channel.length; i++) {
        this.buffer[this.idx++] = channel[i];
        if (this.idx >= 4096) {
          this.port.postMessage({ samples: this.buffer.slice() });
          this.idx = 0;
        }
      }
    }
    return true;
  }
}
registerProcessor('pcm-recorder-processor', PcmRecorderProcessor);
`;

/** Wraps WORKLET_SOURCE in a Blob URL; the caller must revoke it after use. */
function createWorkletUrl(): string {
  const blob = new Blob([WORKLET_SOURCE], { type: 'application/javascript' });
  return URL.createObjectURL(blob);
}

/**
 * Captures microphone audio at 16 kHz mono and delivers it as base64-encoded
 * 16-bit PCM chunks. Uses an AudioWorklet when available and falls back to a
 * ScriptProcessorNode otherwise.
 */
export class GeminiAudioRecorder {
  private audioContext: AudioContext | null = null;
  private mediaStream: MediaStream | null = null;
  private sourceNode: MediaStreamAudioSourceNode | null = null;
  private workletNode: AudioWorkletNode | null = null;
  private scriptNode: ScriptProcessorNode | null = null;
  // Last mute request; remembered so it can be applied once the stream exists.
  private muted = false;
  // Incremented by stop(); an in-flight start() compares against its own copy
  // to find out that it was cancelled while awaiting.
  private startToken = 0;

  /**
   * Starts capturing.
   *
   * @param onChunk - receives each 4096-sample chunk as base64 PCM16.
   * @param onError - receives start-up failures and microphone disconnects.
   * @throws rethrows any start-up failure after reporting it to `onError`.
   *   If stop()/dispose() is called while start-up is still pending, start()
   *   releases what it acquired and resolves without reporting anything.
   */
  async start(onChunk: (base64Pcm: string) => void, onError?: (error: Error) => void): Promise<void> {
    // Release anything left over from a previous start() on this instance.
    this.stop();
    const token = this.startToken;
    const cancelled = (): boolean => token !== this.startToken;

    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { sampleRate: 16000, channelCount: 1 }
      });
      if (cancelled()) {
        // stop() ran while the permission request was pending: this stream was
        // never stored, so it must be released here or the mic stays live.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      this.mediaStream = stream;
      // Apply a mute that was requested before the stream existed.
      this.setMuted(this.muted);

      stream.getTracks().forEach((track) => {
        track.onended = () => {
          if (onError) onError(new Error('Microphone disconnected'));
        };
      });

      const context = new AudioContext({ sampleRate: 16000 });
      this.audioContext = context;
      if (context.state === 'suspended') {
        await context.resume();
        if (cancelled()) return; // stop() already released everything stored so far
      }

      const source = context.createMediaStreamSource(stream);
      this.sourceNode = source;

      try {
        const workletUrl = createWorkletUrl();
        try {
          await context.audioWorklet.addModule(workletUrl);
        } finally {
          // Revoke even when addModule() throws so the Blob URL is not leaked.
          URL.revokeObjectURL(workletUrl);
        }
        if (cancelled()) return;

        const worklet = new AudioWorkletNode(context, 'pcm-recorder-processor');
        this.workletNode = worklet;
        worklet.port.onmessage = (event) => {
          const pcm16 = float32ToPcm16(event.data.samples);
          const base64 = arrayBufferToBase64(pcm16);
          onChunk(base64);
        };
        // Connect source → worklet → silent sink → destination.
        // The downstream connection is required so the audio graph pulls
        // frames through the worklet node and port.onmessage fires.
        const sink = context.createGain();
        sink.gain.value = 0;
        source.connect(worklet);
        worklet.connect(sink);
        sink.connect(context.destination);
      } catch (e) {
        if (cancelled()) return;
        console.warn('[GeminiLive] AudioWorklet failed, falling back to ScriptProcessorNode', e);
        const script = context.createScriptProcessor(4096, 1, 1);
        this.scriptNode = script;
        script.onaudioprocess = (event) => {
          const inputData = event.inputBuffer.getChannelData(0);
          const pcm16 = float32ToPcm16(new Float32Array(inputData));
          const base64 = arrayBufferToBase64(pcm16);
          onChunk(base64);
        };
        source.connect(script);
        script.connect(context.destination);
      }
    } catch (e) {
      // A failure that arrives after stop() concerns a capture nobody wants any more.
      if (cancelled()) return;
      // Release whatever was acquired before the failure (stream, context, nodes).
      this.stop();
      if (onError) onError(e instanceof Error ? e : new Error(String(e)));
      throw e;
    }
  }

  /** Stops capture, releases the microphone and closes the capture context. Idempotent. */
  stop(): void {
    // Invalidate any start() that is still awaiting.
    this.startToken++;

    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach(track => {
        track.onended = null;
        track.stop();
      });
      this.mediaStream = null;
    }

    if (this.scriptNode) {
      this.scriptNode.disconnect();
      this.scriptNode = null;
    }

    if (this.workletNode) {
      this.workletNode.disconnect();
      this.workletNode = null;
    }

    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }

    if (this.audioContext) {
      // close() returns a promise; a rejection here is not actionable.
      void this.audioContext.close().catch(() => {});
      this.audioContext = null;
    }
  }

  /**
   * Enables or disables the microphone tracks. The request is remembered, so
   * calling this before start() has obtained the stream still takes effect.
   */
  setMuted(muted: boolean): void {
    this.muted = muted;
    if (this.mediaStream) {
      this.mediaStream.getAudioTracks().forEach(track => {
        track.enabled = !muted;
      });
    }
  }

  /** Alias for stop(). */
  dispose(): void {
    this.stop();
  }
}
// An AudioWorklet processor executes in its own global scope. The two ambient
// declarations below describe the pieces of that scope this file uses, so
// TypeScript accepts them without the DOM lib types.
declare class AudioWorkletProcessor {
  readonly port: MessagePort
  constructor()
}
declare function registerProcessor(name: string, ctor: new () => AudioWorkletProcessor): void

/**
 * Collects samples from the first channel of the first input and posts them
 * to the main thread as `{ samples }` messages of 4096 samples each.
 */
class PcmRecorderProcessor extends AudioWorkletProcessor {
  private readonly capacity = 4096;
  private readonly pending: Float32Array;
  private filled = 0;

  constructor() {
    super();
    this.pending = new Float32Array(this.capacity);
  }

  /** Buffers the incoming frame; always returns true. */
  process(inputs: Float32Array[][]): boolean {
    const firstInput = inputs[0];
    if (!firstInput || firstInput.length === 0) {
      return true;
    }

    for (const sample of firstInput[0]) {
      this.pending[this.filled] = sample;
      this.filled += 1;
      if (this.filled >= this.capacity) {
        // Post a copy so later writes do not alter the posted chunk.
        this.port.postMessage({ samples: this.pending.slice() });
        this.filled = 0;
      }
    }
    return true;
  }
}

registerProcessor('pcm-recorder-processor', PcmRecorderProcessor);
(-1.0))).toBeLessThan(0.001) + }) + + test('handles empty input', () => { + const input = new Float32Array(0) + const pcm16 = float32ToPcm16(input) + expect(pcm16.byteLength).toBe(0) + const output = pcm16ToFloat32(pcm16) + expect(output.length).toBe(0) + }) + }) + + describe('arrayBufferToBase64 / base64ToArrayBuffer round-trip', () => { + test('preserves binary data', () => { + const original = new Uint8Array([0, 1, 127, 128, 255]) + const base64 = arrayBufferToBase64(original.buffer) + const restored = new Uint8Array(base64ToArrayBuffer(base64)) + + expect(restored.length).toBe(original.length) + for (let i = 0; i < original.length; i++) { + expect(restored[i]).toBe(original[i]) + } + }) + + test('handles empty buffer', () => { + const empty = new ArrayBuffer(0) + const base64 = arrayBufferToBase64(empty) + expect(base64).toBe('') + const restored = base64ToArrayBuffer(base64) + expect(restored.byteLength).toBe(0) + }) + }) +}) diff --git a/web/src/realtime/gemini/pcmUtils.ts b/web/src/realtime/gemini/pcmUtils.ts new file mode 100644 index 0000000000..67e2928fc0 --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.ts @@ -0,0 +1,39 @@ +export function float32ToPcm16(samples: Float32Array): ArrayBuffer { + const buffer = new ArrayBuffer(samples.length * 2); + const view = new DataView(buffer); + for (let i = 0; i < samples.length; i++) { + let s = Math.max(-1, Math.min(1, samples[i])); + s = s < 0 ? s * 0x8000 : s * 0x7FFF; + view.setInt16(i * 2, s, true); + } + return buffer; +} + +export function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array { + const int16Array = new Int16Array(buffer); + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + const s = int16Array[i]; + float32Array[i] = s < 0 ? 
s / 0x8000 : s / 0x7FFF; + } + return float32Array; +} + +export function arrayBufferToBase64(buffer: ArrayBuffer): string { + let binary = ''; + const bytes = new Uint8Array(buffer); + const len = bytes.byteLength; + for (let i = 0; i < len; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +export function base64ToArrayBuffer(base64: string): ArrayBuffer { + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes.buffer; +} diff --git a/web/src/realtime/gemini/toolAdapter.test.ts b/web/src/realtime/gemini/toolAdapter.test.ts new file mode 100644 index 0000000000..651e890a10 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.test.ts @@ -0,0 +1,28 @@ +import { describe, test, expect } from 'vitest' +import { handleGeminiFunctionCall, handleGeminiFunctionCalls } from './toolAdapter' +import type { GeminiFunctionCall } from './toolAdapter' + +describe('toolAdapter', () => { + test('returns error for unknown tool', async () => { + const call: GeminiFunctionCall = { + name: 'unknownTool', + args: {}, + id: 'call-1' + } + const resp = await handleGeminiFunctionCall(call) + expect(resp.name).toBe('unknownTool') + expect(resp.id).toBe('call-1') + expect(resp.response.result).toContain('unknown tool') + }) + + test('handles multiple calls in parallel', async () => { + const calls: GeminiFunctionCall[] = [ + { name: 'unknownA', args: {}, id: 'a' }, + { name: 'unknownB', args: {}, id: 'b' } + ] + const responses = await handleGeminiFunctionCalls(calls) + expect(responses.length).toBe(2) + expect(responses[0].id).toBe('a') + expect(responses[1].id).toBe('b') + }) +}) diff --git a/web/src/realtime/gemini/toolAdapter.ts b/web/src/realtime/gemini/toolAdapter.ts new file mode 100644 index 0000000000..dbf4dee9c9 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.ts @@ -0,0 +1,76 @@ +import { realtimeClientTools } from 
'../realtimeClientTools' + +/** + * Gemini Live API function call from server. + * Matches the `toolCall` shape in a BidiGenerateContent serverMessage. + */ +export interface GeminiFunctionCall { + name: string + args: Record<string, unknown> + id: string +} + +/** + * Response sent back to Gemini Live via `toolResponse`. + */ +export interface GeminiFunctionResponse { + name: string + id: string + response: { result: string } +} + +type ClientToolHandler = (parameters: unknown) => Promise<string> + +const toolHandlers: Record<string, ClientToolHandler> = { + messageCodingAgent: realtimeClientTools.messageCodingAgent, + processPermissionRequest: realtimeClientTools.processPermissionRequest +} + +/** + * Execute a Gemini Live function call using the existing client tool handlers. + * Returns a GeminiFunctionResponse ready to send back over the WebSocket. + */ +export async function handleGeminiFunctionCall( + call: GeminiFunctionCall +): Promise<GeminiFunctionResponse> { + const handler = toolHandlers[call.name] + + if (!handler) { + return { + name: call.name, + id: call.id, + response: { result: `error (unknown tool: ${call.name})` } + } + } + + try { + const result = await handler(call.args) + return { + name: call.name, + id: call.id, + response: { result } + } + } catch (error) { + const message = error instanceof Error ? error.message : 'unknown error' + return { + name: call.name, + id: call.id, + response: { result: `error (${message})` } + } + } +} + +/** + * Process multiple function calls sequentially to avoid racing on shared + * session state (e.g. processPermissionRequest resolving the same pending + * request twice when calls run in parallel).
+ */ +export async function handleGeminiFunctionCalls( + calls: GeminiFunctionCall[] +): Promise<GeminiFunctionResponse[]> { + const responses: GeminiFunctionResponse[] = [] + for (const call of calls) { + responses.push(await handleGeminiFunctionCall(call)) + } + return responses +} diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index 58b7b229e3..07ebfe8f0a 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -15,8 +15,11 @@ export { // Client tools export { realtimeClientTools, registerSessionStore } from './realtimeClientTools' -// Voice session component +// Voice session components export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +export { GeminiLiveVoiceSession, type GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' +export { QwenVoiceSession, type QwenVoiceSessionProps } from './QwenVoiceSession' +export { VoiceBackendSession, type VoiceBackendSessionProps } from './VoiceBackendSession' // Voice hooks export { voiceHooks, registerVoiceHooksStore } from './hooks/voiceHooks' diff --git a/web/tsconfig.json b/web/tsconfig.json index 8b0682a4bb..de7bcdca50 100644 --- a/web/tsconfig.json +++ b/web/tsconfig.json @@ -11,5 +11,6 @@ "@/*": ["./src/*"] } }, - "include": ["src"] + "include": ["src"], + "exclude": ["src/**/*.test.ts", "src/**/*.test.tsx", "src/**/*.spec.ts", "src/**/*.spec.tsx"] } From 4f4db150668d4b0c59cbf95e98b60b7eae5aa5d6 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 19:56:05 +0100 Subject: [PATCH 02/34] fix(voice): restore user mic mute state after Gemini turn completes turnComplete handler was unconditionally calling setMuted(false), which re-enabled the mic track even when the user had manually muted. Now restores to state.micMuted instead.
--- web/src/realtime/GeminiLiveVoiceSession.tsx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index a0461e092f..127cc30402 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -212,9 +212,9 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } if (serverContent.turnComplete) { console.log('[GeminiLive] Turn complete') - // Model done — unmute mic for next user turn + // Restore to user's chosen mute state, not unconditionally unmuted state.modelSpeaking = false - state.recorder?.setMuted(false) + state.recorder?.setMuted(state.micMuted) } } @@ -279,17 +279,18 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } sendContextualUpdate(update: string): void { - // Send as a system-like context message - sendClientContent(`[System Context Update] ${update}`) + // Append context without triggering a response — turnComplete: false accumulates + // silently until the next sendTextMessage fires with turnComplete: true + sendClientContent(`[System Context Update] ${update}`, false) } } -function sendClientContent(text: string): void { +function sendClientContent(text: string, turnComplete = true): void { if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return state.ws.send(JSON.stringify({ clientContent: { turns: [{ role: 'user', parts: [{ text }] }], - turnComplete: true + turnComplete } })) } From 03db00b55501323313cd02657b4fe8e01a324d9e Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 21:11:24 +0100 Subject: [PATCH 03/34] fix(voice): remove hard-coded Chinese language from Gemini backend buildGeminiLiveConfig was appending VOICE_CHINESE_LANGUAGE_BLOCK which forced Gemini to always respond in Mandarin regardless of user locale. 
Gemini now uses the neutral base prompt and responds in the language the user speaks to it, consistent with the ElevenLabs behaviour. --- shared/src/voice.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 51cd681112..21a7da64b5 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -350,7 +350,7 @@ export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclara export function buildGeminiLiveConfig(): GeminiLiveConfig { return { model: GEMINI_LIVE_MODEL, - systemInstruction: VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK, + systemInstruction: VOICE_SYSTEM_PROMPT, tools: [ { functionDeclarations: buildGeminiLiveFunctionDeclarations() From cdea28d509c5f0aed005402fd4812ae91207ffe7 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 21:13:13 +0100 Subject: [PATCH 04/34] fix(voice): reset modelSpeaking in cleanup to unblock mic on restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the session closes while Gemini is mid-speech, cleanup() left state.modelSpeaking=true. The next startSession() would then drop all mic audio in sendAudioChunk() until a model turn eventually flipped the flag — effectively deaf until page reload. 
--- web/src/realtime/GeminiLiveVoiceSession.tsx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 127cc30402..25786a6dff 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -59,6 +59,8 @@ function cleanup() { } state.ws = null } + // Always reset modelSpeaking so a restart doesn't begin with audio capture silenced + state.modelSpeaking = false } class GeminiLiveVoiceSessionImpl implements VoiceSession { From 0628457ed40df2565861b5ffe0cc88ab766d3d6f Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 22:21:22 +0100 Subject: [PATCH 05/34] fix(voice): guard stale close handlers in Gemini and Qwen sessions ws.onclose operated on module-level state.ws, not the socket that fired the event. A rapid stop/restart could cause the old socket's onclose to call cleanup() after the new socket was assigned, tearing down the live session. Guard with `if (state.ws !== ws) return` before cleanup. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 1 + web/src/realtime/QwenVoiceSession.tsx | 1 + 2 files changed, 2 insertions(+) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 25786a6dff..1d70491944 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -256,6 +256,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } ws.onclose = (event) => { + if (state.ws !== ws) return if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason) cleanup() resetRealtimeSessionState() diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index f0624cc2da..f06f5a0548 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -283,6 +283,7 @@ class QwenVoiceSessionImpl implements VoiceSession { } ws.onclose = (event) => { + if (state.ws !== ws) return if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason) cleanup() resetRealtimeSessionState() From 89948352ff36ae4fe82105dd9be1d87a9d5c8d03 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 22:37:47 +0100 Subject: [PATCH 06/34] fix(voice): remove hard-coded Chinese language from Qwen backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Matches the Gemini fix — both backends now use VOICE_SYSTEM_PROMPT without the Chinese language block, giving consistent English-default behaviour across all non-ElevenLabs backends. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/QwenVoiceSession.tsx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index f06f5a0548..1fa83fd40d 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -9,7 +9,6 @@ import { QWEN_REALTIME_MODEL, QWEN_REALTIME_VOICE, VOICE_SYSTEM_PROMPT, - VOICE_CHINESE_LANGUAGE_BLOCK, VOICE_TOOL_DEFINITIONS } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' @@ -158,7 +157,7 @@ class QwenVoiceSessionImpl implements VoiceSession { })) // Send session.update with full configuration - const basePrompt = VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK + const basePrompt = VOICE_SYSTEM_PROMPT const instructions = config.initialContext ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}` : basePrompt From a9dfd1c6b8c83747321f041260b1bcb7a8359854 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 22:49:27 +0100 Subject: [PATCH 07/34] feat(voice): proactive/reactive toggle in voice settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a "Proactive voice" toggle (default: off = reactive) to the Voice Assistant settings section. Reactive (default): initial context and agent-ready events are fed silently; the assistant waits for the user to speak first. Proactive: original behaviour — Gemini/Qwen narrate context on connect and speak unprompted when the agent finishes a task. ElevenLabs is also affected via onReady sending a user message rather than a silent update. Covers all three backends uniformly. localStorage key: hapi-voice-proactive. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/lib/locales/en.ts | 2 ++ web/src/lib/locales/zh-CN.ts | 2 ++ web/src/realtime/GeminiLiveVoiceSession.tsx | 4 ++- web/src/realtime/hooks/voiceHooks.ts | 8 ++++- web/src/routes/settings/index.tsx | 37 ++++++++++++++++++++- 5 files changed, 50 insertions(+), 3 deletions(-) diff --git a/web/src/lib/locales/en.ts b/web/src/lib/locales/en.ts index 842097126a..eb97380fa2 100644 --- a/web/src/lib/locales/en.ts +++ b/web/src/lib/locales/en.ts @@ -418,6 +418,8 @@ export default { 'settings.voice.autoDetect': 'Auto-detect', 'settings.voice.voice': 'Voice', 'settings.voice.voiceDefault': 'Default', + 'settings.voice.proactive': 'Proactive voice', + 'settings.voice.proactive.description': 'When on, the assistant narrates agent activity and speaks unprompted when the agent finishes a task. When off, it waits for you to speak first.', 'settings.about.title': 'About', 'settings.about.website': 'Website', 'settings.about.appVersion': 'App Version', diff --git a/web/src/lib/locales/zh-CN.ts b/web/src/lib/locales/zh-CN.ts index c1873a303a..20e63696ac 100644 --- a/web/src/lib/locales/zh-CN.ts +++ b/web/src/lib/locales/zh-CN.ts @@ -420,6 +420,8 @@ export default { 'settings.voice.autoDetect': '自动检测', 'settings.voice.voice': '声音', 'settings.voice.voiceDefault': '默认', + 'settings.voice.proactive': '主动语音', + 'settings.voice.proactive.description': '开启后,助手会主动播报代理活动,并在任务完成时主动发言。关闭后,助手等待您先开口。', 'settings.about.title': '关于', 'settings.about.website': '官方网站', 'settings.about.appVersion': '应用版本', diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 1d70491944..6436ca1264 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -182,8 +182,10 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { startAudioCapture(state.playbackContext!) 
// Send initial context if available (no clientContent greeting — it breaks tool calls) + // In reactive mode (default) send silently so Gemini doesn't narrate on connect. if (config.initialContext) { - sendClientContent(`[Context] ${config.initialContext}`) + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + sendClientContent(`[Context] ${config.initialContext}`, proactive) } resolve() diff --git a/web/src/realtime/hooks/voiceHooks.ts b/web/src/realtime/hooks/voiceHooks.ts index c6318d3c16..e876302945 100644 --- a/web/src/realtime/hooks/voiceHooks.ts +++ b/web/src/realtime/hooks/voiceHooks.ts @@ -151,7 +151,13 @@ export const voiceHooks = { reportSession(sessionId) const messages = messagesGetter?.(sessionId) ?? [] const lastAssistantText = extractLastAssistantSpeakable(messages) - reportTextUpdate(formatReadyEvent(sessionId, lastAssistantText)) + const update = formatReadyEvent(sessionId, lastAssistantText) + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + if (proactive) { + reportTextUpdate(update) + } else { + reportContextualUpdate(update) + } }, /** diff --git a/web/src/routes/settings/index.tsx b/web/src/routes/settings/index.tsx index d754e9ad75..d798a64e48 100644 --- a/web/src/routes/settings/index.tsx +++ b/web/src/routes/settings/index.tsx @@ -349,6 +349,20 @@ export default function SettingsPage() { const [playingVoiceId, setPlayingVoiceId] = useState(null) const currentAudioRef = useRef(null) + // Voice proactive mode - read from localStorage, default false (reactive) + const [voiceProactive, setVoiceProactive] = useState(() => { + return localStorage.getItem('hapi-voice-proactive') === 'true' + }) + + const handleVoiceProactiveChange = (value: boolean) => { + setVoiceProactive(value) + if (value) { + localStorage.setItem('hapi-voice-proactive', 'true') + } else { + localStorage.removeItem('hapi-voice-proactive') + } + } + const fontScaleOptions = getFontScaleOptions() const terminalFontSizeOptions 
= getTerminalFontSizeOptions() const composerEnterBehaviorOptions = getComposerEnterBehaviorOptions() @@ -988,7 +1002,6 @@ export default function SettingsPage() { )} -
+ +
+
+ {t('settings.voice.proactive')} + +
+

{t('settings.voice.proactive.description')}

+
{/* About section */} From ea8ac50b4c78bad55990922a161d1f04a90882c0 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 25 May 2026 23:10:00 +0100 Subject: [PATCH 08/34] fix(voice): normalize WS close codes, drop barrel re-exports, fix SSE visibility - hub/server.ts: add toClientCloseCode() to normalize reserved upstream close codes (1005/1006/1015) to 1011 before forwarding to browser; abnormal upstream drops (1006) would otherwise throw on clientWs.close() and leave the browser socket open - realtime/index.ts: remove static GeminiLiveVoiceSession and QwenVoiceSession barrel exports; VoiceBackendSession lazy-imports both, so barrel re-exports created static dependencies that defeated the intended code-split - App.tsx: gate global useVisibilityReporter on !sessionEventSubscription so the always-on SSE connection does not suppress native Web Push notifications for sessions the user is not currently viewing via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/server.ts | 13 +++++++++++-- web/src/App.tsx | 2 +- web/src/realtime/index.ts | 2 -- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 9a4643c924..1361c4f8d7 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -29,6 +29,15 @@ import { loadEmbeddedAssetMap, type EmbeddedWebAsset } from './embeddedAssets' import { isBunCompiled } from '../utils/bunCompiled' import type { Store } from '../store' +// Normalise upstream close codes before forwarding to the browser client. +// Codes 1005/1006/1015 are reserved and cannot be sent in a close frame; +// abnormal upstream drops commonly produce 1006, which would throw on clientWs.close(). +function toClientCloseCode(code: number): number { + return code >= 1000 && code <= 4999 && code !== 1005 && code !== 1006 && code !== 1015 + ? 
code + : 1011 +} + // Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions function createGeminiProxyWebSocketHandler() { const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' @@ -65,7 +74,7 @@ function createGeminiProxyWebSocketHandler() { } upstream.onclose = (event) => { pendingMap.delete(clientWs) - try { clientWs.close(event.code, event.reason) } catch { /* */ } + try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } upstreamMap.delete(clientWs) } }, @@ -120,7 +129,7 @@ function createQwenProxyWebSocketHandler() { try { clientWs.close(1011, 'Upstream error') } catch { /* */ } } upstream.onclose = (event) => { - try { clientWs.close(event.code, event.reason) } catch { /* */ } + try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } upstreamMap.delete(clientWs) } }, diff --git a/web/src/App.tsx b/web/src/App.tsx index fc23635acf..af7c6599ec 100644 --- a/web/src/App.tsx +++ b/web/src/App.tsx @@ -327,7 +327,7 @@ function AppInner() { useVisibilityReporter({ api, subscriptionId: globalSubscriptionId, - enabled: sseEnabled + enabled: sseEnabled && !sessionEventSubscription }) useVisibilityReporter({ diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index 07ebfe8f0a..d3e58c686f 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -17,8 +17,6 @@ export { realtimeClientTools, registerSessionStore } from './realtimeClientTools // Voice session components export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession' -export { GeminiLiveVoiceSession, type GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' -export { QwenVoiceSession, type QwenVoiceSessionProps } from './QwenVoiceSession' export { VoiceBackendSession, type VoiceBackendSessionProps } 
from './VoiceBackendSession' // Voice hooks From d7c04db8435b2af49383d6f2233f32e7150776f7 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 09:19:53 +0100 Subject: [PATCH 09/34] fix(voice): respect language setting in Gemini/Qwen; fix voice-start toggle label - buildGeminiLiveConfig() now accepts optional language param; appends VOICE_CHINESE_LANGUAGE_BLOCK only when language === 'zh' - GeminiLiveVoiceSession passes config.language through - QwenVoiceSession conditionally builds basePrompt from language setting - Fixes silent no-op when user selects Chinese in voice settings on Gemini/Qwen backends (was ElevenLabs-only) - Rename voice-start toggle label to 'Start voice session with summary' - Fix description: clarifies the choice is about session-open behaviour (summary vs greeting), not ongoing narration via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- .gitignore | 1 + shared/src/voice.ts | 7 +++++-- web/src/lib/locales/en.ts | 4 ++-- web/src/lib/locales/zh-CN.ts | 4 ++-- web/src/realtime/GeminiLiveVoiceSession.tsx | 2 +- web/src/realtime/QwenVoiceSession.tsx | 5 ++++- 6 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index dc3c3d28a1..5607ecc3f8 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ coverage/ # Claude local settings .claude/settings.local.json +AGENTS.local.md localdocs/ execplan/ diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 21a7da64b5..8e76aaa954 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -347,10 +347,13 @@ export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclara return VOICE_TOOLS.map(cloneVoiceToolDefinition) } -export function buildGeminiLiveConfig(): GeminiLiveConfig { +export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { + const systemInstruction = language === 'zh' + ? 
`${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` + : VOICE_SYSTEM_PROMPT return { model: GEMINI_LIVE_MODEL, - systemInstruction: VOICE_SYSTEM_PROMPT, + systemInstruction, tools: [ { functionDeclarations: buildGeminiLiveFunctionDeclarations() diff --git a/web/src/lib/locales/en.ts b/web/src/lib/locales/en.ts index eb97380fa2..a153430522 100644 --- a/web/src/lib/locales/en.ts +++ b/web/src/lib/locales/en.ts @@ -418,8 +418,8 @@ export default { 'settings.voice.autoDetect': 'Auto-detect', 'settings.voice.voice': 'Voice', 'settings.voice.voiceDefault': 'Default', - 'settings.voice.proactive': 'Proactive voice', - 'settings.voice.proactive.description': 'When on, the assistant narrates agent activity and speaks unprompted when the agent finishes a task. When off, it waits for you to speak first.', + 'settings.voice.proactive': 'Start voice session with summary', + 'settings.voice.proactive.description': 'When on, starting a voice session opens with a spoken summary of current agent activity. 
When off, the assistant greets you and waits for you to speak.', 'settings.about.title': 'About', 'settings.about.website': 'Website', 'settings.about.appVersion': 'App Version', diff --git a/web/src/lib/locales/zh-CN.ts b/web/src/lib/locales/zh-CN.ts index 20e63696ac..dc9049674e 100644 --- a/web/src/lib/locales/zh-CN.ts +++ b/web/src/lib/locales/zh-CN.ts @@ -420,8 +420,8 @@ export default { 'settings.voice.autoDetect': '自动检测', 'settings.voice.voice': '声音', 'settings.voice.voiceDefault': '默认', - 'settings.voice.proactive': '主动语音', - 'settings.voice.proactive.description': '开启后,助手会主动播报代理活动,并在任务完成时主动发言。关闭后,助手等待您先开口。', + 'settings.voice.proactive': '以摘要开始语音会话', + 'settings.voice.proactive.description': '开启后,启动语音会话时将朗读当前代理活动的摘要。关闭后,助手向您打招呼并等待您先开口。', 'settings.about.title': '关于', 'settings.about.website': '官方网站', 'settings.about.appVersion': '应用版本', diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 6436ca1264..f6b851881a 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -124,7 +124,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { ws.onopen = () => { if (DEBUG) console.log('[GeminiLive] WebSocket connected, sending setup') - const liveConfig = buildGeminiLiveConfig() + const liveConfig = buildGeminiLiveConfig(config.language) const setupMessage = { setup: { model: `models/${liveConfig.model}`, diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 1fa83fd40d..f72458e809 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -9,6 +9,7 @@ import { QWEN_REALTIME_MODEL, QWEN_REALTIME_VOICE, VOICE_SYSTEM_PROMPT, + VOICE_CHINESE_LANGUAGE_BLOCK, VOICE_TOOL_DEFINITIONS } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' @@ -157,7 +158,9 @@ class QwenVoiceSessionImpl implements VoiceSession { })) // Send 
session.update with full configuration - const basePrompt = VOICE_SYSTEM_PROMPT + const basePrompt = config.language === 'zh' + ? `${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` + : VOICE_SYSTEM_PROMPT const instructions = config.initialContext ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}` : basePrompt From d68f97dd843cc2cf0bf458ff273b54b900f050cc Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 09:32:25 +0100 Subject: [PATCH 10/34] fix(voice): send greeting trigger in reactive mode for Gemini Gemini Live has no built-in first-message like ElevenLabs agents do; without an explicit turnComplete:true it sits silently. In reactive mode (default, toggle off) now sends a greeting instruction after any silent context feed so Gemini introduces itself and invites the user to speak. Proactive mode is unchanged: the context summary is the opening speech. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index f6b851881a..0961468fc1 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -181,13 +181,18 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { // Start audio capture startAudioCapture(state.playbackContext!) - // Send initial context if available (no clientContent greeting — it breaks tool calls) - // In reactive mode (default) send silently so Gemini doesn't narrate on connect. + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + if (config.initialContext) { - const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + // Proactive: speak the summary. Reactive: feed context silently. 
sendClientContent(`[Context] ${config.initialContext}`, proactive) } + if (!proactive) { + // Gemini won't self-start; send a greeting trigger so it introduces itself. + sendClientContent('[Introduce yourself briefly and invite the user to speak.]', true) + } + resolve() return } From f67296cf237ba2638615809966786746ebbd462c Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 09:36:10 +0100 Subject: [PATCH 11/34] fix(voice): suppress Gemini self-identification and context leak in greeting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - VOICE_SYSTEM_PROMPT: explicit instruction never to call itself Gemini, Google, or any underlying model/provider name — always HAPI - Greeting trigger text: instruct to greet as HAPI only, suppress model name and any reference to context/recent activity in the opening line via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- shared/src/voice.ts | 2 ++ web/src/realtime/GeminiLiveVoiceSession.tsx | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 8e76aaa954..8d30034a2c 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -16,6 +16,8 @@ You MUST call the messageCodingAgent tool for ANY request related to coding, fil You are Hapi Voice Assistant. You bridge voice communication between users and their AI coding agents in the Hapi ecosystem. +IMPORTANT: Never refer to yourself as Gemini, Google, or any underlying model or provider name. You are HAPI — always. + You are friendly, proactive, and highly intelligent with a world-class engineering background. Your approach is warm, witty, and relaxed, balancing professionalism with an approachable vibe. 
# Environment Overview diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 0961468fc1..458927d845 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -190,7 +190,8 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { if (!proactive) { // Gemini won't self-start; send a greeting trigger so it introduces itself. - sendClientContent('[Introduce yourself briefly and invite the user to speak.]', true) + // Explicitly suppress model name and context leak. + sendClientContent('[Greet the user as HAPI. Say a brief hello and invite them to speak. Do not mention Gemini or any model name. Do not reference any context or recent activity.]', true) } resolve() From 1a10d866335d572fd36610ae904f92febc9f906e Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 10:07:19 +0100 Subject: [PATCH 12/34] =?UTF-8?q?fix(voice):=20address=20code=20review=20f?= =?UTF-8?q?indings=20=E2=80=94=20error=20handling,=20proxy,=20audio?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini + Qwen client: - onerror now sets setupDone/sessionReady and nulls state.ws before calling reject(), so the stale-close guard trips in onclose and prevents a duplicate statusCallback('error') on WS failure Gemini client: - Proactive mode with no initialContext now falls through to the greeting trigger instead of sitting silently - Remove unused handleBargeIn callback (dead code) Qwen client: - Add input_audio_sample_rate: 16000 to session.update so PCM rate is declared explicitly rather than relying on DashScope's default Hub proxy: - Remove no-op ternary in Gemini flush loop and message handler (typeof x === 'string' ? 
x : x); use upstream.send(msg) directly - Qwen onerror now calls upstreamMap.delete() before closing client, eliminating the stale map entry window - Align Qwen hub fallback model string with QWEN_REALTIME_MODEL constant ('qwen3-omni-flash-realtime') via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/server.ts | 9 ++++---- web/src/realtime/GeminiLiveVoiceSession.tsx | 25 +++++++++------------ web/src/realtime/QwenVoiceSession.tsx | 3 +++ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 1361c4f8d7..6d638c2d3a 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -57,7 +57,7 @@ function createGeminiProxyWebSocketHandler() { upstream.onopen = () => { // Flush any messages queued while upstream was connecting (e.g. setup frame) for (const queued of pending.splice(0)) { - upstream.send(typeof queued === 'string' ? queued : queued) + upstream.send(queued) } pendingMap.delete(clientWs) } @@ -81,7 +81,7 @@ function createGeminiProxyWebSocketHandler() { message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { const upstream = upstreamMap.get(clientWs) if (upstream?.readyState === WebSocket.OPEN) { - upstream.send(typeof message === 'string' ? 
message : message) + upstream.send(message) } else if (upstream?.readyState === WebSocket.CONNECTING) { // Queue messages until upstream opens (critical for the setup frame) const pending = pendingMap.get(clientWs) @@ -126,6 +126,7 @@ function createQwenProxyWebSocketHandler() { } catch { /* client gone */ } } upstream.onerror = () => { + upstreamMap.delete(clientWs) try { clientWs.close(1011, 'Upstream error') } catch { /* */ } } upstream.onclose = (event) => { @@ -136,7 +137,7 @@ function createQwenProxyWebSocketHandler() { message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { const upstream = upstreamMap.get(clientWs) if (upstream?.readyState === WebSocket.OPEN) { - upstream.send(typeof message === 'string' ? message : message) + upstream.send(message) } }, close(clientWs: ServerWebSocket, code: number, reason: string) { @@ -434,7 +435,7 @@ export async function startWebServer(options: { // Qwen Realtime WebSocket proxy if (url.pathname === '/api/voice/qwen-ws') { const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY - const model = url.searchParams.get('model') || 'qwen3.5-omni-plus-realtime' + const model = url.searchParams.get('model') || 'qwen3-omni-flash-realtime' if (!apiKey) { return new Response('DashScope API key not configured', { status: 400 }) } diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 458927d845..c96d95b60a 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -183,14 +183,15 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' - if (config.initialContext) { - // Proactive: speak the summary. Reactive: feed context silently. - sendClientContent(`[Context] ${config.initialContext}`, proactive) - } - - if (!proactive) { - // Gemini won't self-start; send a greeting trigger so it introduces itself. 
- // Explicitly suppress model name and context leak. + if (proactive && config.initialContext) { + // Proactive with context: speak the summary immediately. + sendClientContent(`[Context] ${config.initialContext}`, true) + } else { + // Reactive, or proactive with no context: feed context silently if + // available, then trigger a greeting so Gemini doesn't sit silent. + if (config.initialContext) { + sendClientContent(`[Context] ${config.initialContext}`, false) + } sendClientContent('[Greet the user as HAPI. Say a brief hello and invite them to speak. Do not mention Gemini or any model name. Do not reference any context or recent activity.]', true) } @@ -258,6 +259,8 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { ws.onerror = (event) => { console.error('[GeminiLive] WebSocket error:', event) if (!setupDone) { + setupDone = true + state.ws = null // make stale-close guard trip in onclose state.statusCallback?.('error', 'WebSocket connection failed') reject(new Error('WebSocket connection failed')) } @@ -403,12 +406,6 @@ export function GeminiLiveVoiceSession({ } }, [micMuted]) - // Handle barge-in: clear audio queue when user starts speaking - const handleBargeIn = useCallback(() => { - if (state.player?.isPlaying()) { - state.player.clearQueue() - } - }, []) // Cleanup on unmount useEffect(() => { diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index f72458e809..3651b4ed9b 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -170,6 +170,7 @@ class QwenVoiceSessionImpl implements VoiceSession { modalities: ['text', 'audio'], voice: QWEN_REALTIME_VOICE, input_audio_format: 'pcm', + input_audio_sample_rate: 16000, output_audio_format: 'pcm', instructions, temperature: 0.7, @@ -279,6 +280,8 @@ class QwenVoiceSessionImpl implements VoiceSession { ws.onerror = (event) => { console.error('[Qwen] WebSocket error:', event) if (!sessionReady) { + sessionReady = true + 
state.ws = null // make stale-close guard trip in onclose state.statusCallback?.('error', 'WebSocket connection failed') reject(new Error('WebSocket connection failed')) } From 580f82c157553b78e171ea885a0674736b664e23 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 11:20:29 +0100 Subject: [PATCH 13/34] fix(voice): trailing-slash WS URL, Qwen session.update schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hub/voice.ts: - Replace string-concat WS URL construction with buildVoiceWsUrl() which uses URL API to set protocol/pathname cleanly — fixes double-slash when HAPI_PUBLIC_URL has a trailing slash (would silently skip the proxy route) QwenVoiceSession.tsx: - Wrap tool definitions in {type:'function', function:{...}} as required by Qwen-Omni realtime schema — previous flat shape caused session.update rejection before audio capture could start - Use pcm16/pcm24 audio formats matching DashScope spec; remove input_audio_sample_rate (encoded in format name) via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/routes/voice.ts | 13 +++++++++++-- web/src/realtime/QwenVoiceSession.tsx | 13 +++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index e8920dbb36..3ccc481551 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -9,6 +9,15 @@ import { } from '@hapi/protocol/voice' import type { VoiceBackendType } from '@hapi/protocol/voice' +function buildVoiceWsUrl(base: string, pathname: string): string { + const url = new URL(base) + url.protocol = url.protocol === 'https:' ? 
'wss:' : 'ws:' + url.pathname = pathname + url.search = '' + url.hash = '' + return url.toString() +} + const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), customApiKey: z.string().optional(), @@ -196,7 +205,7 @@ export function createVoiceRoutes(): Hono { // not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy). const requestOrigin = new URL(c.req.url).origin const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin - const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws' + const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/gemini-ws') return c.json({ allowed: true, @@ -219,7 +228,7 @@ export function createVoiceRoutes(): Hono { const requestOrigin = new URL(c.req.url).origin const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin - const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws' + const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/qwen-ws') return c.json({ allowed: true, diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 3651b4ed9b..5c46e342d4 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -152,9 +152,11 @@ class QwenVoiceSessionImpl implements VoiceSession { // Build tools config const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ type: 'function' as const, - name: td.name, - description: td.description, - parameters: td.parameters + function: { + name: td.name, + description: td.description, + parameters: td.parameters + } })) // Send session.update with full configuration @@ -169,9 +171,8 @@ class QwenVoiceSessionImpl implements VoiceSession { session: { modalities: ['text', 'audio'], voice: QWEN_REALTIME_VOICE, - input_audio_format: 'pcm', - input_audio_sample_rate: 16000, - output_audio_format: 'pcm', + input_audio_format: 'pcm16', + output_audio_format: 'pcm24', instructions, temperature: 0.7, turn_detection: { From 
6e8eba62eaf2bf0f22a4019a49c884aab3692646 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 11:28:50 +0100 Subject: [PATCH 14/34] fix(voice): await audio capture before setMuted; sanitize upstream close codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GeminiLiveVoiceSession + QwenVoiceSession: - startAudioCapture() is now async and awaits recorder.start() before calling setMuted() — previously setMuted ran before getUserMedia resolved so a session restarted while muted would open the mic anyway - statusCallback('connected') now fires after audio is ready - setMuted() called unconditionally (not just when true) to correctly apply saved state in either direction hub/src/web/server.ts: - Both Gemini and Qwen close() handlers now pass the client code through toClientCloseCode() before forwarding to upstream — prevents reserved codes (e.g. 1006) from causing WebSocket.close() to throw and leave the upstream session open until provider timeout - Reason string capped at 123 bytes (WebSocket protocol limit) via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/server.ts | 4 ++-- web/src/realtime/GeminiLiveVoiceSession.tsx | 16 +++++++--------- web/src/realtime/QwenVoiceSession.tsx | 12 +++++------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 6d638c2d3a..56c628970d 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -92,7 +92,7 @@ function createGeminiProxyWebSocketHandler() { const upstream = upstreamMap.get(clientWs) pendingMap.delete(clientWs) if (upstream) { - try { upstream.close(code, reason) } catch { /* */ } + try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } upstreamMap.delete(clientWs) } } @@ -143,7 +143,7 @@ function createQwenProxyWebSocketHandler() { close(clientWs: ServerWebSocket, code: number, reason: 
string) { const upstream = upstreamMap.get(clientWs) if (upstream) { - try { upstream.close(code, reason) } catch { /* */ } + try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } upstreamMap.delete(clientWs) } } diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index c96d95b60a..f9aa4a8541 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -176,10 +176,10 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { if (data.setupComplete && !setupDone) { setupDone = true if (DEBUG) console.log('[GeminiLive] Setup complete') - state.statusCallback?.('connected') - // Start audio capture - startAudioCapture(state.playbackContext!) + // Await audio capture so setMuted runs after getUserMedia resolves + await startAudioCapture(state.playbackContext!) + state.statusCallback?.('connected') const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' @@ -323,11 +323,11 @@ function sendAudioChunk(base64Pcm: string): void { })) } -function startAudioCapture(playbackContext: AudioContext): void { +async function startAudioCapture(playbackContext: AudioContext): Promise { state.player = new GeminiAudioPlayer(playbackContext) state.recorder = new GeminiAudioRecorder() - state.recorder.start( + await state.recorder.start( (pcm16Chunk) => sendAudioChunk(pcm16Chunk), (error) => { console.error('[GeminiLive] Audio capture error:', error) @@ -335,10 +335,8 @@ function startAudioCapture(playbackContext: AudioContext): void { } ) - // Apply initial mute state — the React effect may have run before the recorder existed - if (state.micMuted) { - state.recorder.setMuted(true) - } + // Apply mute state after recorder has a stream — safe to call either way + state.recorder.setMuted(state.micMuted) } // --- React component --- diff --git a/web/src/realtime/QwenVoiceSession.tsx 
b/web/src/realtime/QwenVoiceSession.tsx index 5c46e342d4..0d8bda2295 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -192,8 +192,8 @@ class QwenVoiceSessionImpl implements VoiceSession { if (eventType === 'session.updated') { sessionReady = true if (DEBUG) console.log('[Qwen] Session configured') + await startAudioCapture(state.playbackContext!) state.statusCallback?.('connected') - startAudioCapture(state.playbackContext!) resolve() return } @@ -334,11 +334,11 @@ class QwenVoiceSessionImpl implements VoiceSession { } } -function startAudioCapture(playbackContext: AudioContext): void { +async function startAudioCapture(playbackContext: AudioContext): Promise { state.player = new GeminiAudioPlayer(playbackContext) state.recorder = new GeminiAudioRecorder() - state.recorder.start( + await state.recorder.start( (base64Pcm) => { sendEvent('input_audio_buffer.append', { audio: base64Pcm }) }, @@ -348,10 +348,8 @@ function startAudioCapture(playbackContext: AudioContext): void { } ) - // Apply initial mute state — the React effect may have run before the recorder existed - if (state.micMuted) { - state.recorder.setMuted(true) - } + // Apply mute state after recorder has a stream — safe to call either way + state.recorder.setMuted(state.micMuted) } // --- React component --- From 640359647c3e98cb71eb041aeb7369d853c46991 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 11:38:33 +0100 Subject: [PATCH 15/34] fix(voice): wrap startAudioCapture in try/catch to propagate mic errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An unhandled rejection inside the async onmessage callback does not propagate to the outer startSession Promise — the UI hangs on 'connecting' and the provider socket stays partially open. Wrapping the await in try/catch calls cleanup()/statusCallback('error')/reject() so failures surface correctly. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 13 +++++++++++-- web/src/realtime/QwenVoiceSession.tsx | 10 +++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index f9aa4a8541..75b4615d75 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -177,8 +177,17 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { setupDone = true if (DEBUG) console.log('[GeminiLive] Setup complete') - // Await audio capture so setMuted runs after getUserMedia resolves - await startAudioCapture(state.playbackContext!) + // Await audio capture so setMuted runs after getUserMedia resolves. + // Wrap so a mic failure rejects the outer startSession promise. + try { + await startAudioCapture(state.playbackContext!) + } catch (error) { + const message = error instanceof Error ? error.message : 'Microphone error' + cleanup() + state.statusCallback?.('error', message) + reject(error instanceof Error ? error : new Error(message)) + return + } state.statusCallback?.('connected') const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 0d8bda2295..54a7f03734 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -192,7 +192,15 @@ class QwenVoiceSessionImpl implements VoiceSession { if (eventType === 'session.updated') { sessionReady = true if (DEBUG) console.log('[Qwen] Session configured') - await startAudioCapture(state.playbackContext!) + try { + await startAudioCapture(state.playbackContext!) + } catch (error) { + const message = error instanceof Error ? error.message : 'Microphone error' + cleanup() + state.statusCallback?.('error', message) + reject(error instanceof Error ? 
error : new Error(message)) + return + } state.statusCallback?.('connected') resolve() return From 3a2d8418d6b67fccc3b7895785654cf2b94a8ada Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 20:21:35 +0100 Subject: [PATCH 16/34] fix(voice): propagate backend discovery failure instead of silently falling back to ElevenLabs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fetchVoiceBackend no longer catches errors and defaults to 'elevenlabs' — any network or server failure now throws so VoiceBackendSession can surface it via onStatusChange('error', ...) rather than silently mounting the wrong backend. VoiceBackendSession also resets backend state to null when api changes, so a stale ElevenLabs registration from a prior discovery cannot persist into a new session. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/api/voice.ts | 15 ++++++--------- web/src/realtime/VoiceBackendSession.tsx | 6 ++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index c6df29e16c..8fc57c806d 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -215,17 +215,14 @@ export interface GeminiTokenResponse { /** * Discover which voice backend the hub is configured to use. + * Throws on network or server error — callers must handle failures explicitly. */ export async function fetchVoiceBackend(api: ApiClient): Promise { - try { - const result = await api.fetchVoiceBackend() - const backend = result.backend === 'gemini-live' ? 'gemini-live' - : result.backend === 'qwen-realtime' ? 'qwen-realtime' - : 'elevenlabs' - return { backend } as VoiceBackendResponse - } catch { - return { backend: 'elevenlabs' } - } + const result = await api.fetchVoiceBackend() + const backend = result.backend === 'gemini-live' ? 'gemini-live' + : result.backend === 'qwen-realtime' ? 
'qwen-realtime' + : 'elevenlabs' + return { backend } as VoiceBackendResponse } /** diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx index b990d07bd7..a30cf488fc 100644 --- a/web/src/realtime/VoiceBackendSession.tsx +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -28,8 +28,14 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { useEffect(() => { let cancelled = false + setBackend(null) fetchVoiceBackend(props.api).then((resp) => { if (!cancelled) setBackend(resp.backend) + }).catch((err: unknown) => { + if (!cancelled) { + const msg = err instanceof Error ? err.message : 'Could not detect voice backend' + props.onStatusChange?.('error', msg) + } }) return () => { cancelled = true From 533f4e281a48a2e636d864f727730706fc7933ed Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 20:24:18 +0100 Subject: [PATCH 17/34] fix(voice): throw on unrecognised backend value instead of silently falling back to ElevenLabs Unknown backend strings (future values, typos) now throw rather than defaulting to elevenlabs, closing the narrow remaining form of the original misrouting bug. Also removes the unnecessary `as VoiceBackendResponse` cast. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/api/voice.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index 8fc57c806d..df6828127c 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -215,14 +215,15 @@ export interface GeminiTokenResponse { /** * Discover which voice backend the hub is configured to use. - * Throws on network or server error — callers must handle failures explicitly. + * Throws on network/server error or unrecognised backend value — callers must handle failures explicitly. 
*/ export async function fetchVoiceBackend(api: ApiClient): Promise { const result = await api.fetchVoiceBackend() - const backend = result.backend === 'gemini-live' ? 'gemini-live' - : result.backend === 'qwen-realtime' ? 'qwen-realtime' - : 'elevenlabs' - return { backend } as VoiceBackendResponse + const { backend } = result + if (backend === 'elevenlabs' || backend === 'gemini-live' || backend === 'qwen-realtime') { + return { backend } + } + throw new Error(`Unrecognised voice backend: ${backend}`) } /** From ec5a1700981a5fc1b6ebb945ca1132995a54e9ad Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 20:51:30 +0100 Subject: [PATCH 18/34] fix(voice): add Qwen greeting/proactive trigger; fix socket buffer for base64 uploads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qwen session.updated handler now sends the same proactive summary or greeting trigger that Gemini does — previously it started silently in both proactive and reactive modes. maxHttpBufferSize raised to 68 MiB to account for base64 expansion: 50 MiB decoded files become ~66.7 MiB as base64 JSON, so the previous 55 MiB ceiling would disconnect uploads above ~41 MiB before they reached the CLI. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/socket/server.ts | 2 +- web/src/realtime/QwenVoiceSession.tsx | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/hub/src/socket/server.ts b/hub/src/socket/server.ts index 6db62990f7..f464fe62a9 100644 --- a/hub/src/socket/server.ts +++ b/hub/src/socket/server.ts @@ -67,7 +67,7 @@ export function createSocketServer(deps: SocketServerDeps): { const engine = new Engine({ path: '/socket.io/', cors: corsOptions, - maxHttpBufferSize: 55 * 1024 * 1024, // 55MB to match upload limit + maxHttpBufferSize: 68 * 1024 * 1024, // 50 MiB decoded uploads are ~66.7 MiB as base64 JSON allowRequest: async (req) => { const origin = req.headers.get('origin') if (!origin || allowAllOrigins || corsOrigins.includes(origin)) { diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 54a7f03734..55407a96a1 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -202,6 +202,17 @@ class QwenVoiceSessionImpl implements VoiceSession { return } state.statusCallback?.('connected') + + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + if (proactive && config.initialContext) { + this.sendTextMessage(`[Context] ${config.initialContext}`) + } else { + if (config.initialContext) { + this.sendContextualUpdate(config.initialContext) + } + this.sendTextMessage('[Greet the user as HAPI. Say a brief hello and invite them to speak. Do not mention Qwen or any model name. 
Do not reference any context or recent activity.]') + } + resolve() return } From f107aa36112a02bb5bfc9043e0559f91df9b47f0 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 21:33:09 +0100 Subject: [PATCH 19/34] fix(voice): replace unsupported conversation.item.create with session.update for Qwen text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qwen's realtime API only supports conversation.item.create for function_call_output. Sending it with type:'message' for greetings/context was invalid and could fail before the user spoke. sendTextMessage and sendContextualUpdate now update session instructions via session.update (accumulating context into the system prompt) and trigger response.create only when a spoken reply is needed — matching Qwen's supported client event surface. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/QwenVoiceSession.tsx | 84 ++++++++++++++++----------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 55407a96a1..fdb1325468 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -77,13 +77,40 @@ function sendEvent(type: string, payload?: Record): void { })) } +interface QwenSessionConfig { + modalities: ['text', 'audio'] + voice: string + input_audio_format: 'pcm16' + output_audio_format: 'pcm24' + instructions: string + temperature: number + turn_detection: { + type: 'server_vad' + threshold: number + silence_duration_ms: number + prefix_padding_ms: number + } + tools: Array + tool_choice: 'auto' +} + class QwenVoiceSessionImpl implements VoiceSession { private api: ApiClient + private currentSessionConfig: QwenSessionConfig | null = null constructor(api: ApiClient) { this.api = api } + private updateInstructions(update: string): void { + if (!this.currentSessionConfig) return + 
this.currentSessionConfig = { + ...this.currentSessionConfig, + instructions: `${this.currentSessionConfig.instructions}\n\n${update}` + } + sendEvent('session.update', { session: this.currentSessionConfig }) + } + async startSession(config: VoiceSessionConfig): Promise { cleanup() state.statusCallback?.('connecting') @@ -167,24 +194,23 @@ class QwenVoiceSessionImpl implements VoiceSession { ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}` : basePrompt - sendEvent('session.update', { - session: { - modalities: ['text', 'audio'], - voice: QWEN_REALTIME_VOICE, - input_audio_format: 'pcm16', - output_audio_format: 'pcm24', - instructions, - temperature: 0.7, - turn_detection: { - type: 'server_vad', - threshold: 0.5, - silence_duration_ms: 800, - prefix_padding_ms: 300 - }, - tools, - tool_choice: 'auto' - } - }) + this.currentSessionConfig = { + modalities: ['text', 'audio'], + voice: QWEN_REALTIME_VOICE, + input_audio_format: 'pcm16', + output_audio_format: 'pcm24', + instructions, + temperature: 0.7, + turn_detection: { + type: 'server_vad', + threshold: 0.5, + silence_duration_ms: 800, + prefix_padding_ms: 300 + }, + tools, + tool_choice: 'auto' + } + sendEvent('session.update', { session: this.currentSessionConfig }) return } @@ -324,32 +350,22 @@ class QwenVoiceSessionImpl implements VoiceSession { } async endSession(): Promise { + this.currentSessionConfig = null cleanup() resetRealtimeSessionState() state.statusCallback?.('disconnected') } sendTextMessage(message: string): void { - // Send text as a user message via conversation.item.create - sendEvent('conversation.item.create', { - item: { - type: 'message', - role: 'user', - content: [{ type: 'input_text', text: message }] - } - }) + // Qwen only supports conversation.item.create for function_call_output. + // Inject text as an instruction update then trigger a response. 
+ this.updateInstructions(message) sendEvent('response.create') } sendContextualUpdate(update: string): void { - // Send context as a system-like user message - sendEvent('conversation.item.create', { - item: { - type: 'message', - role: 'user', - content: [{ type: 'input_text', text: `[System Context Update] ${update}` }] - } - }) + // Append context silently — no response.create, so model doesn't speak yet. + this.updateInstructions(`[System Context Update] ${update}`) } } From d9e72ad6606842883e605f2207080bbcbd83d983 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 26 May 2026 21:35:22 +0100 Subject: [PATCH 20/34] fix(voice): guard session.updated re-entry and reset config on session start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit session.updated now returns early after the first ack — subsequent session.update calls (instruction appends) also echo session.updated but must not re-trigger audio capture or the greeting path. currentSessionConfig is now reset to null at the top of startSession so a stale config from a failed previous session cannot leak into the new one. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/QwenVoiceSession.tsx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index fdb1325468..6bcab10b51 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -112,6 +112,7 @@ class QwenVoiceSessionImpl implements VoiceSession { } async startSession(config: VoiceSessionConfig): Promise { + this.currentSessionConfig = null cleanup() state.statusCallback?.('connecting') @@ -214,8 +215,11 @@ class QwenVoiceSessionImpl implements VoiceSession { return } - // Session updated - ready to go + // Session updated - only act on the first one (initial config ack). 
+ // Subsequent session.update calls (for instruction appends) also + // echo session.updated — ignore those after setup is complete. if (eventType === 'session.updated') { + if (sessionReady) return sessionReady = true if (DEBUG) console.log('[Qwen] Session configured') try { @@ -359,6 +363,8 @@ class QwenVoiceSessionImpl implements VoiceSession { sendTextMessage(message: string): void { // Qwen only supports conversation.item.create for function_call_output. // Inject text as an instruction update then trigger a response. + // response.create without a prior conversation.item.create is valid — + // it generates from the current session context (updated instructions). this.updateInstructions(message) sendEvent('response.create') } From 4c24d1120a5822e8a8f062a1bf869563e2e297ab Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Wed, 27 May 2026 02:23:33 +0100 Subject: [PATCH 21/34] fix(voice): assert wsUrl presence for Gemini proxy connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this guard, a missing wsUrl in the hub token response would silently attempt to connect directly to Google with "proxied" as the API key — producing a confusing auth failure instead of a clear error. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 75b4615d75..4da200a0c5 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -92,6 +92,11 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } state.apiKey = tokenResp.apiKey state.wsBaseUrl = tokenResp.wsUrl || null + if (!state.wsBaseUrl) { + const msg = 'Hub must provide wsUrl for Gemini connections — direct key connection is not supported' + state.statusCallback?.('error', msg) + throw new Error(msg) + } // Request microphone console.log('[GeminiLive] Requesting microphone...') From c84fbd4826a9c421fec0fb379e7c2d04f4964dcf Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Wed, 27 May 2026 06:22:55 +0100 Subject: [PATCH 22/34] fix(voice): correct Qwen audio formats and default voice DashScope realtime API accepts only 'pcm' for both input and output audio formats. The pcm16/pcm24 values caused session.update rejection before audio capture could start, leaving the Qwen backend unusable. Also updates the default voice from Mia (not in the qwen3-omni-flash- realtime voice list) to Cherry, which is documented as supported. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- shared/src/voice.ts | 2 +- web/src/realtime/QwenVoiceSession.tsx | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 8d30034a2c..ec6715288b 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -290,7 +290,7 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' -export const QWEN_REALTIME_VOICE = 'Mia' +export const QWEN_REALTIME_VOICE = 'Cherry' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 6bcab10b51..9960991fe9 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -80,8 +80,8 @@ function sendEvent(type: string, payload?: Record): void { interface QwenSessionConfig { modalities: ['text', 'audio'] voice: string - input_audio_format: 'pcm16' - output_audio_format: 'pcm24' + input_audio_format: 'pcm' + output_audio_format: 'pcm' instructions: string temperature: number turn_detection: { @@ -198,8 +198,8 @@ class QwenVoiceSessionImpl implements VoiceSession { this.currentSessionConfig = { modalities: ['text', 'audio'], voice: QWEN_REALTIME_VOICE, - input_audio_format: 'pcm16', - output_audio_format: 'pcm24', + input_audio_format: 'pcm', + output_audio_format: 'pcm', instructions, temperature: 0.7, turn_detection: { From f273ecad9764b3b9a76996b9f22f9c2d1cc749fe Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Wed, 27 May 2026 11:16:20 +0100 Subject: [PATCH 23/34] fix(voice): close AudioContext on failed voice session start Failed token fetch, microphone denial, or WebSocket error during setup left state.playbackContext open. 
Each failure path now calls cleanup() before throwing/rejecting, preventing AudioContext leaks on mobile browsers with hard limits on concurrent contexts. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 5 ++++- web/src/realtime/QwenVoiceSession.tsx | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 4da200a0c5..b7ff765945 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -88,6 +88,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { const msg = tokenResp.error ?? 'Gemini API key not available' console.error('[GeminiLive] Token failed:', msg) state.statusCallback?.('error', msg) + cleanup() throw new Error(msg) } state.apiKey = tokenResp.apiKey @@ -95,6 +96,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { if (!state.wsBaseUrl) { const msg = 'Hub must provide wsUrl for Gemini connections — direct key connection is not supported' state.statusCallback?.('error', msg) + cleanup() throw new Error(msg) } @@ -107,6 +109,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } catch (error) { console.error('[GeminiLive] Microphone denied:', error) state.statusCallback?.('error', 'Microphone permission denied') + cleanup() throw error } finally { permissionStream?.getTracks().forEach((t) => t.stop()) @@ -274,7 +277,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { console.error('[GeminiLive] WebSocket error:', event) if (!setupDone) { setupDone = true - state.ws = null // make stale-close guard trip in onclose + cleanup() state.statusCallback?.('error', 'WebSocket connection failed') reject(new Error('WebSocket connection failed')) } diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 9960991fe9..ace659c6d7 100644 --- a/web/src/realtime/QwenVoiceSession.tsx 
+++ b/web/src/realtime/QwenVoiceSession.tsx @@ -127,6 +127,7 @@ class QwenVoiceSessionImpl implements VoiceSession { if (!tokenResp.allowed) { const msg = tokenResp.error ?? 'DashScope API key not available' state.statusCallback?.('error', msg) + cleanup() throw new Error(msg) } state.apiKey = null // key stays server-side @@ -138,6 +139,7 @@ class QwenVoiceSessionImpl implements VoiceSession { permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) } catch (error) { state.statusCallback?.('error', 'Microphone permission denied') + cleanup() throw error } finally { permissionStream?.getTracks().forEach((t) => t.stop()) @@ -331,7 +333,7 @@ class QwenVoiceSessionImpl implements VoiceSession { console.error('[Qwen] WebSocket error:', event) if (!sessionReady) { sessionReady = true - state.ws = null // make stale-close guard trip in onclose + cleanup() state.statusCallback?.('error', 'WebSocket connection failed') reject(new Error('WebSocket connection failed')) } From f991093b7c7cb0b0bbb41128e25b1bff18978ee8 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Wed, 27 May 2026 15:54:23 +0100 Subject: [PATCH 24/34] chore: restore non-voice files to upstream/main state Reverts changes to files that shouldn't differ from upstream: - .gitignore: remove fork-only AGENTS.local.md entry - web/src/App.tsx: restore dual-subscription SSE pattern (scope-aware) - web/src/hooks/useSSE.ts: restore SSEScope/scope parameter - web/src/hooks/useSSE.test.ts: restore (was accidentally deleted) - web/src/lib/appSseSubscriptions.ts: restore (was accidentally deleted) - web/src/lib/appSseSubscriptions.test.ts: restore (was accidentally deleted) - hub/src/sync/syncEngine.ts: restore (off-topic change) --- .gitignore | 1 - web/src/App.tsx | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 5607ecc3f8..dc3c3d28a1 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,6 @@ coverage/ # 
Claude local settings .claude/settings.local.json -AGENTS.local.md localdocs/ execplan/ diff --git a/web/src/App.tsx b/web/src/App.tsx index af7c6599ec..fc23635acf 100644 --- a/web/src/App.tsx +++ b/web/src/App.tsx @@ -327,7 +327,7 @@ function AppInner() { useVisibilityReporter({ api, subscriptionId: globalSubscriptionId, - enabled: sseEnabled && !sessionEventSubscription + enabled: sseEnabled }) useVisibilityReporter({ From 7b3d9985f992d887fbeb2f2df30ef6c08150f9ed Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Thu, 28 May 2026 17:53:18 +0100 Subject: [PATCH 25/34] fix(voice): harden Gemini and Qwen WS proxies against client abuse Hub sends HAPI-owned Gemini setup on proxy connect and rejects client setup frames. Qwen proxy always uses QWEN_REALTIME_MODEL instead of a client query parameter. Shared buildGeminiLiveSetupMessage() keeps wire format in one place. Co-authored-by: Cursor --- hub/src/web/server.ts | 36 ++++++++++++++++---- shared/src/voice.gemini.test.ts | 21 ++++++++++++ shared/src/voice.ts | 29 ++++++++++++++++ web/src/realtime/GeminiLiveVoiceSession.tsx | 37 +++++---------------- web/src/realtime/QwenVoiceSession.tsx | 4 +-- 5 files changed, 89 insertions(+), 38 deletions(-) create mode 100644 shared/src/voice.gemini.test.ts diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 56c628970d..9d688bc878 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -6,6 +6,7 @@ import { existsSync } from 'node:fs' import { serveStatic } from 'hono/bun' import { getConfiguration } from '../configuration' import { PROTOCOL_VERSION } from '@hapi/protocol' +import { buildGeminiLiveSetupMessage, QWEN_REALTIME_MODEL } from '@hapi/protocol/voice' import type { SyncEngine } from '../sync/syncEngine' import { createAuthMiddleware, type WebAppEnv } from './middleware/auth' import { createAuthRoutes } from './routes/auth' @@ -38,6 +39,21 @@ function toClientCloseCode(code: number): number { : 1011 } 
+function decodeWsText(message: string | ArrayBuffer | Uint8Array): string { + if (typeof message === 'string') return message + const bytes = message instanceof Uint8Array ? message : new Uint8Array(message) + return new TextDecoder().decode(bytes) +} + +function isGeminiSetupFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const parsed = JSON.parse(decodeWsText(message)) as unknown + return parsed !== null && typeof parsed === 'object' && 'setup' in (parsed as object) + } catch { + return false + } +} + // Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions function createGeminiProxyWebSocketHandler() { const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' @@ -46,7 +62,7 @@ function createGeminiProxyWebSocketHandler() { return { open(clientWs: ServerWebSocket) { - const data = clientWs.data as { _geminiProxy: boolean; apiKey: string } + const data = clientWs.data as { _geminiProxy: boolean; apiKey: string; language?: string } const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` const pending: Array = [] pendingMap.set(clientWs, pending) @@ -55,9 +71,12 @@ function createGeminiProxyWebSocketHandler() { upstreamMap.set(clientWs, upstream) upstream.onopen = () => { - // Flush any messages queued while upstream was connecting (e.g. setup frame) + // Hub-owned setup only — never forward client setup (prevents generic Gemini proxy abuse). 
+ upstream.send(JSON.stringify(buildGeminiLiveSetupMessage(data.language))) for (const queued of pending.splice(0)) { - upstream.send(queued) + if (!isGeminiSetupFrame(queued)) { + upstream.send(queued) + } } pendingMap.delete(clientWs) } @@ -79,11 +98,14 @@ function createGeminiProxyWebSocketHandler() { } }, message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + if (isGeminiSetupFrame(message)) { + try { clientWs.close(1008, 'Client-provided Gemini setup is not allowed') } catch { /* */ } + return + } const upstream = upstreamMap.get(clientWs) if (upstream?.readyState === WebSocket.OPEN) { upstream.send(message) } else if (upstream?.readyState === WebSocket.CONNECTING) { - // Queue messages until upstream opens (critical for the setup frame) const pending = pendingMap.get(clientWs) if (pending) pending.push(message) } @@ -424,8 +446,10 @@ export async function startWebServer(options: { if (!apiKey) { return new Response('Gemini API key not configured', { status: 400 }) } + const languageParam = url.searchParams.get('language') + const language = languageParam === 'zh' ? 
'zh' : undefined const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { - data: { _geminiProxy: true, apiKey } + data: { _geminiProxy: true, apiKey, language } }) if (!upgraded) { return new Response('WebSocket upgrade failed', { status: 500 }) @@ -435,7 +459,7 @@ export async function startWebServer(options: { // Qwen Realtime WebSocket proxy if (url.pathname === '/api/voice/qwen-ws') { const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY - const model = url.searchParams.get('model') || 'qwen3-omni-flash-realtime' + const model = QWEN_REALTIME_MODEL if (!apiKey) { return new Response('DashScope API key not configured', { status: 400 }) } diff --git a/shared/src/voice.gemini.test.ts b/shared/src/voice.gemini.test.ts new file mode 100644 index 0000000000..5bed0ea961 --- /dev/null +++ b/shared/src/voice.gemini.test.ts @@ -0,0 +1,21 @@ +import { describe, expect, test } from 'bun:test' +import { buildGeminiLiveSetupMessage, GEMINI_LIVE_MODEL, GEMINI_LIVE_VOICE } from './voice' + +describe('buildGeminiLiveSetupMessage', () => { + test('locks model and voice to HAPI defaults', () => { + const msg = buildGeminiLiveSetupMessage() + expect(msg.setup.model).toBe(`models/${GEMINI_LIVE_MODEL}`) + const speech = msg.setup.generationConfig as { + speechConfig?: { voiceConfig?: { prebuiltVoiceConfig?: { voiceName?: string } } } + } + expect(speech.speechConfig?.voiceConfig?.prebuiltVoiceConfig?.voiceName).toBe(GEMINI_LIVE_VOICE) + }) + + test('appends Chinese block when language is zh', () => { + const en = buildGeminiLiveSetupMessage() + const zh = buildGeminiLiveSetupMessage('zh') + const enText = (en.setup.systemInstruction as { parts: Array<{ text: string }> }).parts[0].text + const zhText = (zh.setup.systemInstruction as { parts: Array<{ text: string }> }).parts[0].text + expect(zhText.length).toBeGreaterThan(enText.length) + }) +}) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 
ec6715288b..956db6e5d0 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -295,6 +295,7 @@ export const QWEN_REALTIME_VOICE = 'Cherry' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' +export const GEMINI_LIVE_VOICE = 'Aoede' export interface VoiceToolDefinition { name: string @@ -364,3 +365,31 @@ export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { responseModalities: ['AUDIO'] } } + +/** Wire-format setup frame for Gemini Live BidiGenerateContent (hub proxy + web client). */ +export function buildGeminiLiveSetupMessage(language?: string): { setup: Record } { + const liveConfig = buildGeminiLiveConfig(language) + return { + setup: { + model: `models/${liveConfig.model}`, + generationConfig: { + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { voiceName: GEMINI_LIVE_VOICE } + } + } + }, + systemInstruction: { + parts: [{ text: liveConfig.systemInstruction }] + }, + tools: liveConfig.tools.map((t) => ({ + functionDeclarations: t.functionDeclarations.map((fd) => ({ + name: fd.name, + description: fd.description, + parameters: fd.parameters + })) + })) + } + } +} diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index b7ff765945..79025c9f27 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -5,7 +5,7 @@ import { fetchGeminiToken } from '@/api/voice' import { GeminiAudioRecorder } from './gemini/audioRecorder' import { GeminiAudioPlayer } from './gemini/audioPlayer' import { handleGeminiFunctionCalls } from './gemini/toolAdapter' -import { buildGeminiLiveConfig } from '@hapi/protocol/voice' +import { buildGeminiLiveSetupMessage } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' import type { ApiClient } from '@/api/client' import type { 
Session } from '@/types/api' @@ -119,8 +119,9 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE const isProxy = !!state.wsBaseUrl const authToken = this.api.getAuthToken() || '' + const languageParam = config.language === 'zh' ? '&language=zh' : '' const wsUrl = isProxy - ? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}` + ? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}${languageParam}` : `${wsBase}?key=${encodeURIComponent(state.apiKey)}` console.log('[GeminiLive] Connecting WebSocket to:', wsBase, isProxy ? '(proxied)' : '(direct)') const ws = new WebSocket(wsUrl) @@ -130,34 +131,12 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { let setupDone = false ws.onopen = () => { - if (DEBUG) console.log('[GeminiLive] WebSocket connected, sending setup') - - const liveConfig = buildGeminiLiveConfig(config.language) - const setupMessage = { - setup: { - model: `models/${liveConfig.model}`, - generationConfig: { - responseModalities: ['AUDIO'], - speechConfig: { - voiceConfig: { - prebuiltVoiceConfig: { voiceName: 'Aoede' } - } - } - }, - systemInstruction: { - parts: [{ text: liveConfig.systemInstruction }] - }, - tools: liveConfig.tools.map((t) => ({ - functionDeclarations: t.functionDeclarations.map((fd) => ({ - name: fd.name, - description: fd.description, - parameters: fd.parameters - })) - })) - } - } + if (DEBUG) console.log('[GeminiLive] WebSocket connected', isProxy ? '(hub sends setup)' : ', sending setup') - ws.send(JSON.stringify(setupMessage)) + // Proxied sessions: hub sends HAPI-owned setup server-side (see gemini-ws proxy). 
+ if (!isProxy) { + ws.send(JSON.stringify(buildGeminiLiveSetupMessage(config.language))) + } } ws.onmessage = async (event) => { diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index ace659c6d7..797ead8f6f 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -6,7 +6,6 @@ import { GeminiAudioRecorder } from './gemini/audioRecorder' import { GeminiAudioPlayer } from './gemini/audioPlayer' import { realtimeClientTools } from './realtimeClientTools' import { - QWEN_REALTIME_MODEL, QWEN_REALTIME_VOICE, VOICE_SYSTEM_PROMPT, VOICE_CHINESE_LANGUAGE_BLOCK, @@ -150,10 +149,9 @@ class QwenVoiceSessionImpl implements VoiceSession { const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' const defaultProxyUrl = `${protocol}//${window.location.host}/api/voice/qwen-ws` const proxyUrl = state.wsBaseUrl || defaultProxyUrl - const model = QWEN_REALTIME_MODEL const authToken = this.api.getAuthToken() || '' const separator = proxyUrl.includes('?') ? '&' : '?' - const wsUrl = `${proxyUrl}${separator}model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}` + const wsUrl = `${proxyUrl}${separator}token=${encodeURIComponent(authToken)}` const ws = new WebSocket(wsUrl) state.ws = ws From d0335e4e69217150d3746d122f32497acda55b72 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 31 May 2026 16:34:53 +0100 Subject: [PATCH 26/34] =?UTF-8?q?fix(voice):=20harden=20Qwen=20proxy=20?= =?UTF-8?q?=E2=80=94=20hub-owned=20setup,=20client=20frame=20allowlist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the Gemini proxy security model for Qwen: - Hub sends initial session.update (voice/tools/instructions) on upstream connect so the browser cannot override config fields. 
- Proxy message() now calls isQwenSafeClientFrame() and closes the connection (1008) if a client session.update touches any field other than 'instructions' (blocks tool/voice/modality overrides). - QwenVoiceSession no longer sends session.update on session.created; it waits for the hub-relayed session.updated and then sends only instruction-only updates for context/proactive content. - Language passed as query param (?language=zh) so hub builds the correct Chinese system prompt without a client-supplied session.update. - buildQwenSessionUpdateMessage() and isQwenSafeClientFrame() added to @hapi/protocol/voice; 9 new unit tests cover filter edge cases. --- hub/src/web/server.ts | 14 +++-- shared/src/voice.gemini.test.ts | 66 ++++++++++++++++++++- shared/src/voice.ts | 53 +++++++++++++++++ web/src/realtime/QwenVoiceSession.tsx | 84 +++++---------------------- 4 files changed, 144 insertions(+), 73 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 9d688bc878..ab0c24e721 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -6,7 +6,7 @@ import { existsSync } from 'node:fs' import { serveStatic } from 'hono/bun' import { getConfiguration } from '../configuration' import { PROTOCOL_VERSION } from '@hapi/protocol' -import { buildGeminiLiveSetupMessage, QWEN_REALTIME_MODEL } from '@hapi/protocol/voice' +import { buildGeminiLiveSetupMessage, buildQwenSessionUpdateMessage, isQwenSafeClientFrame, QWEN_REALTIME_MODEL } from '@hapi/protocol/voice' import type { SyncEngine } from '../sync/syncEngine' import { createAuthMiddleware, type WebAppEnv } from './middleware/auth' import { createAuthRoutes } from './routes/auth' @@ -128,7 +128,7 @@ function createQwenProxyWebSocketHandler() { return { open(clientWs: ServerWebSocket) { - const data = clientWs.data as { apiKey: string; model: string } + const data = clientWs.data as { apiKey: string; model: string; language?: string } const upstreamUrl = `${process.env.QWEN_REALTIME_WS_URL || 
QWEN_WS_BASE}?model=${encodeURIComponent(data.model)}` const upstream = new WebSocket(upstreamUrl, { @@ -138,7 +138,8 @@ function createQwenProxyWebSocketHandler() { upstreamMap.set(clientWs, upstream) upstream.onopen = () => { - // Connection ready — upstream will send session.created + // Hub-owned setup — send initial session.update so client cannot override tools/voice/config. + upstream.send(JSON.stringify(buildQwenSessionUpdateMessage(data.language))) } upstream.onmessage = (event) => { try { @@ -157,6 +158,10 @@ function createQwenProxyWebSocketHandler() { } }, message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + if (!isQwenSafeClientFrame(message)) { + try { clientWs.close(1008, 'Client session.update may only modify instructions') } catch { /* */ } + return + } const upstream = upstreamMap.get(clientWs) if (upstream?.readyState === WebSocket.OPEN) { upstream.send(message) @@ -460,11 +465,12 @@ export async function startWebServer(options: { if (url.pathname === '/api/voice/qwen-ws') { const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY const model = QWEN_REALTIME_MODEL + const language = url.searchParams.get('language') ?? 
undefined if (!apiKey) { return new Response('DashScope API key not configured', { status: 400 }) } const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { - data: { _qwenProxy: true, apiKey, model } + data: { _qwenProxy: true, apiKey, model, language } }) if (!upgraded) { return new Response('WebSocket upgrade failed', { status: 500 }) diff --git a/shared/src/voice.gemini.test.ts b/shared/src/voice.gemini.test.ts index 5bed0ea961..2b3619eb69 100644 --- a/shared/src/voice.gemini.test.ts +++ b/shared/src/voice.gemini.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'bun:test' -import { buildGeminiLiveSetupMessage, GEMINI_LIVE_MODEL, GEMINI_LIVE_VOICE } from './voice' +import { buildGeminiLiveSetupMessage, buildQwenSessionUpdateMessage, isQwenSafeClientFrame, GEMINI_LIVE_MODEL, GEMINI_LIVE_VOICE, QWEN_REALTIME_VOICE } from './voice' describe('buildGeminiLiveSetupMessage', () => { test('locks model and voice to HAPI defaults', () => { @@ -19,3 +19,67 @@ describe('buildGeminiLiveSetupMessage', () => { expect(zhText.length).toBeGreaterThan(enText.length) }) }) + +describe('buildQwenSessionUpdateMessage', () => { + test('locks voice to HAPI default', () => { + const msg = buildQwenSessionUpdateMessage() + const session = msg.session as { voice: string } + expect(session.voice).toBe(QWEN_REALTIME_VOICE) + }) + + test('includes both tools', () => { + const msg = buildQwenSessionUpdateMessage() + const session = msg.session as { tools: Array<{ function: { name: string } }> } + const names = session.tools.map(t => t.function.name) + expect(names).toContain('messageCodingAgent') + expect(names).toContain('processPermissionRequest') + }) + + test('appends Chinese block when language is zh', () => { + const en = buildQwenSessionUpdateMessage() + const zh = buildQwenSessionUpdateMessage('zh') + const enInstr = (en.session as { instructions: string }).instructions + const zhInstr = (zh.session as { instructions: 
string }).instructions + expect(zhInstr.length).toBeGreaterThan(enInstr.length) + }) +}) + +describe('isQwenSafeClientFrame', () => { + test('allows non-session.update frames', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'input_audio_buffer.append', audio: 'abc' }))).toBe(true) + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'response.create' }))).toBe(true) + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'conversation.item.create', item: {} }))).toBe(true) + }) + + test('allows session.update with only instructions', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { instructions: 'updated prompt' } + }))).toBe(true) + }) + + test('blocks session.update that includes tools', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { instructions: 'x', tools: [] } + }))).toBe(false) + }) + + test('blocks session.update that includes voice', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { voice: 'Cherry' } + }))).toBe(false) + }) + + test('blocks full config session.update', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { modalities: ['text', 'audio'], voice: 'Cherry', instructions: 'x', tools: [], tool_choice: 'auto' } + }))).toBe(false) + }) + + test('allows non-JSON (binary audio frames pass through)', () => { + expect(isQwenSafeClientFrame('not json {')).toBe(true) + }) +}) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 956db6e5d0..98b998ca2f 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -366,6 +366,59 @@ export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { } } +/** Hub-owned initial session.update for Qwen Realtime (hub proxy). */ +export function buildQwenSessionUpdateMessage(language?: string): Record { + const instructions = language === 'zh' + ? 
`${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` + : VOICE_SYSTEM_PROMPT + const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ + type: 'function' as const, + function: { name: td.name, description: td.description, parameters: td.parameters } + })) + return { + type: 'session.update', + session: { + modalities: ['text', 'audio'], + voice: QWEN_REALTIME_VOICE, + input_audio_format: 'pcm', + output_audio_format: 'pcm', + instructions, + temperature: 0.7, + turn_detection: { + type: 'server_vad', + threshold: 0.5, + silence_duration_ms: 800, + prefix_padding_ms: 300 + }, + tools, + tool_choice: 'auto' + } + } +} + +/** + * Returns true if a client WebSocket frame is safe to forward to DashScope. + * Blocks session.update frames that touch config fields (tools, voice, etc.); + * allows instruction-only updates and all runtime event types. + */ +export function isQwenSafeClientFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const text = typeof message === 'string' + ? message + : new TextDecoder().decode(message instanceof ArrayBuffer ? new Uint8Array(message) : message) + const parsed = JSON.parse(text) as unknown + if (!parsed || typeof parsed !== 'object') return true + const p = parsed as Record + if (p.type !== 'session.update') return true + const session = p.session as Record | undefined + if (!session) return false + const keys = Object.keys(session) + return keys.length === 1 && keys[0] === 'instructions' + } catch { + return true + } +} + /** Wire-format setup frame for Gemini Live BidiGenerateContent (hub proxy + web client). 
*/ export function buildGeminiLiveSetupMessage(language?: string): { setup: Record } { const liveConfig = buildGeminiLiveConfig(language) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 797ead8f6f..0e22003656 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -1,4 +1,4 @@ -import { useEffect, useRef, useCallback } from 'react' +import { useEffect, useRef } from 'react' import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' import { registerSessionStore } from './realtimeClientTools' import { fetchQwenToken } from '@/api/voice' @@ -6,10 +6,8 @@ import { GeminiAudioRecorder } from './gemini/audioRecorder' import { GeminiAudioPlayer } from './gemini/audioPlayer' import { realtimeClientTools } from './realtimeClientTools' import { - QWEN_REALTIME_VOICE, VOICE_SYSTEM_PROMPT, VOICE_CHINESE_LANGUAGE_BLOCK, - VOICE_TOOL_DEFINITIONS } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' import type { ApiClient } from '@/api/client' @@ -76,42 +74,27 @@ function sendEvent(type: string, payload?: Record): void { })) } -interface QwenSessionConfig { - modalities: ['text', 'audio'] - voice: string - input_audio_format: 'pcm' - output_audio_format: 'pcm' - instructions: string - temperature: number - turn_detection: { - type: 'server_vad' - threshold: number - silence_duration_ms: number - prefix_padding_ms: number - } - tools: Array - tool_choice: 'auto' -} class QwenVoiceSessionImpl implements VoiceSession { private api: ApiClient - private currentSessionConfig: QwenSessionConfig | null = null + private currentInstructions: string | null = null constructor(api: ApiClient) { this.api = api } private updateInstructions(update: string): void { - if (!this.currentSessionConfig) return - this.currentSessionConfig = { - ...this.currentSessionConfig, - instructions: 
`${this.currentSessionConfig.instructions}\n\n${update}` - } - sendEvent('session.update', { session: this.currentSessionConfig }) + if (this.currentInstructions === null) return + this.currentInstructions = `${this.currentInstructions}\n\n${update}` + // Hub filter allows only instruction-only session.update frames. + sendEvent('session.update', { session: { instructions: this.currentInstructions } }) } async startSession(config: VoiceSessionConfig): Promise { - this.currentSessionConfig = null + // Mirror the base instructions the hub will send so subsequent updates accumulate correctly. + this.currentInstructions = config.language === 'zh' + ? `${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` + : VOICE_SYSTEM_PROMPT cleanup() state.statusCallback?.('connecting') @@ -151,7 +134,8 @@ class QwenVoiceSessionImpl implements VoiceSession { const proxyUrl = state.wsBaseUrl || defaultProxyUrl const authToken = this.api.getAuthToken() || '' const separator = proxyUrl.includes('?') ? '&' : '?' - const wsUrl = `${proxyUrl}${separator}token=${encodeURIComponent(authToken)}` + const langParam = config.language ? `&language=${encodeURIComponent(config.language)}` : '' + const wsUrl = `${proxyUrl}${separator}token=${encodeURIComponent(authToken)}${langParam}` const ws = new WebSocket(wsUrl) state.ws = ws @@ -173,45 +157,9 @@ class QwenVoiceSessionImpl implements VoiceSession { const eventType = data.type as string - // Session created - send configuration - if (eventType === 'session.created' && !sessionReady) { - if (DEBUG) console.log('[Qwen] Session created') - - // Build tools config - const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ - type: 'function' as const, - function: { - name: td.name, - description: td.description, - parameters: td.parameters - } - })) - - // Send session.update with full configuration - const basePrompt = config.language === 'zh' - ? 
`${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` - : VOICE_SYSTEM_PROMPT - const instructions = config.initialContext - ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}` - : basePrompt - - this.currentSessionConfig = { - modalities: ['text', 'audio'], - voice: QWEN_REALTIME_VOICE, - input_audio_format: 'pcm', - output_audio_format: 'pcm', - instructions, - temperature: 0.7, - turn_detection: { - type: 'server_vad', - threshold: 0.5, - silence_duration_ms: 800, - prefix_padding_ms: 300 - }, - tools, - tool_choice: 'auto' - } - sendEvent('session.update', { session: this.currentSessionConfig }) + // Session created — hub sends the initial session.update; browser waits for session.updated. + if (eventType === 'session.created') { + if (DEBUG) console.log('[Qwen] Session created (hub owns setup)') return } @@ -354,7 +302,7 @@ class QwenVoiceSessionImpl implements VoiceSession { } async endSession(): Promise { - this.currentSessionConfig = null + this.currentInstructions = null cleanup() resetRealtimeSessionState() state.statusCallback?.('disconnected') From 2e5ca915c804b9aac57d3349b5619b45a3b5ed99 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 31 May 2026 18:31:12 +0100 Subject: [PATCH 27/34] =?UTF-8?q?fix(voice):=20respect=20Qwen=20session.cr?= =?UTF-8?q?eated=E2=86=92session.update=20protocol=20ordering?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DashScope requires session.update to be sent AFTER session.created is received, not immediately on WebSocket open. Previously the hub sent session.update in upstream.onopen, which violated this ordering and risked the config being processed in an uninitialized session context. Add pendingSetupMap to buffer the hub-owned session.update payload. 
The onmessage handler now relays session.created to the browser first, then immediately sends the pending session.update to DashScope — matching the protocol ordering the old browser-side code used (which waited for session.created before sending session.update). Also remove maxHttpBufferSize from the socket.io Engine config. That setting is unrelated to voice backends; upstream/main had no such limit set and it is not introduced by this PR. --- hub/src/socket/server.ts | 1 - hub/src/web/server.ts | 33 ++++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/hub/src/socket/server.ts b/hub/src/socket/server.ts index f464fe62a9..af7533e5c8 100644 --- a/hub/src/socket/server.ts +++ b/hub/src/socket/server.ts @@ -67,7 +67,6 @@ export function createSocketServer(deps: SocketServerDeps): { const engine = new Engine({ path: '/socket.io/', cors: corsOptions, - maxHttpBufferSize: 68 * 1024 * 1024, // 50 MiB decoded uploads are ~66.7 MiB as base64 JSON allowRequest: async (req) => { const origin = req.headers.get('origin') if (!origin || allowAllOrigins || corsOrigins.includes(origin)) { diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index ab0c24e721..7f57ce0826 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -125,6 +125,9 @@ function createGeminiProxyWebSocketHandler() { function createQwenProxyWebSocketHandler() { const QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' const upstreamMap = new WeakMap, WebSocket>() + // Holds the hub-owned session.update payload until session.created arrives from DashScope. + // Sending session.update before session.created violates the Qwen Realtime protocol ordering. 
+ const pendingSetupMap = new WeakMap, string>() return { open(clientWs: ServerWebSocket) { @@ -136,23 +139,42 @@ function createQwenProxyWebSocketHandler() { } as unknown as string[]) upstreamMap.set(clientWs, upstream) + pendingSetupMap.set(clientWs, JSON.stringify(buildQwenSessionUpdateMessage(data.language))) - upstream.onopen = () => { - // Hub-owned setup — send initial session.update so client cannot override tools/voice/config. - upstream.send(JSON.stringify(buildQwenSessionUpdateMessage(data.language))) - } upstream.onmessage = (event) => { + const raw = event.data + const text = typeof raw === 'string' + ? raw + : new TextDecoder().decode(raw instanceof Uint8Array ? raw : new Uint8Array(raw as ArrayBuffer)) + + // Respect Qwen protocol ordering: relay session.created first, then send hub-owned + // session.update. DashScope must receive session.update after session.created. + const pendingSetup = pendingSetupMap.get(clientWs) + if (pendingSetup) { + try { + const parsed = JSON.parse(text) as { type?: string } + if (parsed.type === 'session.created') { + pendingSetupMap.delete(clientWs) + try { if (clientWs.readyState === 1) clientWs.send(text) } catch { /* client gone */ } + upstream.send(pendingSetup) + return + } + } catch { /* not JSON — relay as-is below */ } + } + try { if (clientWs.readyState === 1) { - clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer)) + clientWs.send(typeof raw === 'string' ? 
raw : new Uint8Array(raw as ArrayBuffer)) } } catch { /* client gone */ } } upstream.onerror = () => { + pendingSetupMap.delete(clientWs) upstreamMap.delete(clientWs) try { clientWs.close(1011, 'Upstream error') } catch { /* */ } } upstream.onclose = (event) => { + pendingSetupMap.delete(clientWs) try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } upstreamMap.delete(clientWs) } @@ -168,6 +190,7 @@ function createQwenProxyWebSocketHandler() { } }, close(clientWs: ServerWebSocket, code: number, reason: string) { + pendingSetupMap.delete(clientWs) const upstream = upstreamMap.get(clientWs) if (upstream) { try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } From 090da8d7d3fa10013be87585caf176fc59e4073c Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 31 May 2026 19:39:24 +0100 Subject: [PATCH 28/34] fix(voice): use Realtime tool shape for Qwen session.update (not chat-completions) Qwen Realtime session.update expects tools as flat objects: { type: 'function', name, description, parameters } The previous code used the chat-completions shape: { type: 'function', function: { name, description, parameters } } DashScope may reject session.update or silently ignore tools with the nested shape, causing tool calls to fail at runtime. Fix applied in buildQwenSessionUpdateMessage(); test updated to assert flat shape and that no nested `function` key is present. 
--- shared/src/voice.gemini.test.ts | 7 +++++-- shared/src/voice.ts | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/shared/src/voice.gemini.test.ts b/shared/src/voice.gemini.test.ts index 2b3619eb69..b9602811a5 100644 --- a/shared/src/voice.gemini.test.ts +++ b/shared/src/voice.gemini.test.ts @@ -29,10 +29,13 @@ describe('buildQwenSessionUpdateMessage', () => { test('includes both tools', () => { const msg = buildQwenSessionUpdateMessage() - const session = msg.session as { tools: Array<{ function: { name: string } }> } - const names = session.tools.map(t => t.function.name) + // Realtime shape: flat {type, name, description, parameters} — NOT chat-completions {function:{...}} + const session = msg.session as { tools: Array<{ type: string; name: string }> } + const names = session.tools.map(t => t.name) expect(names).toContain('messageCodingAgent') expect(names).toContain('processPermissionRequest') + // Ensure no nested function key (would be wrong chat-completions shape) + session.tools.forEach(t => expect((t as Record).function).toBeUndefined()) }) test('appends Chinese block when language is zh', () => { diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 98b998ca2f..777b33ed89 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -371,9 +371,12 @@ export function buildQwenSessionUpdateMessage(language?: string): Record ({ type: 'function' as const, - function: { name: td.name, description: td.description, parameters: td.parameters } + name: td.name, + description: td.description, + parameters: td.parameters })) return { type: 'session.update', From bb84dd16e47be38a37c5469c21918a91263095f9 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 1 Jun 2026 18:31:45 +0100 Subject: [PATCH 29/34] fix(voice): update Qwen Realtime model, voice, and endpoint for intl service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live-tested against 
DashScope international API: - Model: qwen3-omni-flash-realtime → qwen3.5-omni-flash-realtime (previous model ID did not exist on DashScope) - Default voice: Cherry → Tina (confirmed from session.created response on qwen3.5-omni-flash-realtime) - Default WS base: dashscope.aliyuncs.com → dashscope-intl.aliyuncs.com (international accounts use the -intl endpoint; China endpoint rejects international API keys; QWEN_REALTIME_WS_URL env var still overrides) --- hub/src/web/server.ts | 2 +- shared/src/voice.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 7f57ce0826..db2b81d80c 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -123,7 +123,7 @@ function createGeminiProxyWebSocketHandler() { // Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header) function createQwenProxyWebSocketHandler() { - const QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' + const QWEN_WS_BASE = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime' const upstreamMap = new WeakMap, WebSocket>() // Holds the hub-owned session.update payload until session.created arrives from DashScope. // Sending session.update before session.created violates the Qwen Realtime protocol ordering. 
diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 777b33ed89..9582dc2333 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -289,8 +289,8 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' -export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' -export const QWEN_REALTIME_VOICE = 'Cherry' +export const QWEN_REALTIME_MODEL = 'qwen3.5-omni-flash-realtime' +export const QWEN_REALTIME_VOICE = 'Tina' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' From cdcd54d817002b68f6363dd907bda53da851654d Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 1 Jun 2026 20:36:10 +0100 Subject: [PATCH 30/34] fix(voice): correct Qwen text injection and generalise language handling Two dogfooding fixes verified against live Qwen Realtime session: sendTextMessage: switch from instruction-injection to conversation.item.create Qwen Realtime requires a user conversation item before response.create. The previous approach (updateInstructions + response.create) produced "input messages do not contain elements with role user" errors. Now sends {type:message, role:user, content:[{type:input_text}]} then response.create. sendContextualUpdate is unchanged (instruction-only, no response trigger). Language handling: replace zh-only branch with buildVoiceLanguageBlock() Previously, only language='zh' added any instruction; all other languages (including English) sent no language block, causing Qwen to drift to Chinese. buildVoiceLanguageBlock() now covers three cases: - 'zh'/'zh-*': existing Chinese block (unchanged) - explicit code ('en','es','fr',...): "Always respond in [Language]" - undefined/auto: "Detect the user's language and maintain it" Applied to buildGeminiLiveConfig, buildQwenSessionUpdateMessage, and the client-side currentInstructions mirror in QwenVoiceSession. 
Also removes the Gemini hub proxy's zh-only filter, which was discarding explicit language selections other than Chinese. --- hub/src/web/server.ts | 3 +- shared/src/voice.ts | 66 +++++++++++++++++++++++---- web/src/realtime/QwenVoiceSession.tsx | 19 ++++---- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index db2b81d80c..2ed37c3d50 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -474,8 +474,7 @@ export async function startWebServer(options: { if (!apiKey) { return new Response('Gemini API key not configured', { status: 400 }) } - const languageParam = url.searchParams.get('language') - const language = languageParam === 'zh' ? 'zh' : undefined + const language = url.searchParams.get('language') ?? undefined const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { data: { _geminiProxy: true, apiKey, language } }) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 9582dc2333..0c9d7a0cec 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -149,8 +149,11 @@ For builds, tests, or large file operations: When the user speaks to you for the first time, begin your response with a brief greeting before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` /** - * Additional language block appended to VOICE_SYSTEM_PROMPT for Gemini/Qwen - * backends (which don't have a separate language field like ElevenLabs). + * Language blocks appended to VOICE_SYSTEM_PROMPT for Gemini/Qwen backends + * (ElevenLabs has its own language field). + * + * Always append one of these — silence causes models to drift to their training + * language (Chinese for Qwen, mixed for Gemini). */ export const VOICE_CHINESE_LANGUAGE_BLOCK = ` @@ -162,6 +165,57 @@ IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. 
- Use English only for proper nouns, tool names, and code identifiers - Keep the same warm, concise conversational style in Chinese` +/** When no language is selected: mirror the user's detected speech language. */ +const VOICE_LANGUAGE_BLOCK_AUTO = ` + +# Language + +Detect the language the user is speaking and respond in that same language. +Maintain it consistently throughout the session — do not drift between turns. +If the language cannot be determined, default to English.` + +/** BCP-47 code → spoken language name (for explicit-language block). */ +const LANGUAGE_NAMES: Record = { + en: 'English', + es: 'Spanish', + fr: 'French', + de: 'German', + ja: 'Japanese', + ko: 'Korean', + pt: 'Portuguese', + it: 'Italian', + ar: 'Arabic', + ru: 'Russian', + hi: 'Hindi', + th: 'Thai', + vi: 'Vietnamese', + id: 'Indonesian', + nl: 'Dutch', + sv: 'Swedish', + pl: 'Polish', + tr: 'Turkish', +} + +/** + * Returns the language instruction block to append to VOICE_SYSTEM_PROMPT. + * - Explicit 'zh' → Chinese block + * - Other explicit code → "Always respond in [Language]" + * - undefined/auto → "detect from user speech and maintain it" + */ +export function buildVoiceLanguageBlock(language?: string): string { + if (!language) return VOICE_LANGUAGE_BLOCK_AUTO + if (language === 'zh' || language.startsWith('zh-')) return VOICE_CHINESE_LANGUAGE_BLOCK + const name = LANGUAGE_NAMES[language] ?? language + return ` + +# Language + +IMPORTANT: Always respond in ${name}. Maintain ${name} consistently throughout +the session — do not drift to a different language between turns. +Use English only for proper nouns, code identifiers, and technical terms with +no ${name} equivalent.` +} + /** ElevenLabs first message — language controlled by ElevenLabs language field */ export const VOICE_FIRST_MESSAGE = "Hey! Hapi here — what can I help you with?" 
@@ -351,9 +405,7 @@ export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclara } export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { - const systemInstruction = language === 'zh' - ? `${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` - : VOICE_SYSTEM_PROMPT + const systemInstruction = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(language)}` return { model: GEMINI_LIVE_MODEL, systemInstruction, @@ -368,9 +420,7 @@ export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { /** Hub-owned initial session.update for Qwen Realtime (hub proxy). */ export function buildQwenSessionUpdateMessage(language?: string): Record { - const instructions = language === 'zh' - ? `${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` - : VOICE_SYSTEM_PROMPT + const instructions = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(language)}` // Qwen Realtime uses the flat Realtime shape, not the chat-completions nested {function:{...}} shape. const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ type: 'function' as const, diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 0e22003656..f99ee39762 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -7,7 +7,7 @@ import { GeminiAudioPlayer } from './gemini/audioPlayer' import { realtimeClientTools } from './realtimeClientTools' import { VOICE_SYSTEM_PROMPT, - VOICE_CHINESE_LANGUAGE_BLOCK, + buildVoiceLanguageBlock, } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' import type { ApiClient } from '@/api/client' @@ -92,9 +92,7 @@ class QwenVoiceSessionImpl implements VoiceSession { async startSession(config: VoiceSessionConfig): Promise { // Mirror the base instructions the hub will send so subsequent updates accumulate correctly. - this.currentInstructions = config.language === 'zh' - ? 
`${VOICE_SYSTEM_PROMPT}${VOICE_CHINESE_LANGUAGE_BLOCK}` - : VOICE_SYSTEM_PROMPT + this.currentInstructions = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(config.language)}` cleanup() state.statusCallback?.('connecting') @@ -309,11 +307,14 @@ class QwenVoiceSessionImpl implements VoiceSession { } sendTextMessage(message: string): void { - // Qwen only supports conversation.item.create for function_call_output. - // Inject text as an instruction update then trigger a response. - // response.create without a prior conversation.item.create is valid — - // it generates from the current session context (updated instructions). - this.updateInstructions(message) + // Qwen Realtime requires a user conversation item before response.create. + sendEvent('conversation.item.create', { + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: message }] + } + }) sendEvent('response.create') } From 71aba34a56df46a1290f090efe33648a9a9197b5 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 2 Jun 2026 06:27:11 +0100 Subject: [PATCH 31/34] fix(hub): gate Gemini client frames until upstream setupComplete Hub sends its owned setup on upstream open, then waits for Google's setupComplete acknowledgment before flushing queued client frames. isGeminiSetupCompleteFrame() detects the {"setupComplete":{}} message; message() queues instead of forwarding while pendingMap is live. Addresses the repeated Major finding from bot review on PR #743. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/server.ts | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 2ed37c3d50..8480bdbbec 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -54,10 +54,21 @@ function isGeminiSetupFrame(message: string | ArrayBuffer | Uint8Array): boolean } } +function isGeminiSetupCompleteFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const parsed = JSON.parse(decodeWsText(message)) as unknown + return parsed !== null && typeof parsed === 'object' && 'setupComplete' in (parsed as object) + } catch { + return false + } +} + // Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions function createGeminiProxyWebSocketHandler() { const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' const upstreamMap = new WeakMap, WebSocket>() + // pendingMap holds queued client frames until Google acknowledges setup via setupComplete. + // Flushed on setupComplete; until then message() queues rather than forwards. const pendingMap = new WeakMap, Array>() return { @@ -72,13 +83,8 @@ function createGeminiProxyWebSocketHandler() { upstream.onopen = () => { // Hub-owned setup only — never forward client setup (prevents generic Gemini proxy abuse). + // Do NOT flush pending here: wait for Google's setupComplete before forwarding client frames. upstream.send(JSON.stringify(buildGeminiLiveSetupMessage(data.language))) - for (const queued of pending.splice(0)) { - if (!isGeminiSetupFrame(queued)) { - upstream.send(queued) - } - } - pendingMap.delete(clientWs) } upstream.onmessage = (event) => { try { @@ -86,6 +92,14 @@ function createGeminiProxyWebSocketHandler() { clientWs.send(typeof event.data === 'string' ? 
event.data : new Uint8Array(event.data as ArrayBuffer)) } } catch { /* client gone */ } + // Flush queued client frames only after Google acknowledges setup. + const pending = pendingMap.get(clientWs) + if (pending && isGeminiSetupCompleteFrame(event.data as string | ArrayBuffer)) { + pendingMap.delete(clientWs) + for (const queued of pending) { + try { upstream.send(queued) } catch { /* client gone */ } + } + } } upstream.onerror = () => { pendingMap.delete(clientWs) @@ -103,11 +117,12 @@ function createGeminiProxyWebSocketHandler() { return } const upstream = upstreamMap.get(clientWs) - if (upstream?.readyState === WebSocket.OPEN) { + const pending = pendingMap.get(clientWs) + if (pending) { + // Still awaiting setupComplete (or upstream still connecting) — queue. + pending.push(message) + } else if (upstream?.readyState === WebSocket.OPEN) { upstream.send(message) - } else if (upstream?.readyState === WebSocket.CONNECTING) { - const pending = pendingMap.get(clientWs) - if (pending) pending.push(message) } }, close(clientWs: ServerWebSocket, code: number, reason: string) { From 72eaec4bb65fcd6afd4929e53d2e43fe9e02d2a3 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 2 Jun 2026 06:59:33 +0100 Subject: [PATCH 32/34] fix(hub): cap Gemini setup-window pending queue at 1 MiB An authenticated client could flood the queue between upstream.onopen and Google's setupComplete acknowledgment. Add pendingBytesMap tracking and close with 1009 if the budget is exceeded. 
via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- hub/src/web/server.ts | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 8480bdbbec..b128bd40fb 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -63,6 +63,11 @@ function isGeminiSetupCompleteFrame(message: string | ArrayBuffer | Uint8Array): } } +const MAX_GEMINI_PENDING_BYTES = 1024 * 1024 // 1 MiB — rejects setup-window floods +function frameByteSize(msg: string | ArrayBuffer | Uint8Array): number { + return typeof msg === 'string' ? msg.length : (msg as ArrayBuffer | Uint8Array).byteLength +} + // Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions function createGeminiProxyWebSocketHandler() { const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' @@ -70,6 +75,7 @@ function createGeminiProxyWebSocketHandler() { // pendingMap holds queued client frames until Google acknowledges setup via setupComplete. // Flushed on setupComplete; until then message() queues rather than forwards. 
const pendingMap = new WeakMap, Array>() + const pendingBytesMap = new WeakMap, number>() return { open(clientWs: ServerWebSocket) { @@ -77,6 +83,7 @@ function createGeminiProxyWebSocketHandler() { const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` const pending: Array = [] pendingMap.set(clientWs, pending) + pendingBytesMap.set(clientWs, 0) const upstream = new WebSocket(upstreamUrl) upstreamMap.set(clientWs, upstream) @@ -96,17 +103,20 @@ function createGeminiProxyWebSocketHandler() { const pending = pendingMap.get(clientWs) if (pending && isGeminiSetupCompleteFrame(event.data as string | ArrayBuffer)) { pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) for (const queued of pending) { - try { upstream.send(queued) } catch { /* client gone */ } + try { upstream.send(queued) } catch { /* upstream gone */ } } } } upstream.onerror = () => { pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) try { clientWs.close(1011, 'Upstream error') } catch { /* */ } } upstream.onclose = (event) => { pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } upstreamMap.delete(clientWs) } @@ -119,7 +129,13 @@ function createGeminiProxyWebSocketHandler() { const upstream = upstreamMap.get(clientWs) const pending = pendingMap.get(clientWs) if (pending) { - // Still awaiting setupComplete (or upstream still connecting) — queue. + // Still awaiting setupComplete — queue, but cap to prevent setup-window floods. + const total = (pendingBytesMap.get(clientWs) ?? 
0) + frameByteSize(message) + if (total > MAX_GEMINI_PENDING_BYTES) { + try { clientWs.close(1009, 'Setup-window frame budget exceeded') } catch { /* */ } + return + } + pendingBytesMap.set(clientWs, total) pending.push(message) } else if (upstream?.readyState === WebSocket.OPEN) { upstream.send(message) @@ -128,6 +144,7 @@ function createGeminiProxyWebSocketHandler() { close(clientWs: ServerWebSocket, code: number, reason: string) { const upstream = upstreamMap.get(clientWs) pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) if (upstream) { try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } upstreamMap.delete(clientWs) From ab99be073265e24e35cfe2a2ab7e98b193a7f213 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 2 Jun 2026 12:54:47 +0100 Subject: [PATCH 33/34] fix(gemini): pass all language codes to hub proxy, not just zh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Language selection for French, Spanish, Japanese etc. was silently dropped — only 'zh' was forwarded as a query param. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/GeminiLiveVoiceSession.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 79025c9f27..a23f554937 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -119,7 +119,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE const isProxy = !!state.wsBaseUrl const authToken = this.api.getAuthToken() || '' - const languageParam = config.language === 'zh' ? '&language=zh' : '' + const languageParam = config.language ? `&language=${encodeURIComponent(config.language)}` : '' const wsUrl = isProxy ? 
`${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}${languageParam}` : `${wsBase}?key=${encodeURIComponent(state.apiKey)}` From 358c197d9b09e3035c54f5c113cef0c2f9a1094f Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Wed, 3 Jun 2026 09:28:59 +0100 Subject: [PATCH 34/34] fix(voice): expand LANGUAGE_NAMES to cover full ElevenLabs language set Codes like 'no', 'da', 'fi', 'pt-br', 'bg', 'ro', 'cs', 'el', 'ms', 'tl', 'uk', 'hu', 'hr', 'sk' were falling through to raw-code prompts ("Always respond in no"). Now resolve to proper display names. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- shared/src/voice.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 0c9d7a0cec..3099a90576 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -183,6 +183,7 @@ const LANGUAGE_NAMES: Record = { ja: 'Japanese', ko: 'Korean', pt: 'Portuguese', + 'pt-br': 'Brazilian Portuguese', it: 'Italian', ar: 'Arabic', ru: 'Russian', @@ -192,8 +193,21 @@ const LANGUAGE_NAMES: Record = { id: 'Indonesian', nl: 'Dutch', sv: 'Swedish', + no: 'Norwegian', + da: 'Danish', + fi: 'Finnish', pl: 'Polish', tr: 'Turkish', + bg: 'Bulgarian', + ro: 'Romanian', + cs: 'Czech', + el: 'Greek', + ms: 'Malay', + tl: 'Filipino', + uk: 'Ukrainian', + hu: 'Hungarian', + hr: 'Croatian', + sk: 'Slovak', } /**