diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts index 15249c723..9cc3f4232 100644 --- a/hub/src/web/routes/voice.test.ts +++ b/hub/src/web/routes/voice.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, mock } from 'bun:test' +import { describe, expect, it, mock, test, afterEach } from 'bun:test' import { Hono } from 'hono' import { SignJWT } from 'jose' import type { WebAppEnv } from '../middleware/auth' @@ -188,3 +188,155 @@ describe('POST /api/voice/token', () => { else delete process.env.ELEVENLABS_AGENT_ID }) }) + +describe('GET /api/voice/backend', () => { + const originalEnv = process.env.VOICE_BACKEND + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env.VOICE_BACKEND + } else { + process.env.VOICE_BACKEND = originalEnv + } + }) + + test('returns elevenlabs by default', async () => { + delete process.env.VOICE_BACKEND + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) + + test('returns gemini-live when configured', async () => { + process.env.VOICE_BACKEND = 'gemini-live' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('gemini-live') + }) + + test('returns qwen-realtime when configured', async () => { + process.env.VOICE_BACKEND = 'qwen-realtime' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('qwen-realtime') + }) + + test('falls back to elevenlabs for unknown values', async () => { + process.env.VOICE_BACKEND = 'unknown-backend' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/backend', { headers }) + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) +}) + +describe('POST /api/voice/gemini-token', () => { + const origGemini = process.env.GEMINI_API_KEY + const origGoogle = process.env.GOOGLE_API_KEY + + afterEach(() => { + if (origGemini === undefined) delete process.env.GEMINI_API_KEY + else process.env.GEMINI_API_KEY = origGemini + if (origGoogle === undefined) delete process.env.GOOGLE_API_KEY + else process.env.GOOGLE_API_KEY = origGoogle + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.GEMINI_API_KEY + delete process.env.GOOGLE_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns proxied wsUrl when GEMINI_API_KEY is set', async () => { + process.env.GEMINI_API_KEY = 'test-gemini-key' + delete process.env.GOOGLE_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') + }) + + test('falls back to GOOGLE_API_KEY', async () => { + delete process.env.GEMINI_API_KEY + process.env.GOOGLE_API_KEY = 'test-google-key' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/gemini-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') + }) +}) + +describe('POST /api/voice/qwen-token', () => { + const origDash = process.env.DASHSCOPE_API_KEY + const origQwen = process.env.QWEN_API_KEY + + afterEach(() => { + if (origDash === undefined) delete process.env.DASHSCOPE_API_KEY + else process.env.DASHSCOPE_API_KEY = origDash + if (origQwen === undefined) delete process.env.QWEN_API_KEY + else process.env.QWEN_API_KEY = origQwen + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.DASHSCOPE_API_KEY + delete process.env.QWEN_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns wsUrl when DASHSCOPE_API_KEY is set (no raw key exposed)', async () => { + process.env.DASHSCOPE_API_KEY = 'test-dash-key' + delete process.env.QWEN_API_KEY + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') + }) + + test('falls back to QWEN_API_KEY', async () => { + delete process.env.DASHSCOPE_API_KEY + process.env.QWEN_API_KEY = 'test-qwen-key' + const app = createApp() + const headers = await authHeaders() + const res = await app.request('/api/voice/qwen-token', { method: 'POST', headers }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; wsUrl: string } + expect(body.allowed).toBe(true) + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') + }) +}) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 091b9c2ac..3ccc48155 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -4,8 +4,19 @@ import type { WebAppEnv } from '../middleware/auth' import { ELEVENLABS_API_BASE, VOICE_AGENT_NAME, - buildVoiceAgentConfig + buildVoiceAgentConfig, + DEFAULT_VOICE_BACKEND } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' + +function buildVoiceWsUrl(base: string, pathname: string): string { + const url = new URL(base) + url.protocol = url.protocol === 'https:' ? 'wss:' : 'ws:' + url.pathname = pathname + url.search = '' + url.hash = '' + return url.toString() +} const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), @@ -166,6 +177,65 @@ async function getOrCreateAgentIdForVoice(apiKey: string, voiceId?: string): Pro export function createVoiceRoutes(): Hono { const app = new Hono() + // Return the configured voice backend type + app.get('/voice/backend', (c) => { + const raw = process.env.VOICE_BACKEND + const backend: VoiceBackendType = + raw === 'gemini-live' ? 'gemini-live' + : raw === 'qwen-realtime' ? 'qwen-realtime' + : DEFAULT_VOICE_BACKEND + return c.json({ backend }) + }) + + // Get Gemini API key for Gemini Live voice sessions + // Gemini Live API does not support ephemeral tokens, so we proxy the key. + // The key is short-lived in the browser session and never persisted client-side. + app.post('/voice/gemini-token', async (c) => { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)' + }, 400) + } + + // Use server-side WS proxy to avoid region restrictions. + // The proxy at /api/voice/gemini-ws handles the API key server-side. + // Derive wsUrl from the request origin so remote browsers connect back to the hub, + // not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy). + const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin + const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/gemini-ws') + + return c.json({ + allowed: true, + apiKey: 'proxied', // Dummy — key is handled server-side + wsUrl: wsProxyUrl, // Always proxy — env WS URLs are upstream-only (server-side) + baseUrl: process.env.GEMINI_API_BASE || undefined + }) + }) + + // Check Qwen (DashScope) availability for Qwen Realtime voice sessions + // The actual API key is never sent to the browser — it stays server-side in the WS proxy. + app.post('/voice/qwen-token', async (c) => { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)' + }, 400) + } + + const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin + const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/qwen-ws') + + return c.json({ + allowed: true, + wsUrl: wsProxyUrl // Always proxy — env WS URLs are upstream-only (server-side) + }) + }) + // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { const requestId = crypto.randomUUID() diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index e18f3ddc6..b128bd40f 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -6,6 +6,7 @@ import { existsSync } from 'node:fs' import { serveStatic } from 'hono/bun' import { getConfiguration } from '../configuration' import { PROTOCOL_VERSION } from '@hapi/protocol' +import { buildGeminiLiveSetupMessage, buildQwenSessionUpdateMessage, isQwenSafeClientFrame, QWEN_REALTIME_MODEL } from '@hapi/protocol/voice' import type { SyncEngine } from '../sync/syncEngine' import { createAuthMiddleware, type WebAppEnv } from './middleware/auth' import { createAuthRoutes } from './routes/auth' @@ -21,13 +22,216 @@ import { createPushRoutes } from './routes/push' import { createVoiceRoutes } from './routes/voice' import type { SSEManager } from '../sse/sseManager' import type { VisibilityTracker } from '../visibility/visibilityTracker' -import type { Server as BunServer } from 'bun' +import type { Server as BunServer, ServerWebSocket } from 'bun' import type { Server as SocketEngine } from '@socket.io/bun-engine' +import { jwtVerify } from 'jose' import type { WebSocketData } from '@socket.io/bun-engine' import { loadEmbeddedAssetMap, type EmbeddedWebAsset } from './embeddedAssets' import { isBunCompiled } from '../utils/bunCompiled' import type { Store } from '../store' +// Normalise upstream close codes before forwarding to the browser client. +// Codes 1005/1006/1015 are reserved and cannot be sent in a close frame; +// abnormal upstream drops commonly produce 1006, which would throw on clientWs.close(). +function toClientCloseCode(code: number): number { + return code >= 1000 && code <= 4999 && code !== 1005 && code !== 1006 && code !== 1015 + ? code + : 1011 +} + +function decodeWsText(message: string | ArrayBuffer | Uint8Array): string { + if (typeof message === 'string') return message + const bytes = message instanceof Uint8Array ? message : new Uint8Array(message) + return new TextDecoder().decode(bytes) +} + +function isGeminiSetupFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const parsed = JSON.parse(decodeWsText(message)) as unknown + return parsed !== null && typeof parsed === 'object' && 'setup' in (parsed as object) + } catch { + return false + } +} + +function isGeminiSetupCompleteFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const parsed = JSON.parse(decodeWsText(message)) as unknown + return parsed !== null && typeof parsed === 'object' && 'setupComplete' in (parsed as object) + } catch { + return false + } +} + +const MAX_GEMINI_PENDING_BYTES = 1024 * 1024 // 1 MiB — rejects setup-window floods +function frameByteSize(msg: string | ArrayBuffer | Uint8Array): number { + return typeof msg === 'string' ? msg.length : (msg as ArrayBuffer | Uint8Array).byteLength +} + +// Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions +function createGeminiProxyWebSocketHandler() { + const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' + const upstreamMap = new WeakMap, WebSocket>() + // pendingMap holds queued client frames until Google acknowledges setup via setupComplete. + // Flushed on setupComplete; until then message() queues rather than forwards. + const pendingMap = new WeakMap, Array>() + const pendingBytesMap = new WeakMap, number>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { _geminiProxy: boolean; apiKey: string; language?: string } + const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` + const pending: Array = [] + pendingMap.set(clientWs, pending) + pendingBytesMap.set(clientWs, 0) + + const upstream = new WebSocket(upstreamUrl) + upstreamMap.set(clientWs, upstream) + + upstream.onopen = () => { + // Hub-owned setup only — never forward client setup (prevents generic Gemini proxy abuse). + // Do NOT flush pending here: wait for Google's setupComplete before forwarding client frames. + upstream.send(JSON.stringify(buildGeminiLiveSetupMessage(data.language))) + } + upstream.onmessage = (event) => { + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer)) + } + } catch { /* client gone */ } + // Flush queued client frames only after Google acknowledges setup. + const pending = pendingMap.get(clientWs) + if (pending && isGeminiSetupCompleteFrame(event.data as string | ArrayBuffer)) { + pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) + for (const queued of pending) { + try { upstream.send(queued) } catch { /* upstream gone */ } + } + } + } + upstream.onerror = () => { + pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) + try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + if (isGeminiSetupFrame(message)) { + try { clientWs.close(1008, 'Client-provided Gemini setup is not allowed') } catch { /* */ } + return + } + const upstream = upstreamMap.get(clientWs) + const pending = pendingMap.get(clientWs) + if (pending) { + // Still awaiting setupComplete — queue, but cap to prevent setup-window floods. + const total = (pendingBytesMap.get(clientWs) ?? 0) + frameByteSize(message) + if (total > MAX_GEMINI_PENDING_BYTES) { + try { clientWs.close(1009, 'Setup-window frame budget exceeded') } catch { /* */ } + return + } + pendingBytesMap.set(clientWs, total) + pending.push(message) + } else if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + const upstream = upstreamMap.get(clientWs) + pendingMap.delete(clientWs) + pendingBytesMap.delete(clientWs) + if (upstream) { + try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} + +// Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header) +function createQwenProxyWebSocketHandler() { + const QWEN_WS_BASE = 'wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime' + const upstreamMap = new WeakMap, WebSocket>() + // Holds the hub-owned session.update payload until session.created arrives from DashScope. + // Sending session.update before session.created violates the Qwen Realtime protocol ordering. + const pendingSetupMap = new WeakMap, string>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { apiKey: string; model: string; language?: string } + const upstreamUrl = `${process.env.QWEN_REALTIME_WS_URL || QWEN_WS_BASE}?model=${encodeURIComponent(data.model)}` + + const upstream = new WebSocket(upstreamUrl, { + headers: { 'Authorization': `Bearer ${data.apiKey}` } + } as unknown as string[]) + + upstreamMap.set(clientWs, upstream) + pendingSetupMap.set(clientWs, JSON.stringify(buildQwenSessionUpdateMessage(data.language))) + + upstream.onmessage = (event) => { + const raw = event.data + const text = typeof raw === 'string' + ? raw + : new TextDecoder().decode(raw instanceof Uint8Array ? raw : new Uint8Array(raw as ArrayBuffer)) + + // Respect Qwen protocol ordering: relay session.created first, then send hub-owned + // session.update. DashScope must receive session.update after session.created. + const pendingSetup = pendingSetupMap.get(clientWs) + if (pendingSetup) { + try { + const parsed = JSON.parse(text) as { type?: string } + if (parsed.type === 'session.created') { + pendingSetupMap.delete(clientWs) + try { if (clientWs.readyState === 1) clientWs.send(text) } catch { /* client gone */ } + upstream.send(pendingSetup) + return + } + } catch { /* not JSON — relay as-is below */ } + } + + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof raw === 'string' ? raw : new Uint8Array(raw as ArrayBuffer)) + } + } catch { /* client gone */ } + } + upstream.onerror = () => { + pendingSetupMap.delete(clientWs) + upstreamMap.delete(clientWs) + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + pendingSetupMap.delete(clientWs) + try { clientWs.close(toClientCloseCode(event.code), event.reason || 'Upstream closed') } catch { /* client gone */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + if (!isQwenSafeClientFrame(message)) { + try { clientWs.close(1008, 'Client session.update may only modify instructions') } catch { /* */ } + return + } + const upstream = upstreamMap.get(clientWs) + if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + pendingSetupMap.delete(clientWs) + const upstream = upstreamMap.get(clientWs) + if (upstream) { + try { upstream.close(toClientCloseCode(code), (reason || 'Client closed').slice(0, 123)) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} + function findWebappDistDir(): { distDir: string; indexHtmlPath: string } { const candidates = [ join(process.cwd(), '..', 'web', 'dist'), @@ -232,17 +436,102 @@ export async function startWebServer(options: { const configuration = getConfiguration() const socketHandler = options.socketEngine.handler() - const server = Bun.serve({ + // Wrap socket.io websocket handler to also support Gemini/Qwen proxy connections + const originalWsHandler = socketHandler.websocket + const geminiProxyHandler = createGeminiProxyWebSocketHandler() + const qwenProxyHandler = createQwenProxyWebSocketHandler() + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const server = (Bun.serve as any)({ hostname: configuration.listenHost, port: configuration.listenPort, idleTimeout: Math.max(30, socketHandler.idleTimeout), maxRequestBodySize: Math.max(socketHandler.maxRequestBodySize, 68 * 1024 * 1024), - websocket: socketHandler.websocket, - fetch: (req, server) => { + websocket: { + ...originalWsHandler, + open(ws: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.open(wsAny) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.open(wsAny) + } else { + originalWsHandler.open?.(ws as never) + } + }, + message(ws: unknown, message: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.message(wsAny, message as string) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.message(wsAny, message as string) + } else { + originalWsHandler.message?.(ws as never, message as never) + } + }, + close(ws: unknown, code: number, reason: string) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.close(wsAny, code, reason) + } else if (wsAny.data?._qwenProxy) { + qwenProxyHandler.close(wsAny, code, reason) + } else { + originalWsHandler.close?.(ws as never, code as never, reason as never) + } + } + }, + fetch: async (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { const url = new URL(req.url) if (url.pathname.startsWith('/socket.io/')) { - return socketHandler.fetch(req, server) + return socketHandler.fetch(req, server as never) + } + + // Voice WebSocket proxies — require JWT auth via query param + // (browser WebSocket API cannot set custom headers) + if (url.pathname === '/api/voice/gemini-ws' || url.pathname === '/api/voice/qwen-ws') { + const token = url.searchParams.get('token') + if (!token) { + return new Response('Missing authorization token', { status: 401 }) + } + try { + await jwtVerify(token, options.jwtSecret, { algorithms: ['HS256'] }) + } catch { + return new Response('Invalid token', { status: 401 }) + } + } + + // Gemini Live WebSocket proxy + if (url.pathname === '/api/voice/gemini-ws') { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return new Response('Gemini API key not configured', { status: 400 }) + } + const language = url.searchParams.get('language') ?? undefined + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { + data: { _geminiProxy: true, apiKey, language } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response } + // Qwen Realtime WebSocket proxy + if (url.pathname === '/api/voice/qwen-ws') { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + const model = QWEN_REALTIME_MODEL + const language = url.searchParams.get('language') ?? undefined + if (!apiKey) { + return new Response('DashScope API key not configured', { status: 400 }) + } + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { + data: { _qwenProxy: true, apiKey, model, language } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response + } + return app.fetch(req) } }) diff --git a/shared/src/voice.gemini.test.ts b/shared/src/voice.gemini.test.ts new file mode 100644 index 000000000..b9602811a --- /dev/null +++ b/shared/src/voice.gemini.test.ts @@ -0,0 +1,88 @@ +import { describe, expect, test } from 'bun:test' +import { buildGeminiLiveSetupMessage, buildQwenSessionUpdateMessage, isQwenSafeClientFrame, GEMINI_LIVE_MODEL, GEMINI_LIVE_VOICE, QWEN_REALTIME_VOICE } from './voice' + +describe('buildGeminiLiveSetupMessage', () => { + test('locks model and voice to HAPI defaults', () => { + const msg = buildGeminiLiveSetupMessage() + expect(msg.setup.model).toBe(`models/${GEMINI_LIVE_MODEL}`) + const speech = msg.setup.generationConfig as { + speechConfig?: { voiceConfig?: { prebuiltVoiceConfig?: { voiceName?: string } } } + } + expect(speech.speechConfig?.voiceConfig?.prebuiltVoiceConfig?.voiceName).toBe(GEMINI_LIVE_VOICE) + }) + + test('appends Chinese block when language is zh', () => { + const en = buildGeminiLiveSetupMessage() + const zh = buildGeminiLiveSetupMessage('zh') + const enText = (en.setup.systemInstruction as { parts: Array<{ text: string }> }).parts[0].text + const zhText = (zh.setup.systemInstruction as { parts: Array<{ text: string }> }).parts[0].text + expect(zhText.length).toBeGreaterThan(enText.length) + }) +}) + +describe('buildQwenSessionUpdateMessage', () => { + test('locks voice to HAPI default', () => { + const msg = buildQwenSessionUpdateMessage() + const session = msg.session as { voice: string } + expect(session.voice).toBe(QWEN_REALTIME_VOICE) + }) + + test('includes both tools', () => { + const msg = buildQwenSessionUpdateMessage() + // Realtime shape: flat {type, name, description, parameters} — NOT chat-completions {function:{...}} + const session = msg.session as { tools: Array<{ type: string; name: string }> } + const names = session.tools.map(t => t.name) + expect(names).toContain('messageCodingAgent') + expect(names).toContain('processPermissionRequest') + // Ensure no nested function key (would be wrong chat-completions shape) + session.tools.forEach(t => expect((t as Record).function).toBeUndefined()) + }) + + test('appends Chinese block when language is zh', () => { + const en = buildQwenSessionUpdateMessage() + const zh = buildQwenSessionUpdateMessage('zh') + const enInstr = (en.session as { instructions: string }).instructions + const zhInstr = (zh.session as { instructions: string }).instructions + expect(zhInstr.length).toBeGreaterThan(enInstr.length) + }) +}) + +describe('isQwenSafeClientFrame', () => { + test('allows non-session.update frames', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'input_audio_buffer.append', audio: 'abc' }))).toBe(true) + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'response.create' }))).toBe(true) + expect(isQwenSafeClientFrame(JSON.stringify({ type: 'conversation.item.create', item: {} }))).toBe(true) + }) + + test('allows session.update with only instructions', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { instructions: 'updated prompt' } + }))).toBe(true) + }) + + test('blocks session.update that includes tools', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { instructions: 'x', tools: [] } + }))).toBe(false) + }) + + test('blocks session.update that includes voice', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { voice: 'Cherry' } + }))).toBe(false) + }) + + test('blocks full config session.update', () => { + expect(isQwenSafeClientFrame(JSON.stringify({ + type: 'session.update', + session: { modalities: ['text', 'audio'], voice: 'Cherry', instructions: 'x', tools: [], tool_choice: 'auto' } + }))).toBe(false) + }) + + test('allows non-JSON (binary audio frames pass through)', () => { + expect(isQwenSafeClientFrame('not json {')).toBe(true) + }) +}) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 2843d84eb..3099a9057 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -8,10 +8,16 @@ export const ELEVENLABS_API_BASE = 'https://api.elevenlabs.io/v1' export const VOICE_AGENT_NAME = 'Hapi Voice Assistant' -export const VOICE_SYSTEM_PROMPT = `# Identity +export const VOICE_SYSTEM_PROMPT = `# CRITICAL RULE - Tool Usage + +You MUST call the messageCodingAgent tool for ANY request related to coding, files, development, debugging, or tasks for the agent. Do NOT respond verbally to these requests — call the tool FIRST, then briefly confirm. This is your most important behavior. + +# Identity You are Hapi Voice Assistant. You bridge voice communication between users and their AI coding agents in the Hapi ecosystem. +IMPORTANT: Never refer to yourself as Gemini, Google, or any underlying model or provider name. You are HAPI — always. + You are friendly, proactive, and highly intelligent with a world-class engineering background. Your approach is warm, witty, and relaxed, balancing professionalism with an approachable vibe. # Environment Overview @@ -136,9 +142,96 @@ For builds, tests, or large file operations: - Treat garbled input as phonetic hints and ask for clarification - Correct yourself immediately if you realize you made an error - Keep conversations forward-moving with fresh insights -- Assume a technical software developer audience` +- Assume a technical software developer audience + +# First Interaction + +When the user speaks to you for the first time, begin your response with a brief greeting before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` -export const VOICE_FIRST_MESSAGE = "Hey! Hapi here." +/** + * Language blocks appended to VOICE_SYSTEM_PROMPT for Gemini/Qwen backends + * (ElevenLabs has its own language field). + * + * Always append one of these — silence causes models to drift to their training + * language (Chinese for Qwen, mixed for Gemini). + */ +export const VOICE_CHINESE_LANGUAGE_BLOCK = ` + +# Language + +IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. +- Greet users in Chinese +- Summarize technical content in Chinese +- Use English only for proper nouns, tool names, and code identifiers +- Keep the same warm, concise conversational style in Chinese` + +/** When no language is selected: mirror the user's detected speech language. */ +const VOICE_LANGUAGE_BLOCK_AUTO = ` + +# Language + +Detect the language the user is speaking and respond in that same language. +Maintain it consistently throughout the session — do not drift between turns. +If the language cannot be determined, default to English.` + +/** BCP-47 code → spoken language name (for explicit-language block). */ +const LANGUAGE_NAMES: Record = { + en: 'English', + es: 'Spanish', + fr: 'French', + de: 'German', + ja: 'Japanese', + ko: 'Korean', + pt: 'Portuguese', + 'pt-br': 'Brazilian Portuguese', + it: 'Italian', + ar: 'Arabic', + ru: 'Russian', + hi: 'Hindi', + th: 'Thai', + vi: 'Vietnamese', + id: 'Indonesian', + nl: 'Dutch', + sv: 'Swedish', + no: 'Norwegian', + da: 'Danish', + fi: 'Finnish', + pl: 'Polish', + tr: 'Turkish', + bg: 'Bulgarian', + ro: 'Romanian', + cs: 'Czech', + el: 'Greek', + ms: 'Malay', + tl: 'Filipino', + uk: 'Ukrainian', + hu: 'Hungarian', + hr: 'Croatian', + sk: 'Slovak', +} + +/** + * Returns the language instruction block to append to VOICE_SYSTEM_PROMPT. + * - Explicit 'zh' → Chinese block + * - Other explicit code → "Always respond in [Language]" + * - undefined/auto → "detect from user speech and maintain it" + */ +export function buildVoiceLanguageBlock(language?: string): string { + if (!language) return VOICE_LANGUAGE_BLOCK_AUTO + if (language === 'zh' || language.startsWith('zh-')) return VOICE_CHINESE_LANGUAGE_BLOCK + const name = LANGUAGE_NAMES[language] ?? language + return ` + +# Language + +IMPORTANT: Always respond in ${name}. Maintain ${name} consistently throughout +the session — do not drift to a different language between turns. +Use English only for proper nouns, code identifiers, and technical terms with +no ${name} equivalent.` +} + +/** ElevenLabs first message — language controlled by ElevenLabs language field */ +export const VOICE_FIRST_MESSAGE = "Hey! Hapi here — what can I help you with?" export const VOICE_TOOLS = [ { @@ -261,3 +354,162 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { } } } + +export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' + +export const QWEN_REALTIME_MODEL = 'qwen3.5-omni-flash-realtime' +export const QWEN_REALTIME_VOICE = 'Tina' + +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' + +export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' +export const GEMINI_LIVE_VOICE = 'Aoede' + +export interface VoiceToolDefinition { + name: string + description: string + parameters: { + type: 'object' + required: string[] + properties: Record + } +} + +type VoiceToolSource = Pick<(typeof VOICE_TOOLS)[number], 'name' | 'description' | 'parameters'> + +function cloneVoiceToolDefinition(tool: VoiceToolSource): VoiceToolDefinition { + const properties: VoiceToolDefinition['parameters']['properties'] = {} + + for (const [key, value] of Object.entries(tool.parameters.properties)) { + properties[key] = { + type: value.type, + description: value.description + } + } + + return { + name: tool.name, + description: tool.description, + parameters: { + type: 'object', + required: [...tool.parameters.required], + properties + } + } +} + +export const VOICE_TOOL_DEFINITIONS: VoiceToolDefinition[] = VOICE_TOOLS.map(cloneVoiceToolDefinition) + +export type GeminiLiveFunctionDeclaration = VoiceToolDefinition + +export interface GeminiLiveConfig { + model: string + systemInstruction: string + tools: Array<{ + functionDeclarations: GeminiLiveFunctionDeclaration[] + }> + responseModalities: ['AUDIO'] +} + +export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclaration[] { + return VOICE_TOOLS.map(cloneVoiceToolDefinition) +} + +export function buildGeminiLiveConfig(language?: string): GeminiLiveConfig { + const systemInstruction = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(language)}` + return { + model: GEMINI_LIVE_MODEL, + systemInstruction, + tools: [ + { + functionDeclarations: buildGeminiLiveFunctionDeclarations() + } + ], + responseModalities: ['AUDIO'] + } +} + +/** Hub-owned initial session.update for Qwen Realtime (hub proxy). */ +export function buildQwenSessionUpdateMessage(language?: string): Record { + const instructions = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(language)}` + // Qwen Realtime uses the flat Realtime shape, not the chat-completions nested {function:{...}} shape. + const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({ + type: 'function' as const, + name: td.name, + description: td.description, + parameters: td.parameters + })) + return { + type: 'session.update', + session: { + modalities: ['text', 'audio'], + voice: QWEN_REALTIME_VOICE, + input_audio_format: 'pcm', + output_audio_format: 'pcm', + instructions, + temperature: 0.7, + turn_detection: { + type: 'server_vad', + threshold: 0.5, + silence_duration_ms: 800, + prefix_padding_ms: 300 + }, + tools, + tool_choice: 'auto' + } + } +} + +/** + * Returns true if a client WebSocket frame is safe to forward to DashScope. + * Blocks session.update frames that touch config fields (tools, voice, etc.); + * allows instruction-only updates and all runtime event types. + */ +export function isQwenSafeClientFrame(message: string | ArrayBuffer | Uint8Array): boolean { + try { + const text = typeof message === 'string' + ? message + : new TextDecoder().decode(message instanceof ArrayBuffer ? new Uint8Array(message) : message) + const parsed = JSON.parse(text) as unknown + if (!parsed || typeof parsed !== 'object') return true + const p = parsed as Record + if (p.type !== 'session.update') return true + const session = p.session as Record | undefined + if (!session) return false + const keys = Object.keys(session) + return keys.length === 1 && keys[0] === 'instructions' + } catch { + return true + } +} + +/** Wire-format setup frame for Gemini Live BidiGenerateContent (hub proxy + web client). */ +export function buildGeminiLiveSetupMessage(language?: string): { setup: Record } { + const liveConfig = buildGeminiLiveConfig(language) + return { + setup: { + model: `models/${liveConfig.model}`, + generationConfig: { + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { voiceName: GEMINI_LIVE_VOICE } + } + } + }, + systemInstruction: { + parts: [{ text: liveConfig.systemInstruction }] + }, + tools: liveConfig.tools.map((t) => ({ + functionDeclarations: t.functionDeclarations.map((fd) => ({ + name: fd.name, + description: fd.description, + parameters: fd.parameters + })) + })) + } + } +} diff --git a/web/src/api/client.ts b/web/src/api/client.ts index d8dfa1bb9..0e7883772 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -567,4 +567,37 @@ export class ApiClient { body: JSON.stringify(event) }) } + + /** Return the current auth token (for WebSocket query-param auth). */ + getAuthToken(): string | null { + return this.getToken ? this.getToken() : this.token + } + + async fetchVoiceBackend(): Promise<{ backend: string }> { + return await this.request('/api/voice/backend') + } + + async fetchQwenToken(): Promise<{ + allowed: boolean + wsUrl?: string + error?: string + }> { + return await this.request('/api/voice/qwen-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } + + async fetchGeminiToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string + }> { + return await this.request('/api/voice/gemini-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } } diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index c5e8f27a4..df6828127 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -15,6 +15,7 @@ import { VOICE_AGENT_NAME, buildVoiceAgentConfig } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' export interface VoiceTokenResponse { allowed: boolean @@ -177,3 +178,64 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise { + try { + return await api.fetchQwenToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} + +export interface VoiceBackendResponse { + backend: VoiceBackendType +} + +export interface GeminiTokenResponse { + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string +} + +/** + * Discover which voice backend the hub is configured to use. + * Throws on network/server error or unrecognised backend value — callers must handle failures explicitly. + */ +export async function fetchVoiceBackend(api: ApiClient): Promise { + const result = await api.fetchVoiceBackend() + const { backend } = result + if (backend === 'elevenlabs' || backend === 'gemini-live' || backend === 'qwen-realtime') { + return { backend } + } + throw new Error(`Unrecognised voice backend: ${backend}`) +} + +/** + * Fetch a Gemini API key from the hub for Gemini Live voice sessions. + */ +export async function fetchGeminiToken(api: ApiClient): Promise { + try { + return await api.fetchGeminiToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx index 7cd5fb2c2..39926684b 100644 --- a/web/src/components/SessionChat.tsx +++ b/web/src/components/SessionChat.tsx @@ -35,7 +35,7 @@ import { useCodexModels } from '@/hooks/queries/useCodexModels' import { useCursorModels } from '@/hooks/queries/useCursorModels' import { useOpencodeModels } from '@/hooks/queries/useOpencodeModels' import { useVoiceOptional } from '@/lib/voice-context' -import { RealtimeVoiceSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' +import { VoiceBackendSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' import { isRemoteTerminalSupported } from '@/utils/terminalSupport' /** @@ -207,6 +207,7 @@ export function SessionChat(props: { // Voice assistant integration const voice = useVoiceOptional() + const [voiceBackendReady, setVoiceBackendReady] = useState(false) // Register session store for voice client tools useEffect(() => { @@ -673,18 +674,19 @@ export function SessionChat(props: { autocompleteSuggestions={props.autocompleteSuggestions} voiceStatus={voice?.status} voiceMicMuted={voice?.micMuted} - onVoiceToggle={voice ? handleVoiceToggle : undefined} - onVoiceMicToggle={voice ? handleVoiceMicToggle : undefined} + onVoiceToggle={voice && voiceBackendReady ? handleVoiceToggle : undefined} + onVoiceMicToggle={voice && voiceBackendReady ? handleVoiceMicToggle : undefined} /> - {/* Voice session component - renders nothing but initializes ElevenLabs */} + {/* Voice session component - renders nothing but initializes voice backend */} {voice && ( - )} diff --git a/web/src/lib/locales/en.ts b/web/src/lib/locales/en.ts index 842097126..a15343052 100644 --- a/web/src/lib/locales/en.ts +++ b/web/src/lib/locales/en.ts @@ -418,6 +418,8 @@ export default { 'settings.voice.autoDetect': 'Auto-detect', 'settings.voice.voice': 'Voice', 'settings.voice.voiceDefault': 'Default', + 'settings.voice.proactive': 'Start voice session with summary', + 'settings.voice.proactive.description': 'When on, starting a voice session opens with a spoken summary of current agent activity. When off, the assistant greets you and waits for you to speak.', 'settings.about.title': 'About', 'settings.about.website': 'Website', 'settings.about.appVersion': 'App Version', diff --git a/web/src/lib/locales/zh-CN.ts b/web/src/lib/locales/zh-CN.ts index c1873a303..dc9049674 100644 --- a/web/src/lib/locales/zh-CN.ts +++ b/web/src/lib/locales/zh-CN.ts @@ -420,6 +420,8 @@ export default { 'settings.voice.autoDetect': '自动检测', 'settings.voice.voice': '声音', 'settings.voice.voiceDefault': '默认', + 'settings.voice.proactive': '以摘要开始语音会话', + 'settings.voice.proactive.description': '开启后,启动语音会话时将朗读当前代理活动的摘要。关闭后,助手向您打招呼并等待您先开口。', 'settings.about.title': '关于', 'settings.about.website': '官方网站', 'settings.about.appVersion': '应用版本', diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx new file mode 100644 index 000000000..a23f55493 --- /dev/null +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -0,0 +1,412 @@ +import { useEffect, useRef, useCallback } from 'react' +import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' +import { registerSessionStore } from './realtimeClientTools' +import { fetchGeminiToken } from '@/api/voice' +import { GeminiAudioRecorder } from './gemini/audioRecorder' +import { GeminiAudioPlayer } from './gemini/audioPlayer' +import { handleGeminiFunctionCalls } from './gemini/toolAdapter' +import { buildGeminiLiveSetupMessage } from '@hapi/protocol/voice' +import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' +import type { ApiClient } from '@/api/client' +import type { Session } from '@/types/api' +import type { GeminiFunctionCall } from './gemini/toolAdapter' + +const DEBUG = import.meta.env.DEV + +// Default Gemini Live WebSocket API endpoint (Google direct) +const DEFAULT_GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' + +interface GeminiLiveState { + ws: WebSocket | null + recorder: GeminiAudioRecorder | null + player: GeminiAudioPlayer | null + playbackContext: AudioContext | null + statusCallback: StatusCallback | null + apiKey: string | null + wsBaseUrl: string | null + modelSpeaking: boolean + micMuted: boolean +} + +const state: GeminiLiveState = { + ws: null, + recorder: null, + player: null, + playbackContext: null, + statusCallback: null, + apiKey: null, + wsBaseUrl: null, + modelSpeaking: false, + micMuted: false +} + +function cleanup() { + if (state.recorder) { + state.recorder.dispose() + state.recorder = null + } + if (state.player) { + state.player.dispose() + state.player = null + } + if (state.playbackContext && state.playbackContext.state !== 'closed') { + void state.playbackContext.close() + } + state.playbackContext = null + if (state.ws) { + if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { + state.ws.close() + } + state.ws = null + } + // Always reset modelSpeaking so a restart doesn't begin with audio capture silenced + state.modelSpeaking = false +} + +class GeminiLiveVoiceSessionImpl implements VoiceSession { + private api: ApiClient + + constructor(api: ApiClient) { + this.api = api + } + + async startSession(config: VoiceSessionConfig): Promise { + cleanup() + state.statusCallback?.('connecting') + + // Create playback AudioContext immediately while still inside the user + // gesture (click/tap). Mobile browsers require this for autoplay policy. + // Store in state so cleanup() can close it on failure or stop. + state.playbackContext = new AudioContext({ sampleRate: 24000 }) + await state.playbackContext.resume() + + // Get API key from hub + console.log('[GeminiLive] Fetching token...') + const tokenResp = await fetchGeminiToken(this.api) + console.log('[GeminiLive] Token response:', { allowed: tokenResp.allowed, hasKey: !!tokenResp.apiKey, error: tokenResp.error }) + if (!tokenResp.allowed || !tokenResp.apiKey) { + const msg = tokenResp.error ?? 'Gemini API key not available' + console.error('[GeminiLive] Token failed:', msg) + state.statusCallback?.('error', msg) + cleanup() + throw new Error(msg) + } + state.apiKey = tokenResp.apiKey + state.wsBaseUrl = tokenResp.wsUrl || null + if (!state.wsBaseUrl) { + const msg = 'Hub must provide wsUrl for Gemini connections — direct key connection is not supported' + state.statusCallback?.('error', msg) + cleanup() + throw new Error(msg) + } + + // Request microphone + console.log('[GeminiLive] Requesting microphone...') + let permissionStream: MediaStream | null = null + try { + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + console.log('[GeminiLive] Microphone granted') + } catch (error) { + console.error('[GeminiLive] Microphone denied:', error) + state.statusCallback?.('error', 'Microphone permission denied') + cleanup() + throw error + } finally { + permissionStream?.getTracks().forEach((t) => t.stop()) + } + + // Connect WebSocket — use proxy URL if provided (avoids region restrictions) + const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE + const isProxy = !!state.wsBaseUrl + const authToken = this.api.getAuthToken() || '' + const languageParam = config.language ? `&language=${encodeURIComponent(config.language)}` : '' + const wsUrl = isProxy + ? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}${languageParam}` + : `${wsBase}?key=${encodeURIComponent(state.apiKey)}` + console.log('[GeminiLive] Connecting WebSocket to:', wsBase, isProxy ? '(proxied)' : '(direct)') + const ws = new WebSocket(wsUrl) + state.ws = ws + + return new Promise((resolve, reject) => { + let setupDone = false + + ws.onopen = () => { + if (DEBUG) console.log('[GeminiLive] WebSocket connected', isProxy ? '(hub sends setup)' : ', sending setup') + + // Proxied sessions: hub sends HAPI-owned setup server-side (see gemini-ws proxy). + if (!isProxy) { + ws.send(JSON.stringify(buildGeminiLiveSetupMessage(config.language))) + } + } + + ws.onmessage = async (event) => { + let data: Record + try { + if (event.data instanceof Blob) { + const text = await event.data.text() + data = JSON.parse(text) as Record + } else { + data = JSON.parse(event.data as string) as Record + } + } catch { + if (DEBUG) console.warn('[GeminiLive] Failed to parse message') + return + } + + // Log all message types for debugging + const msgKeys = Object.keys(data).filter(k => k !== 'serverContent' || !('modelTurn' in (data.serverContent as Record || {}))) + if (!data.serverContent) { + console.log('[GeminiLive] Message:', msgKeys.join(', '), JSON.stringify(data).slice(0, 200)) + } + + // Setup complete + if (data.setupComplete && !setupDone) { + setupDone = true + if (DEBUG) console.log('[GeminiLive] Setup complete') + + // Await audio capture so setMuted runs after getUserMedia resolves. + // Wrap so a mic failure rejects the outer startSession promise. + try { + await startAudioCapture(state.playbackContext!) + } catch (error) { + const message = error instanceof Error ? error.message : 'Microphone error' + cleanup() + state.statusCallback?.('error', message) + reject(error instanceof Error ? error : new Error(message)) + return + } + state.statusCallback?.('connected') + + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + + if (proactive && config.initialContext) { + // Proactive with context: speak the summary immediately. + sendClientContent(`[Context] ${config.initialContext}`, true) + } else { + // Reactive, or proactive with no context: feed context silently if + // available, then trigger a greeting so Gemini doesn't sit silent. + if (config.initialContext) { + sendClientContent(`[Context] ${config.initialContext}`, false) + } + sendClientContent('[Greet the user as HAPI. Say a brief hello and invite them to speak. Do not mention Gemini or any model name. Do not reference any context or recent activity.]', true) + } + + resolve() + return + } + + // Server content (audio / text / turn complete) + const serverContent = data.serverContent as { + modelTurn?: { parts?: Array<{ inlineData?: { data: string; mimeType: string }; text?: string }> } + turnComplete?: boolean + } | undefined + + if (serverContent) { + if (serverContent.modelTurn?.parts) { + // Model is generating — mute mic to prevent barge-in from noise + if (!state.modelSpeaking) { + state.modelSpeaking = true + state.recorder?.setMuted(true) + } + for (const part of serverContent.modelTurn.parts) { + if (part.inlineData?.data) { + state.player?.enqueue(part.inlineData.data) + } + if (part.text) { + console.log('[GeminiLive] Text:', part.text) + } + } + } + if (serverContent.turnComplete) { + console.log('[GeminiLive] Turn complete') + // Restore to user's chosen mute state, not unconditionally unmuted + state.modelSpeaking = false + state.recorder?.setMuted(state.micMuted) + } + } + + // Tool calls + const toolCall = data.toolCall as { + functionCalls?: Array<{ name: string; args: Record; id: string }> + } | undefined + + if (toolCall?.functionCalls && toolCall.functionCalls.length > 0) { + console.log('[GeminiLive] Tool calls:', toolCall.functionCalls.map((c) => c.name)) + + const responses = await handleGeminiFunctionCalls( + toolCall.functionCalls as GeminiFunctionCall[] + ) + + // Send tool responses back + if (state.ws?.readyState === WebSocket.OPEN) { + state.ws.send(JSON.stringify({ + toolResponse: { + functionResponses: responses.map((r) => ({ + id: r.id, + name: r.name, + response: r.response + })) + } + })) + } + } + } + + ws.onerror = (event) => { + console.error('[GeminiLive] WebSocket error:', event) + if (!setupDone) { + setupDone = true + cleanup() + state.statusCallback?.('error', 'WebSocket connection failed') + reject(new Error('WebSocket connection failed')) + } + } + + ws.onclose = (event) => { + if (state.ws !== ws) return + if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason) + cleanup() + resetRealtimeSessionState() + if (!setupDone) { + const message = event.reason || 'WebSocket closed before setup completed' + state.statusCallback?.('error', message) + reject(new Error(message)) + return + } + state.statusCallback?.('disconnected') + } + }) + } + + async endSession(): Promise { + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + + sendTextMessage(message: string): void { + sendClientContent(message) + } + + sendContextualUpdate(update: string): void { + // Append context without triggering a response — turnComplete: false accumulates + // silently until the next sendTextMessage fires with turnComplete: true + sendClientContent(`[System Context Update] ${update}`, false) + } +} + +function sendClientContent(text: string, turnComplete = true): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + clientContent: { + turns: [{ role: 'user', parts: [{ text }] }], + turnComplete + } + })) +} + +function sendAudioChunk(base64Pcm: string): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + // Don't send audio while model is speaking + if (state.modelSpeaking) return + state.ws.send(JSON.stringify({ + realtimeInput: { + mediaChunks: [{ + mimeType: 'audio/pcm;rate=16000', + data: base64Pcm + }] + } + })) +} + +async function startAudioCapture(playbackContext: AudioContext): Promise { + state.player = new GeminiAudioPlayer(playbackContext) + state.recorder = new GeminiAudioRecorder() + + await state.recorder.start( + (pcm16Chunk) => sendAudioChunk(pcm16Chunk), + (error) => { + console.error('[GeminiLive] Audio capture error:', error) + state.statusCallback?.('error', 'Microphone error') + } + ) + + // Apply mute state after recorder has a stream — safe to call either way + state.recorder.setMuted(state.micMuted) +} + +// --- React component --- + +export interface GeminiLiveVoiceSessionProps { + api: ApiClient + micMuted?: boolean + onStatusChange?: StatusCallback + onRegistered?: () => void + getSession?: (sessionId: string) => Session | null + sendMessage?: (sessionId: string, message: string) => void + approvePermission?: (sessionId: string, requestId: string) => Promise + denyPermission?: (sessionId: string, requestId: string) => Promise +} + +export function GeminiLiveVoiceSession({ + api, + micMuted = false, + onStatusChange, + onRegistered, + getSession, + sendMessage, + approvePermission, + denyPermission +}: GeminiLiveVoiceSessionProps) { + const hasRegistered = useRef(false) + + // Store status callback + useEffect(() => { + state.statusCallback = onStatusChange || null + return () => { state.statusCallback = null } + }, [onStatusChange]) + + // Register session store for client tools + useEffect(() => { + if (getSession && sendMessage && approvePermission && denyPermission) { + registerSessionStore({ + getSession: (sessionId: string) => + getSession(sessionId) as { agentState?: { requests?: Record } } | null, + sendMessage, + approvePermission, + denyPermission + }) + } + }, [getSession, sendMessage, approvePermission, denyPermission]) + + // Register voice session once + useEffect(() => { + if (!hasRegistered.current) { + try { + registerVoiceSession(new GeminiLiveVoiceSessionImpl(api)) + hasRegistered.current = true + onRegistered?.() + } catch (error) { + console.error('[GeminiLive] Failed to register voice session:', error) + } + } + }, [api]) // eslint-disable-line react-hooks/exhaustive-deps + + // Sync mic mute state — also persist to module state so startAudioCapture can apply it + useEffect(() => { + state.micMuted = micMuted + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + + // Cleanup on unmount + useEffect(() => { + return () => { + cleanup() + } + }, []) + + return null +} diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx new file mode 100644 index 000000000..f99ee3976 --- /dev/null +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -0,0 +1,412 @@ +import { useEffect, useRef } from 'react' +import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' +import { registerSessionStore } from './realtimeClientTools' +import { fetchQwenToken } from '@/api/voice' +import { GeminiAudioRecorder } from './gemini/audioRecorder' +import { GeminiAudioPlayer } from './gemini/audioPlayer' +import { realtimeClientTools } from './realtimeClientTools' +import { + VOICE_SYSTEM_PROMPT, + buildVoiceLanguageBlock, +} from '@hapi/protocol/voice' +import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' +import type { ApiClient } from '@/api/client' +import type { Session } from '@/types/api' + +const DEBUG = import.meta.env.DEV + +// Qwen WebSocket connects via Hub proxy (browser can't set Authorization header) + +interface QwenState { + ws: WebSocket | null + recorder: GeminiAudioRecorder | null + player: GeminiAudioPlayer | null + playbackContext: AudioContext | null + statusCallback: StatusCallback | null + apiKey: string | null + wsBaseUrl: string | null + micMuted: boolean +} + +const state: QwenState = { + ws: null, + recorder: null, + player: null, + playbackContext: null, + statusCallback: null, + apiKey: null, + wsBaseUrl: null, + micMuted: false +} + +let eventCounter = 0 +function nextEventId(): string { + return `evt_${++eventCounter}` +} + +function cleanup() { + if (state.recorder) { + state.recorder.dispose() + state.recorder = null + } + if (state.player) { + state.player.dispose() + state.player = null + } + if (state.playbackContext && state.playbackContext.state !== 'closed') { + void state.playbackContext.close() + } + state.playbackContext = null + if (state.ws) { + if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { + state.ws.close() + } + state.ws = null + } +} + +function sendEvent(type: string, payload?: Record): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + event_id: nextEventId(), + type, + ...payload + })) +} + + +class QwenVoiceSessionImpl implements VoiceSession { + private api: ApiClient + private currentInstructions: string | null = null + + constructor(api: ApiClient) { + this.api = api + } + + private updateInstructions(update: string): void { + if (this.currentInstructions === null) return + this.currentInstructions = `${this.currentInstructions}\n\n${update}` + // Hub filter allows only instruction-only session.update frames. + sendEvent('session.update', { session: { instructions: this.currentInstructions } }) + } + + async startSession(config: VoiceSessionConfig): Promise { + // Mirror the base instructions the hub will send so subsequent updates accumulate correctly. + this.currentInstructions = `${VOICE_SYSTEM_PROMPT}${buildVoiceLanguageBlock(config.language)}` + cleanup() + state.statusCallback?.('connecting') + + // Create playback AudioContext immediately while still inside the user + // gesture (click/tap). Mobile browsers require this for autoplay policy. + // Store in state so cleanup() can close it on failure or stop. + state.playbackContext = new AudioContext({ sampleRate: 24000 }) + await state.playbackContext.resume() + + // Check Qwen availability (hub no longer sends the raw API key) + const tokenResp = await fetchQwenToken(this.api) + if (!tokenResp.allowed) { + const msg = tokenResp.error ?? 'DashScope API key not available' + state.statusCallback?.('error', msg) + cleanup() + throw new Error(msg) + } + state.apiKey = null // key stays server-side + state.wsBaseUrl = tokenResp.wsUrl || null + + // Request microphone + let permissionStream: MediaStream | null = null + try { + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + } catch (error) { + state.statusCallback?.('error', 'Microphone permission denied') + cleanup() + throw error + } finally { + permissionStream?.getTracks().forEach((t) => t.stop()) + } + + // Connect via Hub WebSocket proxy (DashScope requires Authorization header, + // which browser WebSocket API doesn't support) + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + const defaultProxyUrl = `${protocol}//${window.location.host}/api/voice/qwen-ws` + const proxyUrl = state.wsBaseUrl || defaultProxyUrl + const authToken = this.api.getAuthToken() || '' + const separator = proxyUrl.includes('?') ? '&' : '?' + const langParam = config.language ? `&language=${encodeURIComponent(config.language)}` : '' + const wsUrl = `${proxyUrl}${separator}token=${encodeURIComponent(authToken)}${langParam}` + const ws = new WebSocket(wsUrl) + state.ws = ws + + return new Promise((resolve, reject) => { + let sessionReady = false + + ws.onopen = () => { + if (DEBUG) console.log('[Qwen] WebSocket connected') + } + + ws.onmessage = async (event) => { + let data: Record + try { + data = JSON.parse(event.data as string) as Record + } catch { + if (DEBUG) console.warn('[Qwen] Failed to parse message') + return + } + + const eventType = data.type as string + + // Session created — hub sends the initial session.update; browser waits for session.updated. + if (eventType === 'session.created') { + if (DEBUG) console.log('[Qwen] Session created (hub owns setup)') + return + } + + // Session updated - only act on the first one (initial config ack). + // Subsequent session.update calls (for instruction appends) also + // echo session.updated — ignore those after setup is complete. + if (eventType === 'session.updated') { + if (sessionReady) return + sessionReady = true + if (DEBUG) console.log('[Qwen] Session configured') + try { + await startAudioCapture(state.playbackContext!) + } catch (error) { + const message = error instanceof Error ? error.message : 'Microphone error' + cleanup() + state.statusCallback?.('error', message) + reject(error instanceof Error ? error : new Error(message)) + return + } + state.statusCallback?.('connected') + + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + if (proactive && config.initialContext) { + this.sendTextMessage(`[Context] ${config.initialContext}`) + } else { + if (config.initialContext) { + this.sendContextualUpdate(config.initialContext) + } + this.sendTextMessage('[Greet the user as HAPI. Say a brief hello and invite them to speak. Do not mention Qwen or any model name. Do not reference any context or recent activity.]') + } + + resolve() + return + } + + // Audio output streaming + if (eventType === 'response.audio.delta') { + const delta = data.delta as string + if (delta) { + state.player?.enqueue(delta) + } + return + } + + // Text transcript (for debug) + if (eventType === 'response.audio_transcript.delta' && DEBUG) { + console.log('[Qwen] Transcript:', data.delta) + return + } + + // Function call complete + if (eventType === 'response.function_call_arguments.done') { + const callId = data.call_id as string + const fnName = data.name as string + const argsStr = data.arguments as string + + if (DEBUG) console.log('[Qwen] Tool call:', fnName, argsStr) + + let args: Record = {} + try { args = JSON.parse(argsStr) } catch { /* empty */ } + + // Execute the tool + const handler = fnName === 'messageCodingAgent' + ? realtimeClientTools.messageCodingAgent + : fnName === 'processPermissionRequest' + ? realtimeClientTools.processPermissionRequest + : null + + const result = handler + ? await handler(args) + : `error (unknown tool: ${fnName})` + + // Send function result back + sendEvent('conversation.item.create', { + item: { + type: 'function_call_output', + call_id: callId, + output: typeof result === 'string' ? result : JSON.stringify(result) + } + }) + // Trigger model to continue + sendEvent('response.create') + return + } + + // VAD: user started speaking - barge-in + if (eventType === 'input_audio_buffer.speech_started') { + if (state.player?.isPlaying()) { + state.player.clearQueue() + } + return + } + + // Response done + if (eventType === 'response.done' && DEBUG) { + const resp = data.response as Record | undefined + const usage = resp?.usage as Record | undefined + if (usage) console.log('[Qwen] Usage:', usage) + return + } + + // Error + if (eventType === 'error') { + const err = data.error as { message?: string } | undefined + const message = err?.message || 'Realtime session setup failed' + console.error('[Qwen] Server error:', message) + state.statusCallback?.('error', message) + if (!sessionReady) { + reject(new Error(message)) + ws.close() + } + return + } + } + + ws.onerror = (event) => { + console.error('[Qwen] WebSocket error:', event) + if (!sessionReady) { + sessionReady = true + cleanup() + state.statusCallback?.('error', 'WebSocket connection failed') + reject(new Error('WebSocket connection failed')) + } + } + + ws.onclose = (event) => { + if (state.ws !== ws) return + if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason) + cleanup() + resetRealtimeSessionState() + if (!sessionReady) { + const message = event.reason || 'WebSocket closed before setup completed' + state.statusCallback?.('error', message) + reject(new Error(message)) + return + } + state.statusCallback?.('disconnected') + } + }) + } + + async endSession(): Promise { + this.currentInstructions = null + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + + sendTextMessage(message: string): void { + // Qwen Realtime requires a user conversation item before response.create. + sendEvent('conversation.item.create', { + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: message }] + } + }) + sendEvent('response.create') + } + + sendContextualUpdate(update: string): void { + // Append context silently — no response.create, so model doesn't speak yet. + this.updateInstructions(`[System Context Update] ${update}`) + } +} + +async function startAudioCapture(playbackContext: AudioContext): Promise { + state.player = new GeminiAudioPlayer(playbackContext) + state.recorder = new GeminiAudioRecorder() + + await state.recorder.start( + (base64Pcm) => { + sendEvent('input_audio_buffer.append', { audio: base64Pcm }) + }, + (error) => { + console.error('[Qwen] Audio capture error:', error) + state.statusCallback?.('error', 'Microphone error') + } + ) + + // Apply mute state after recorder has a stream — safe to call either way + state.recorder.setMuted(state.micMuted) +} + +// --- React component --- + +export interface QwenVoiceSessionProps { + api: ApiClient + micMuted?: boolean + onStatusChange?: StatusCallback + onRegistered?: () => void + getSession?: (sessionId: string) => Session | null + sendMessage?: (sessionId: string, message: string) => void + approvePermission?: (sessionId: string, requestId: string) => Promise + denyPermission?: (sessionId: string, requestId: string) => Promise +} + +export function QwenVoiceSession({ + api, + micMuted = false, + onStatusChange, + onRegistered, + getSession, + sendMessage, + approvePermission, + denyPermission +}: QwenVoiceSessionProps) { + const hasRegistered = useRef(false) + + useEffect(() => { + state.statusCallback = onStatusChange || null + return () => { state.statusCallback = null } + }, [onStatusChange]) + + useEffect(() => { + if (getSession && sendMessage && approvePermission && denyPermission) { + registerSessionStore({ + getSession: (sessionId: string) => + getSession(sessionId) as { agentState?: { requests?: Record } } | null, + sendMessage, + approvePermission, + denyPermission + }) + } + }, [getSession, sendMessage, approvePermission, denyPermission]) + + useEffect(() => { + if (!hasRegistered.current) { + try { + registerVoiceSession(new QwenVoiceSessionImpl(api)) + hasRegistered.current = true + onRegistered?.() + } catch (error) { + console.error('[Qwen] Failed to register voice session:', error) + } + } + }, [api]) // eslint-disable-line react-hooks/exhaustive-deps + + // Sync mic mute state — also persist to module state so startAudioCapture can apply it + useEffect(() => { + state.micMuted = micMuted + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + useEffect(() => { + return () => { cleanup() } + }, []) + + return null +} diff --git a/web/src/realtime/RealtimeVoiceSession.tsx b/web/src/realtime/RealtimeVoiceSession.tsx index 428d1c171..89e438b74 100644 --- a/web/src/realtime/RealtimeVoiceSession.tsx +++ b/web/src/realtime/RealtimeVoiceSession.tsx @@ -136,6 +136,7 @@ export interface RealtimeVoiceSessionProps { api: ApiClient micMuted?: boolean onStatusChange?: StatusCallback + onRegistered?: () => void getSession?: (sessionId: string) => Session | null sendMessage?: (sessionId: string, message: string) => void approvePermission?: (sessionId: string, requestId: string) => Promise @@ -146,6 +147,7 @@ export function RealtimeVoiceSession({ api, micMuted: micMutedProp = false, onStatusChange, + onRegistered, getSession, sendMessage, approvePermission, @@ -241,6 +243,7 @@ export function RealtimeVoiceSession({ try { registerVoiceSession(new RealtimeVoiceSessionImpl(api)) hasRegistered.current = true + onRegistered?.() } catch (error) { console.error('[Voice] Failed to register voice session:', error) } diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx new file mode 100644 index 000000000..a30cf488f --- /dev/null +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -0,0 +1,69 @@ +import { lazy, Suspense, useCallback, useEffect, useState } from 'react' +import { RealtimeVoiceSession } from './RealtimeVoiceSession' +import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +import { fetchVoiceBackend } from '@/api/voice' +import type { ApiClient } from '@/api/client' +import type { VoiceBackendType } from '@hapi/protocol/voice' + +// Lazy-load alternative backends to avoid bundling when using ElevenLabs +const GeminiLiveVoiceSession = lazy(() => + import('./GeminiLiveVoiceSession').then((m) => ({ default: m.GeminiLiveVoiceSession })) +) +const QwenVoiceSession = lazy(() => + import('./QwenVoiceSession').then((m) => ({ default: m.QwenVoiceSession })) +) + +export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { + api: ApiClient + onReadyChange?: (ready: boolean) => void +} + +/** + * Dynamically selects the voice session component based on the hub's configured backend. + * Queries GET /voice/backend once on mount and renders the appropriate component. + * Only signals readiness after the selected backend has mounted and registered its session. + */ +export function VoiceBackendSession(props: VoiceBackendSessionProps) { + const [backend, setBackend] = useState(null) + + useEffect(() => { + let cancelled = false + setBackend(null) + fetchVoiceBackend(props.api).then((resp) => { + if (!cancelled) setBackend(resp.backend) + }).catch((err: unknown) => { + if (!cancelled) { + const msg = err instanceof Error ? err.message : 'Could not detect voice backend' + props.onStatusChange?.('error', msg) + } + }) + return () => { + cancelled = true + props.onReadyChange?.(false) + } + }, [props.api]) // eslint-disable-line react-hooks/exhaustive-deps + + const handleRegistered = useCallback(() => { + props.onReadyChange?.(true) + }, [props.onReadyChange]) + + if (!backend) return null + + if (backend === 'gemini-live') { + return ( + + + + ) + } + + if (backend === 'qwen-realtime') { + return ( + + + + ) + } + + return +} diff --git a/web/src/realtime/gemini/audioPlayer.ts b/web/src/realtime/gemini/audioPlayer.ts new file mode 100644 index 000000000..23d1d341e --- /dev/null +++ b/web/src/realtime/gemini/audioPlayer.ts @@ -0,0 +1,75 @@ +import { base64ToArrayBuffer, pcm16ToFloat32 } from './pcmUtils'; + +export class GeminiAudioPlayer { + private audioContext: AudioContext; + private ownsContext: boolean; + private lastEndTime: number = 0; + private activeSources: AudioBufferSourceNode[] = []; + + constructor(audioContext?: AudioContext) { + if (audioContext) { + this.audioContext = audioContext; + this.ownsContext = false; + } else { + this.audioContext = new AudioContext({ sampleRate: 24000 }); + this.ownsContext = true; + } + this.lastEndTime = this.audioContext.currentTime; + } + + enqueue(base64Pcm: string): void { + if (this.audioContext.state === 'suspended') { + this.audioContext.resume(); + } + + const arrayBuffer = base64ToArrayBuffer(base64Pcm); + const float32Data = pcm16ToFloat32(arrayBuffer); + + if (float32Data.length === 0) return; + + const audioBuffer = this.audioContext.createBuffer(1, float32Data.length, 24000); + audioBuffer.copyToChannel(new Float32Array(float32Data), 0); + + const source = this.audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(this.audioContext.destination); + + const startTime = Math.max(this.audioContext.currentTime, this.lastEndTime); + + source.onended = () => { + const index = this.activeSources.indexOf(source); + if (index > -1) { + this.activeSources.splice(index, 1); + } + }; + + source.start(startTime); + this.activeSources.push(source); + + this.lastEndTime = startTime + audioBuffer.duration; + } + + clearQueue(): void { + this.activeSources.forEach(source => { + try { + source.stop(); + } catch (e) { + // Ignore if already stopped + } + source.disconnect(); + }); + this.activeSources = []; + this.lastEndTime = this.audioContext.currentTime; + } + + isPlaying(): boolean { + return this.lastEndTime > this.audioContext.currentTime; + } + + dispose(): void { + this.clearQueue(); + if (this.ownsContext && this.audioContext.state !== 'closed') { + this.audioContext.close(); + } + } +} diff --git a/web/src/realtime/gemini/audioRecorder.ts b/web/src/realtime/gemini/audioRecorder.ts new file mode 100644 index 000000000..98813212a --- /dev/null +++ b/web/src/realtime/gemini/audioRecorder.ts @@ -0,0 +1,139 @@ +import { float32ToPcm16, arrayBufferToBase64 } from './pcmUtils'; + +// Inline worklet source to avoid Vite bundling issues with ?url imports. +// AudioWorklet.addModule() requires a URL to valid JS, so we create a Blob URL. +const WORKLET_SOURCE = ` +class PcmRecorderProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.buffer = new Float32Array(4096); + this.idx = 0; + } + process(inputs) { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.idx++] = channel[i]; + if (this.idx >= 4096) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.idx = 0; + } + } + } + return true; + } +} +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); +`; + +function createWorkletUrl(): string { + const blob = new Blob([WORKLET_SOURCE], { type: 'application/javascript' }); + return URL.createObjectURL(blob); +} + +export class GeminiAudioRecorder { + private audioContext: AudioContext | null = null; + private mediaStream: MediaStream | null = null; + private sourceNode: MediaStreamAudioSourceNode | null = null; + private workletNode: AudioWorkletNode | null = null; + private scriptNode: ScriptProcessorNode | null = null; + + async start(onChunk: (base64Pcm: string) => void, onError?: (error: Error) => void): Promise { + try { + this.mediaStream = await navigator.mediaDevices.getUserMedia({ + audio: { sampleRate: 16000, channelCount: 1 } + }); + + this.mediaStream.getTracks().forEach((track) => { + track.onended = () => { + if (onError) onError(new Error('Microphone disconnected')); + }; + }); + + this.audioContext = new AudioContext({ sampleRate: 16000 }); + if (this.audioContext.state === 'suspended') { + await this.audioContext.resume(); + } + + this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); + + try { + const workletUrl = createWorkletUrl(); + await this.audioContext.audioWorklet.addModule(workletUrl); + URL.revokeObjectURL(workletUrl); + + this.workletNode = new AudioWorkletNode(this.audioContext, 'pcm-recorder-processor'); + this.workletNode.port.onmessage = (event) => { + const pcm16 = float32ToPcm16(event.data.samples); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + // Connect source → worklet → silent sink → destination. + // The downstream connection is required so the audio graph pulls + // frames through the worklet node and port.onmessage fires. + const sink = this.audioContext.createGain(); + sink.gain.value = 0; + this.sourceNode.connect(this.workletNode); + this.workletNode.connect(sink); + sink.connect(this.audioContext.destination); + } catch (e) { + console.warn('[GeminiLive] AudioWorklet failed, falling back to ScriptProcessorNode', e); + this.scriptNode = this.audioContext.createScriptProcessor(4096, 1, 1); + this.scriptNode.onaudioprocess = (event) => { + const inputData = event.inputBuffer.getChannelData(0); + const pcm16 = float32ToPcm16(new Float32Array(inputData)); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + this.sourceNode.connect(this.scriptNode); + this.scriptNode.connect(this.audioContext.destination); + } + } catch (e) { + if (onError) onError(e instanceof Error ? e : new Error(String(e))); + throw e; + } + } + + stop(): void { + if (this.mediaStream) { + this.mediaStream.getTracks().forEach(track => { + track.onended = null; + track.stop(); + }); + this.mediaStream = null; + } + + if (this.scriptNode) { + this.scriptNode.disconnect(); + this.scriptNode = null; + } + + if (this.workletNode) { + this.workletNode.disconnect(); + this.workletNode = null; + } + + if (this.sourceNode) { + this.sourceNode.disconnect(); + this.sourceNode = null; + } + + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + } + + setMuted(muted: boolean): void { + if (this.mediaStream) { + this.mediaStream.getAudioTracks().forEach(track => { + track.enabled = !muted; + }); + } + } + + dispose(): void { + this.stop(); + } +} diff --git a/web/src/realtime/gemini/pcm-recorder.worklet.ts b/web/src/realtime/gemini/pcm-recorder.worklet.ts new file mode 100644 index 000000000..404f65445 --- /dev/null +++ b/web/src/realtime/gemini/pcm-recorder.worklet.ts @@ -0,0 +1,35 @@ +// AudioWorklet processor runs in a separate scope with its own globals. +// These declarations satisfy TypeScript without pulling in DOM lib types. +declare class AudioWorkletProcessor { + readonly port: MessagePort + constructor() +} +declare function registerProcessor(name: string, ctor: new () => AudioWorkletProcessor): void + +class PcmRecorderProcessor extends AudioWorkletProcessor { + private buffer: Float32Array; + private bufferSize = 4096; + private bufferIndex = 0; + + constructor() { + super(); + this.buffer = new Float32Array(this.bufferSize); + } + + process(inputs: Float32Array[][]): boolean { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.bufferIndex++] = channel[i]; + if (this.bufferIndex >= this.bufferSize) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.bufferIndex = 0; + } + } + } + return true; + } +} + +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); diff --git a/web/src/realtime/gemini/pcmUtils.test.ts b/web/src/realtime/gemini/pcmUtils.test.ts new file mode 100644 index 000000000..1b2159f5c --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.test.ts @@ -0,0 +1,60 @@ +import { describe, test, expect } from 'vitest' +import { + float32ToPcm16, + pcm16ToFloat32, + arrayBufferToBase64, + base64ToArrayBuffer +} from './pcmUtils' + +describe('pcmUtils', () => { + describe('float32ToPcm16 / pcm16ToFloat32 round-trip', () => { + test('preserves signal within quantization error', () => { + const input = new Float32Array([0, 0.5, -0.5, 1.0, -1.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(output.length).toBe(input.length) + for (let i = 0; i < input.length; i++) { + expect(Math.abs(output[i] - input[i])).toBeLessThan(0.001) + } + }) + + test('clamps values outside [-1, 1]', () => { + const input = new Float32Array([2.0, -2.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(Math.abs(output[0] - 1.0)).toBeLessThan(0.001) + expect(Math.abs(output[1] - (-1.0))).toBeLessThan(0.001) + }) + + test('handles empty input', () => { + const input = new Float32Array(0) + const pcm16 = float32ToPcm16(input) + expect(pcm16.byteLength).toBe(0) + const output = pcm16ToFloat32(pcm16) + expect(output.length).toBe(0) + }) + }) + + describe('arrayBufferToBase64 / base64ToArrayBuffer round-trip', () => { + test('preserves binary data', () => { + const original = new Uint8Array([0, 1, 127, 128, 255]) + const base64 = arrayBufferToBase64(original.buffer) + const restored = new Uint8Array(base64ToArrayBuffer(base64)) + + expect(restored.length).toBe(original.length) + for (let i = 0; i < original.length; i++) { + expect(restored[i]).toBe(original[i]) + } + }) + + test('handles empty buffer', () => { + const empty = new ArrayBuffer(0) + const base64 = arrayBufferToBase64(empty) + expect(base64).toBe('') + const restored = base64ToArrayBuffer(base64) + expect(restored.byteLength).toBe(0) + }) + }) +}) diff --git a/web/src/realtime/gemini/pcmUtils.ts b/web/src/realtime/gemini/pcmUtils.ts new file mode 100644 index 000000000..67e2928fc --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.ts @@ -0,0 +1,39 @@ +export function float32ToPcm16(samples: Float32Array): ArrayBuffer { + const buffer = new ArrayBuffer(samples.length * 2); + const view = new DataView(buffer); + for (let i = 0; i < samples.length; i++) { + let s = Math.max(-1, Math.min(1, samples[i])); + s = s < 0 ? s * 0x8000 : s * 0x7FFF; + view.setInt16(i * 2, s, true); + } + return buffer; +} + +export function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array { + const int16Array = new Int16Array(buffer); + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + const s = int16Array[i]; + float32Array[i] = s < 0 ? s / 0x8000 : s / 0x7FFF; + } + return float32Array; +} + +export function arrayBufferToBase64(buffer: ArrayBuffer): string { + let binary = ''; + const bytes = new Uint8Array(buffer); + const len = bytes.byteLength; + for (let i = 0; i < len; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +export function base64ToArrayBuffer(base64: string): ArrayBuffer { + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes.buffer; +} diff --git a/web/src/realtime/gemini/toolAdapter.test.ts b/web/src/realtime/gemini/toolAdapter.test.ts new file mode 100644 index 000000000..651e890a1 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.test.ts @@ -0,0 +1,28 @@ +import { describe, test, expect } from 'vitest' +import { handleGeminiFunctionCall, handleGeminiFunctionCalls } from './toolAdapter' +import type { GeminiFunctionCall } from './toolAdapter' + +describe('toolAdapter', () => { + test('returns error for unknown tool', async () => { + const call: GeminiFunctionCall = { + name: 'unknownTool', + args: {}, + id: 'call-1' + } + const resp = await handleGeminiFunctionCall(call) + expect(resp.name).toBe('unknownTool') + expect(resp.id).toBe('call-1') + expect(resp.response.result).toContain('unknown tool') + }) + + test('handles multiple calls in parallel', async () => { + const calls: GeminiFunctionCall[] = [ + { name: 'unknownA', args: {}, id: 'a' }, + { name: 'unknownB', args: {}, id: 'b' } + ] + const responses = await handleGeminiFunctionCalls(calls) + expect(responses.length).toBe(2) + expect(responses[0].id).toBe('a') + expect(responses[1].id).toBe('b') + }) +}) diff --git a/web/src/realtime/gemini/toolAdapter.ts b/web/src/realtime/gemini/toolAdapter.ts new file mode 100644 index 000000000..dbf4dee9c --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.ts @@ -0,0 +1,76 @@ +import { realtimeClientTools } from '../realtimeClientTools' + +/** + * Gemini Live API function call from server. + * Matches the `toolCall` shape in a BidiGenerateContent serverMessage. + */ +export interface GeminiFunctionCall { + name: string + args: Record + id: string +} + +/** + * Response sent back to Gemini Live via `toolResponse`. + */ +export interface GeminiFunctionResponse { + name: string + id: string + response: { result: string } +} + +type ClientToolHandler = (parameters: unknown) => Promise + +const toolHandlers: Record = { + messageCodingAgent: realtimeClientTools.messageCodingAgent, + processPermissionRequest: realtimeClientTools.processPermissionRequest +} + +/** + * Execute a Gemini Live function call using the existing client tool handlers. + * Returns a GeminiFunctionResponse ready to send back over the WebSocket. + */ +export async function handleGeminiFunctionCall( + call: GeminiFunctionCall +): Promise { + const handler = toolHandlers[call.name] + + if (!handler) { + return { + name: call.name, + id: call.id, + response: { result: `error (unknown tool: ${call.name})` } + } + } + + try { + const result = await handler(call.args) + return { + name: call.name, + id: call.id, + response: { result } + } + } catch (error) { + const message = error instanceof Error ? error.message : 'unknown error' + return { + name: call.name, + id: call.id, + response: { result: `error (${message})` } + } + } +} + +/** + * Process multiple function calls sequentially to avoid racing on shared + * session state (e.g. processPermissionRequest resolving the same pending + * request twice when calls run in parallel). + */ +export async function handleGeminiFunctionCalls( + calls: GeminiFunctionCall[] +): Promise { + const responses: GeminiFunctionResponse[] = [] + for (const call of calls) { + responses.push(await handleGeminiFunctionCall(call)) + } + return responses +} diff --git a/web/src/realtime/hooks/voiceHooks.ts b/web/src/realtime/hooks/voiceHooks.ts index c6318d3c1..e87630294 100644 --- a/web/src/realtime/hooks/voiceHooks.ts +++ b/web/src/realtime/hooks/voiceHooks.ts @@ -151,7 +151,13 @@ export const voiceHooks = { reportSession(sessionId) const messages = messagesGetter?.(sessionId) ?? [] const lastAssistantText = extractLastAssistantSpeakable(messages) - reportTextUpdate(formatReadyEvent(sessionId, lastAssistantText)) + const update = formatReadyEvent(sessionId, lastAssistantText) + const proactive = localStorage.getItem('hapi-voice-proactive') === 'true' + if (proactive) { + reportTextUpdate(update) + } else { + reportContextualUpdate(update) + } }, /** diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index 58b7b229e..d3e58c686 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -15,8 +15,9 @@ export { // Client tools export { realtimeClientTools, registerSessionStore } from './realtimeClientTools' -// Voice session component +// Voice session components export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +export { VoiceBackendSession, type VoiceBackendSessionProps } from './VoiceBackendSession' // Voice hooks export { voiceHooks, registerVoiceHooksStore } from './hooks/voiceHooks' diff --git a/web/src/routes/settings/index.tsx b/web/src/routes/settings/index.tsx index d754e9ad7..d798a64e4 100644 --- a/web/src/routes/settings/index.tsx +++ b/web/src/routes/settings/index.tsx @@ -349,6 +349,20 @@ export default function SettingsPage() { const [playingVoiceId, setPlayingVoiceId] = useState(null) const currentAudioRef = useRef(null) + // Voice proactive mode - read from localStorage, default false (reactive) + const [voiceProactive, setVoiceProactive] = useState(() => { + return localStorage.getItem('hapi-voice-proactive') === 'true' + }) + + const handleVoiceProactiveChange = (value: boolean) => { + setVoiceProactive(value) + if (value) { + localStorage.setItem('hapi-voice-proactive', 'true') + } else { + localStorage.removeItem('hapi-voice-proactive') + } + } + const fontScaleOptions = getFontScaleOptions() const terminalFontSizeOptions = getTerminalFontSizeOptions() const composerEnterBehaviorOptions = getComposerEnterBehaviorOptions() @@ -988,7 +1002,6 @@ export default function SettingsPage() { )} -
+ +
+
+ {t('settings.voice.proactive')} + +
+

{t('settings.voice.proactive.description')}

+
{/* About section */} diff --git a/web/tsconfig.json b/web/tsconfig.json index 8b0682a4b..de7bcdca5 100644 --- a/web/tsconfig.json +++ b/web/tsconfig.json @@ -11,5 +11,6 @@ "@/*": ["./src/*"] } }, - "include": ["src"] + "include": ["src"], + "exclude": ["src/**/*.test.ts", "src/**/*.test.tsx", "src/**/*.spec.ts", "src/**/*.spec.tsx"] }