Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
6fb3bb2
feat(voice): pluggable voice backend with Gemini Live & Qwen Realtime
heavygee May 25, 2026
4f4db15
fix(voice): restore user mic mute state after Gemini turn completes
heavygee May 25, 2026
03db00b
fix(voice): remove hard-coded Chinese language from Gemini backend
heavygee May 25, 2026
cdea28d
fix(voice): reset modelSpeaking in cleanup to unblock mic on restart
heavygee May 25, 2026
0628457
fix(voice): guard stale close handlers in Gemini and Qwen sessions
heavygee May 25, 2026
8994835
fix(voice): remove hard-coded Chinese language from Qwen backend
heavygee May 25, 2026
a9dfd1c
feat(voice): proactive/reactive toggle in voice settings
heavygee May 25, 2026
ea8ac50
fix(voice): normalize WS close codes, drop barrel re-exports, fix SSE…
heavygee May 25, 2026
d7c04db
fix(voice): respect language setting in Gemini/Qwen; fix voice-start …
heavygee May 26, 2026
d68f97d
fix(voice): send greeting trigger in reactive mode for Gemini
heavygee May 26, 2026
f67296c
fix(voice): suppress Gemini self-identification and context leak in g…
heavygee May 26, 2026
1a10d86
fix(voice): address code review findings — error handling, proxy, audio
heavygee May 26, 2026
580f82c
fix(voice): trailing-slash WS URL, Qwen session.update schema
heavygee May 26, 2026
6e8eba6
fix(voice): await audio capture before setMuted; sanitize upstream cl…
heavygee May 26, 2026
6403596
fix(voice): wrap startAudioCapture in try/catch to propagate mic errors
heavygee May 26, 2026
3a2d841
fix(voice): propagate backend discovery failure instead of silently f…
heavygee May 26, 2026
533f4e2
fix(voice): throw on unrecognised backend value instead of silently f…
heavygee May 26, 2026
ec5a170
fix(voice): add Qwen greeting/proactive trigger; fix socket buffer fo…
heavygee May 26, 2026
f107aa3
fix(voice): replace unsupported conversation.item.create with session…
heavygee May 26, 2026
d9e72ad
fix(voice): guard session.updated re-entry and reset config on sessio…
heavygee May 26, 2026
4c24d11
fix(voice): assert wsUrl presence for Gemini proxy connections
heavygee May 27, 2026
c84fbd4
fix(voice): correct Qwen audio formats and default voice
heavygee May 27, 2026
f273eca
fix(voice): close AudioContext on failed voice session start
heavygee May 27, 2026
f991093
chore: restore non-voice files to upstream/main state
heavygee May 27, 2026
7b3d998
fix(voice): harden Gemini and Qwen WS proxies against client abuse
heavygee May 28, 2026
d0335e4
fix(voice): harden Qwen proxy — hub-owned setup, client frame allowlist
heavygee May 31, 2026
2e5ca91
fix(voice): respect Qwen session.created→session.update protocol orde…
heavygee May 31, 2026
090da8d
fix(voice): use Realtime tool shape for Qwen session.update (not chat…
heavygee May 31, 2026
bb84dd1
fix(voice): update Qwen Realtime model, voice, and endpoint for intl …
heavygee Jun 1, 2026
cdcd54d
fix(voice): correct Qwen text injection and generalise language handling
heavygee Jun 1, 2026
71aba34
fix(hub): gate Gemini client frames until upstream setupComplete
heavygee Jun 2, 2026
72eaec4
fix(hub): cap Gemini setup-window pending queue at 1 MiB
heavygee Jun 2, 2026
ab99be0
fix(gemini): pass all language codes to hub proxy, not just zh
heavygee Jun 2, 2026
358c197
fix(voice): expand LANGUAGE_NAMES to cover full ElevenLabs language set
heavygee Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 153 additions & 1 deletion hub/src/web/routes/voice.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { describe, expect, it, mock } from 'bun:test'
import { describe, expect, it, mock, test, afterEach } from 'bun:test'
import { Hono } from 'hono'
import { SignJWT } from 'jose'
import type { WebAppEnv } from '../middleware/auth'
Expand Down Expand Up @@ -188,3 +188,155 @@ describe('POST /api/voice/token', () => {
else delete process.env.ELEVENLABS_AGENT_ID
})
})

// Exercises GET /api/voice/backend under each VOICE_BACKEND setting the suite cares about.
describe('GET /api/voice/backend', () => {
    // Value of VOICE_BACKEND when the suite was defined; put back after every test.
    const savedBackend = process.env.VOICE_BACKEND

    afterEach(() => {
        if (savedBackend !== undefined) {
            process.env.VOICE_BACKEND = savedBackend
            return
        }
        delete process.env.VOICE_BACKEND
    })

    // Builds a fresh app and performs the authenticated GET.
    const fetchBackend = async () => {
        const app = createApp()
        const headers = await authHeaders()
        return app.request('/api/voice/backend', { headers })
    }

    // One row per test: the env value to apply (undefined = unset) and the backend expected back.
    const scenarios: ReadonlyArray<{ title: string; configured: string | undefined; expected: string }> = [
        { title: 'returns elevenlabs by default', configured: undefined, expected: 'elevenlabs' },
        { title: 'returns gemini-live when configured', configured: 'gemini-live', expected: 'gemini-live' },
        { title: 'returns qwen-realtime when configured', configured: 'qwen-realtime', expected: 'qwen-realtime' },
        { title: 'falls back to elevenlabs for unknown values', configured: 'unknown-backend', expected: 'elevenlabs' }
    ]

    for (const { title, configured, expected } of scenarios) {
        test(title, async () => {
            if (configured === undefined) {
                delete process.env.VOICE_BACKEND
            } else {
                process.env.VOICE_BACKEND = configured
            }

            const res = await fetchBackend()
            expect(res.status).toBe(200)

            const body = await res.json() as { backend: string }
            expect(body.backend).toBe(expected)
        })
    }
})

// Covers POST /api/voice/gemini-token for each combination of API-key environment variables.
describe('POST /api/voice/gemini-token', () => {
    // Values captured when the suite was defined; restored after every test.
    const savedKeys = {
        GEMINI_API_KEY: process.env.GEMINI_API_KEY,
        GOOGLE_API_KEY: process.env.GOOGLE_API_KEY
    }

    afterEach(() => {
        for (const [name, value] of Object.entries(savedKeys)) {
            if (value === undefined) {
                delete process.env[name]
            } else {
                process.env[name] = value
            }
        }
    })

    // Builds a fresh app and performs the authenticated POST.
    const requestToken = async () => {
        const app = createApp()
        const headers = await authHeaders()
        return app.request('/api/voice/gemini-token', { method: 'POST', headers })
    }

    // Shared success expectations: 200, allowed, placeholder key, and a proxy wsUrl.
    const expectProxiedResponse = async () => {
        const res = await requestToken()
        expect(res.status).toBe(200)
        const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string }
        expect(body.allowed).toBe(true)
        expect(body.apiKey).toBe('proxied')
        expect(body.wsUrl).toContain('/api/voice/gemini-ws')
    }

    test('returns 400 when no API key configured', async () => {
        delete process.env.GEMINI_API_KEY
        delete process.env.GOOGLE_API_KEY

        const res = await requestToken()
        expect(res.status).toBe(400)

        const body = await res.json() as { allowed: boolean; error: string }
        expect(body.allowed).toBe(false)
        expect(body.error).toContain('not configured')
    })

    test('returns proxied wsUrl when GEMINI_API_KEY is set', async () => {
        process.env.GEMINI_API_KEY = 'test-gemini-key'
        delete process.env.GOOGLE_API_KEY
        await expectProxiedResponse()
    })

    test('falls back to GOOGLE_API_KEY', async () => {
        delete process.env.GEMINI_API_KEY
        process.env.GOOGLE_API_KEY = 'test-google-key'
        await expectProxiedResponse()
    })
})

// Covers POST /api/voice/qwen-token for each combination of API-key environment variables.
describe('POST /api/voice/qwen-token', () => {
    // Values captured when the suite was defined; restored after every test.
    const savedKeys = {
        DASHSCOPE_API_KEY: process.env.DASHSCOPE_API_KEY,
        QWEN_API_KEY: process.env.QWEN_API_KEY
    }

    afterEach(() => {
        for (const [name, value] of Object.entries(savedKeys)) {
            if (value === undefined) {
                delete process.env[name]
            } else {
                process.env[name] = value
            }
        }
    })

    // Builds a fresh app and performs the authenticated POST.
    const requestToken = async () => {
        const app = createApp()
        const headers = await authHeaders()
        return app.request('/api/voice/qwen-token', { method: 'POST', headers })
    }

    // Shared success expectations: 200, allowed, a proxy wsUrl, and no apiKey field at all.
    const expectProxyOnlyResponse = async () => {
        const res = await requestToken()
        expect(res.status).toBe(200)
        const body = await res.json() as { allowed: boolean; wsUrl: string }
        expect(body.allowed).toBe(true)
        expect(body.wsUrl).toContain('/api/voice/qwen-ws')
        expect(body).not.toHaveProperty('apiKey')
    }

    test('returns 400 when no API key configured', async () => {
        delete process.env.DASHSCOPE_API_KEY
        delete process.env.QWEN_API_KEY

        const res = await requestToken()
        expect(res.status).toBe(400)

        const body = await res.json() as { allowed: boolean; error: string }
        expect(body.allowed).toBe(false)
        expect(body.error).toContain('not configured')
    })

    test('returns wsUrl when DASHSCOPE_API_KEY is set (no raw key exposed)', async () => {
        process.env.DASHSCOPE_API_KEY = 'test-dash-key'
        delete process.env.QWEN_API_KEY
        await expectProxyOnlyResponse()
    })

    test('falls back to QWEN_API_KEY', async () => {
        delete process.env.DASHSCOPE_API_KEY
        process.env.QWEN_API_KEY = 'test-qwen-key'
        await expectProxyOnlyResponse()
    })
})
72 changes: 71 additions & 1 deletion hub/src/web/routes/voice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,19 @@ import type { WebAppEnv } from '../middleware/auth'
import {
ELEVENLABS_API_BASE,
VOICE_AGENT_NAME,
buildVoiceAgentConfig
buildVoiceAgentConfig,
DEFAULT_VOICE_BACKEND
} from '@hapi/protocol/voice'
import type { VoiceBackendType } from '@hapi/protocol/voice'

/**
 * Build the WebSocket URL for a voice proxy endpoint from a base URL.
 *
 * The scheme is chosen from the security of `base`: `https:` and `wss:` both
 * yield `wss:`; every other scheme yields `ws:`. Treating `wss:` as secure
 * keeps a base that is already a secure WebSocket URL from being downgraded
 * to plaintext `ws:`.
 *
 * The path of `base` is replaced by `pathname`, and any query string or
 * fragment on `base` is dropped.
 *
 * @param base - Absolute URL to derive scheme, host and port from.
 * @param pathname - Path of the proxy endpoint, e.g. `/api/voice/gemini-ws`.
 * @returns The serialized WebSocket URL.
 * @throws TypeError if `base` is not a valid absolute URL (raised by `new URL`).
 */
function buildVoiceWsUrl(base: string, pathname: string): string {
    const url = new URL(base)
    const isSecure = url.protocol === 'https:' || url.protocol === 'wss:'
    url.protocol = isSecure ? 'wss:' : 'ws:'
    url.pathname = pathname
    // Only origin + endpoint path should survive; strip anything else carried by the base.
    url.search = ''
    url.hash = ''
    return url.toString()
}

const tokenRequestSchema = z.object({
customAgentId: z.string().optional(),
Expand Down Expand Up @@ -166,6 +177,65 @@ async function getOrCreateAgentIdForVoice(apiKey: string, voiceId?: string): Pro
export function createVoiceRoutes(): Hono<WebAppEnv> {
const app = new Hono<WebAppEnv>()

// Return the configured voice backend type
app.get('/voice/backend', (c) => {
const raw = process.env.VOICE_BACKEND
const backend: VoiceBackendType =
raw === 'gemini-live' ? 'gemini-live'
: raw === 'qwen-realtime' ? 'qwen-realtime'
: DEFAULT_VOICE_BACKEND
return c.json({ backend })
})

// Check Gemini availability for Gemini Live voice sessions.
// Gemini Live API does not support ephemeral tokens, so the hub proxies the WebSocket instead.
// The real API key stays server-side; the browser only receives a placeholder `apiKey` and the proxy `wsUrl`.
app.post('/voice/gemini-token', async (c) => {
const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
if (!apiKey) {
return c.json({
allowed: false,
error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)'
}, 400)
}

// Use server-side WS proxy to avoid region restrictions.
// The proxy at /api/voice/gemini-ws handles the API key server-side.
// Derive wsUrl from the request origin so remote browsers connect back to the hub,
// not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy).
const requestOrigin = new URL(c.req.url).origin
const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin
const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/gemini-ws')

return c.json({
allowed: true,
apiKey: 'proxied', // Dummy — key is handled server-side
wsUrl: wsProxyUrl, // Always proxy — env WS URLs are upstream-only (server-side)
baseUrl: process.env.GEMINI_API_BASE || undefined
})
})

// Check Qwen (DashScope) availability for Qwen Realtime voice sessions
// The actual API key is never sent to the browser — it stays server-side in the WS proxy.
app.post('/voice/qwen-token', async (c) => {
const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY
if (!apiKey) {
return c.json({
allowed: false,
error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)'
}, 400)
}

const requestOrigin = new URL(c.req.url).origin
const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin
const wsProxyUrl = buildVoiceWsUrl(publicUrl, '/api/voice/qwen-ws')

return c.json({
allowed: true,
wsUrl: wsProxyUrl // Always proxy — env WS URLs are upstream-only (server-side)
})
})

// Get ElevenLabs ConvAI conversation token
app.post('/voice/token', async (c) => {
const requestId = crypto.randomUUID()
Expand Down
Loading
Loading