From f18a5c96b01ef3368c393dddce12ef0add20f5ae Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 24 May 2026 21:31:40 +0100 Subject: [PATCH 1/4] fix(web): embed agent text in voice ready event for readback Voice onReady now extracts the last speakable assistant message and embeds it in the ready inject so ConvAI can summarize without the user re-prompting. Also formats Codex/Cursor stream-json messages for live context updates and session history. Co-authored-by: Cursor --- .../realtime/hooks/contextFormatters.test.ts | 185 ++++++++++++++++++ web/src/realtime/hooks/contextFormatters.ts | 102 +++++++++- web/src/realtime/hooks/voiceHooks.ts | 8 +- web/src/realtime/index.ts | 3 +- 4 files changed, 286 insertions(+), 12 deletions(-) create mode 100644 web/src/realtime/hooks/contextFormatters.test.ts diff --git a/web/src/realtime/hooks/contextFormatters.test.ts b/web/src/realtime/hooks/contextFormatters.test.ts new file mode 100644 index 0000000000..164e669753 --- /dev/null +++ b/web/src/realtime/hooks/contextFormatters.test.ts @@ -0,0 +1,185 @@ +import { describe, expect, it } from 'vitest' +import type { DecryptedMessage } from '@/types/api' +import { extractLastAssistantSpeakable, formatMessage, formatNewMessages, formatReadyEvent } from './contextFormatters' + +function msg(partial: Pick): DecryptedMessage { + return { + id: partial.id, + seq: partial.seq, + localId: null, + content: partial.content, + createdAt: 0, + sessionId: 'session-1' + } as DecryptedMessage +} + +describe('extractLastAssistantSpeakable', () => { + it('returns null for empty history', () => { + expect(extractLastAssistantSpeakable([])).toBeNull() + }) + + it('returns the latest assistant plain string', () => { + const messages = [ + msg({ id: '1', seq: 1, content: { role: 'user', content: 'hello' } }), + msg({ id: '2', seq: 2, content: { role: 'assistant', content: 'first reply' } }), + msg({ id: '3', seq: 3, content: { role: 'assistant', content: ' latest reply ' } }) + ] + expect(extractLastAssistantSpeakable(messages)).toBe('latest reply') + }) + + it('skips trailing user messages and reads earlier assistant text', () => { + const messages = [ + msg({ id: '1', seq: 1, content: { role: 'assistant', content: 'done with subtitle search' } }), + msg({ id: '2', seq: 2, content: { role: 'user', content: 'thanks' } }) + ] + expect(extractLastAssistantSpeakable(messages)).toBe('done with subtitle search') + }) + + it('extracts text blocks from assistant content arrays', () => { + const messages = [ + msg({ + id: '1', + seq: 1, + content: { + role: 'assistant', + content: [ + { type: 'text', text: 'Part one.' }, + { type: 'text', text: 'Part two.' } + ] + } + }) + ] + expect(extractLastAssistantSpeakable(messages)).toBe('Part one.\n\nPart two.') + }) + + it('extracts codex stream-json assistant messages', () => { + const messages = [ + msg({ + id: '1', + seq: 1, + content: { + role: 'agent', + content: { + type: 'codex', + data: { + type: 'message', + message: '**Subtitle coverage** — 5,018 indexed items.' + } + } + } + }), + msg({ + id: '2', + seq: 2, + content: { + role: 'agent', + content: { + type: 'codex', + data: { type: 'ready' } + } + } + }) + ] + expect(extractLastAssistantSpeakable(messages)).toBe('**Subtitle coverage** — 5,018 indexed items.') + }) + + it('unwraps codex-style output envelopes', () => { + const messages = [ + msg({ + id: '1', + seq: 1, + content: { + type: 'output', + data: { + type: 'assistant', + message: { content: 'Codex finished the refactor.' } + } + } + }) + ] + expect(extractLastAssistantSpeakable(messages)).toBe('Codex finished the refactor.') + }) +}) + +describe('formatReadyEvent', () => { + const sessionId = '9d04335d-2b90-4941-98a7-eb414823f0e0' + + it('embeds assistant text when provided', () => { + const text = 'Added subtitle index search to jellybot.' + const event = formatReadyEvent(sessionId, text) + expect(event).toContain('coding agent finished working') + expect(event).toContain(`${text}`) + expect(event).not.toContain('Claude Code') + }) + + it('falls back when assistant text is missing', () => { + const event = formatReadyEvent(sessionId, null) + expect(event).toContain('Use the latest agent message already present in context') + expect(event).not.toContain('Claude Code') + }) + + it('treats blank assistant text as missing', () => { + const event = formatReadyEvent(sessionId, ' ') + expect(event).toContain('Use the latest agent message already present in context') + }) +}) + +describe('formatMessage', () => { + it('formats codex stream-json assistant messages for voice context', () => { + const formatted = formatMessage(msg({ + id: '1', + seq: 1, + content: { + role: 'agent', + content: { + type: 'codex', + data: { + type: 'message', + message: '**Subtitle coverage** — 5,018 indexed items.' + } + } + } + })) + + expect(formatted).toContain('Claude Code:') + expect(formatted).toContain('**Subtitle coverage** — 5,018 indexed items.') + }) + + it('ignores codex ready and tool-call payloads', () => { + expect(formatMessage(msg({ + id: '1', + seq: 1, + content: { + role: 'agent', + content: { + type: 'codex', + data: { type: 'ready' } + } + } + }))).toBeNull() + }) +}) + +describe('formatNewMessages', () => { + it('includes codex assistant replies in contextual updates', () => { + const update = formatNewMessages('session-1', [ + msg({ + id: '1', + seq: 1, + content: { + role: 'agent', + content: { + type: 'codex', + data: { + type: 'message', + message: 'Database size is 2.43 GiB.' + } + } + } + }) + ]) + + expect(update).toContain('New messages in session: session-1') + expect(update).toContain('Database size is 2.43 GiB.') + }) +}) diff --git a/web/src/realtime/hooks/contextFormatters.ts b/web/src/realtime/hooks/contextFormatters.ts index a6dba4bb3e..9165a3d3f5 100644 --- a/web/src/realtime/hooks/contextFormatters.ts +++ b/web/src/realtime/hooks/contextFormatters.ts @@ -92,21 +92,26 @@ export function formatPermissionRequest( * Format a single message for voice context */ export function formatMessage(message: DecryptedMessage): string | null { - const lines: string[] = [] const { role, content: wrappedContent } = unwrapRoleWrappedContent(message) const { roleOverride, content } = unwrapOutputContent(wrappedContent) const normalizedRole = roleOverride ?? role + if (isNonSpeakableAgentPayload(wrappedContent) || isNonSpeakableAgentPayload(content)) { + return null + } + + const speakable = extractSpeakableFromContent(content) + if (speakable) { + const roleForFormat = normalizedRole === 'user' ? 'user' : 'assistant' + return formatPlainText(roleForFormat, speakable) + } + if (!isContentArray(content)) { - if (typeof content === 'string') { - return formatPlainText(normalizedRole, content) - } - if (isObject(content) && content.type === 'text' && typeof content.text === 'string') { - return formatPlainText(normalizedRole, content.text) - } return null } + const lines: string[] = [] + // Determine message type by checking for tool_use (assistant) vs user content const hasToolUse = content.some(item => item.type === 'tool_use') const isAssistant = normalizedRole === 'assistant' @@ -134,6 +139,81 @@ export function formatMessage(message: DecryptedMessage): string | null { return lines.join('\n\n') } +function extractSpeakableFromContent(content: unknown): string | null { + if (typeof content === 'string' && content.trim()) { + return content.trim() + } + + if (isObject(content) && content.type === 'text' && typeof content.text === 'string' && content.text.trim()) { + return content.text.trim() + } + + // Codex / stream-json agent messages: { type: 'codex', data: { type: 'message', message: '...' } } + if (isObject(content) && typeof content.type === 'string' && isObject(content.data)) { + const data = content.data + if (data.type === 'message' && typeof data.message === 'string' && data.message.trim()) { + return data.message.trim() + } + } + + if (!isContentArray(content)) { + return null + } + + const textParts = content + .filter((item) => item.type === 'text' && item.text) + .map((item) => item.text!.trim()) + .filter(Boolean) + + if (textParts.length > 0) { + return textParts.join('\n\n') + } + + return null +} + +function isNonSpeakableAgentPayload(content: unknown): boolean { + if (!isObject(content) || typeof content.type !== 'string') { + return false + } + + if (content.type === 'codex' && isObject(content.data)) { + const eventType = content.data.type + return eventType === 'ready' + || eventType === 'tool-call' + || eventType === 'tool-call-result' + || eventType === 'event' + } + + return false +} + +export function extractLastAssistantSpeakable(messages: DecryptedMessage[]): string | null { + const sorted = [...messages].sort((a, b) => (a.seq ?? 0) - (b.seq ?? 0)) + + for (let i = sorted.length - 1; i >= 0; i -= 1) { + const message = sorted[i] + const { role, content: wrappedContent } = unwrapRoleWrappedContent(message) + const { roleOverride, content } = unwrapOutputContent(wrappedContent) + const normalizedRole = roleOverride ?? role + + if (normalizedRole === 'user') { + continue + } + + if (isNonSpeakableAgentPayload(wrappedContent) || isNonSpeakableAgentPayload(content)) { + continue + } + + const speakable = extractSpeakableFromContent(content) + if (speakable) { + return speakable + } + } + + return null +} + export function formatNewSingleMessage(sessionId: string, message: DecryptedMessage): string | null { const formatted = formatMessage(message) if (!formatted) { @@ -199,6 +279,10 @@ export function formatSessionFocus(sessionId: string, _metadata?: SessionMetadat return `Session became focused: ${sessionId}` } -export function formatReadyEvent(sessionId: string): string { - return `Claude Code done working in session: ${sessionId}. The previous message(s) are the summary of the work done. Report this to the human immediately.` +export function formatReadyEvent(sessionId: string, lastAssistantText?: string | null): string { + const trimmed = lastAssistantText?.trim() + if (trimmed) { + return `The coding agent finished working in session: ${sessionId}. Summarize this for the human immediately:\n${trimmed}` + } + return `The coding agent finished working in session: ${sessionId}. Use the latest agent message already present in context and summarize it for the human immediately.` } diff --git a/web/src/realtime/hooks/voiceHooks.ts b/web/src/realtime/hooks/voiceHooks.ts index 5f32193752..c6318d3c16 100644 --- a/web/src/realtime/hooks/voiceHooks.ts +++ b/web/src/realtime/hooks/voiceHooks.ts @@ -6,7 +6,8 @@ import { formatSessionFocus, formatSessionFull, formatSessionOffline, - formatSessionOnline + formatSessionOnline, + extractLastAssistantSpeakable } from './contextFormatters' import { VOICE_CONFIG } from '../voiceConfig' import type { DecryptedMessage, Session } from '@/types/api' @@ -68,6 +69,7 @@ function reportSession(sessionId: string) { reportContextualUpdate(contextUpdate) } + export const voiceHooks = { /** * Called when a session comes online/connects @@ -147,7 +149,9 @@ export const voiceHooks = { if (VOICE_CONFIG.DISABLE_READY_EVENTS) return reportSession(sessionId) - reportTextUpdate(formatReadyEvent(sessionId)) + const messages = messagesGetter?.(sessionId) ?? [] + const lastAssistantText = extractLastAssistantSpeakable(messages) + reportTextUpdate(formatReadyEvent(sessionId, lastAssistantText)) }, /** diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index a7fa2fbe99..58b7b229e3 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -32,7 +32,8 @@ export { formatSessionOffline, formatSessionFocus, formatPermissionRequest, - formatReadyEvent + formatReadyEvent, + extractLastAssistantSpeakable } from './hooks/contextFormatters' // Config From 63c373261683dd8a5bfb0866c881fa2fe06995d4 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 24 May 2026 21:39:52 +0100 Subject: [PATCH 2/4] test(web): use domain-neutral voice formatter fixtures Replace jellybot/subtitle dogfood strings in tests with generic examples. Co-authored-by: Cursor --- .../realtime/hooks/contextFormatters.test.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/web/src/realtime/hooks/contextFormatters.test.ts b/web/src/realtime/hooks/contextFormatters.test.ts index 164e669753..088da7581f 100644 --- a/web/src/realtime/hooks/contextFormatters.test.ts +++ b/web/src/realtime/hooks/contextFormatters.test.ts @@ -29,10 +29,10 @@ describe('extractLastAssistantSpeakable', () => { it('skips trailing user messages and reads earlier assistant text', () => { const messages = [ - msg({ id: '1', seq: 1, content: { role: 'assistant', content: 'done with subtitle search' } }), + msg({ id: '1', seq: 1, content: { role: 'assistant', content: 'done with the refactor' } }), msg({ id: '2', seq: 2, content: { role: 'user', content: 'thanks' } }) ] - expect(extractLastAssistantSpeakable(messages)).toBe('done with subtitle search') + expect(extractLastAssistantSpeakable(messages)).toBe('done with the refactor') }) it('extracts text blocks from assistant content arrays', () => { @@ -63,7 +63,7 @@ describe('extractLastAssistantSpeakable', () => { type: 'codex', data: { type: 'message', - message: '**Subtitle coverage** — 5,018 indexed items.' + message: 'Indexed 5,018 items in the search database.' } } } @@ -80,7 +80,7 @@ describe('extractLastAssistantSpeakable', () => { } }) ] - expect(extractLastAssistantSpeakable(messages)).toBe('**Subtitle coverage** — 5,018 indexed items.') + expect(extractLastAssistantSpeakable(messages)).toBe('Indexed 5,018 items in the search database.') }) it('unwraps codex-style output envelopes', () => { @@ -105,7 +105,7 @@ describe('formatReadyEvent', () => { const sessionId = '9d04335d-2b90-4941-98a7-eb414823f0e0' it('embeds assistant text when provided', () => { - const text = 'Added subtitle index search to jellybot.' + const text = 'Added full-text search to the API module.' const event = formatReadyEvent(sessionId, text) expect(event).toContain('coding agent finished working') expect(event).toContain(`${text}`) @@ -135,14 +135,14 @@ describe('formatMessage', () => { type: 'codex', data: { type: 'message', - message: '**Subtitle coverage** — 5,018 indexed items.' + message: 'Indexed 5,018 items in the search database.' } } } })) expect(formatted).toContain('Claude Code:') - expect(formatted).toContain('**Subtitle coverage** — 5,018 indexed items.') + expect(formatted).toContain('Indexed 5,018 items in the search database.') }) it('ignores codex ready and tool-call payloads', () => { @@ -172,7 +172,7 @@ describe('formatNewMessages', () => { type: 'codex', data: { type: 'message', - message: 'Database size is 2.43 GiB.' + message: 'Local database file size is 2.43 GiB.' } } } @@ -180,6 +180,6 @@ describe('formatNewMessages', () => { ]) expect(update).toContain('New messages in session: session-1') - expect(update).toContain('Database size is 2.43 GiB.') + expect(update).toContain('Local database file size is 2.43 GiB.') }) }) From 818bf7c7b8e8a5316397214a6af6a72fb2a56b5d Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 24 May 2026 23:50:28 +0100 Subject: [PATCH 3/4] fix(web): guard extractSpeakableFromContent for non-arrays in formatMessage extractSpeakableFromContent also handles content arrays (joins text items), so calling it unconditionally before the existing array loop caused mixed text+tool_use payloads to return early without formatting the tool_use item. Guard with !isContentArray so the loop handles arrays as before. Adds regression test: mixed text+tool_use array must produce both the text and the tool-call line (was red before this fix). via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- .../realtime/hooks/contextFormatters.test.ts | 17 +++++++++++++++++ web/src/realtime/hooks/contextFormatters.ts | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/web/src/realtime/hooks/contextFormatters.test.ts b/web/src/realtime/hooks/contextFormatters.test.ts index 088da7581f..0d912f3b21 100644 --- a/web/src/realtime/hooks/contextFormatters.test.ts +++ b/web/src/realtime/hooks/contextFormatters.test.ts @@ -158,6 +158,23 @@ describe('formatMessage', () => { } }))).toBeNull() }) + + it('preserves tool-call context for mixed text+tool_use content array', () => { + const formatted = formatMessage(msg({ + id: '1', + seq: 1, + content: { + role: 'assistant', + content: [ + { type: 'text', text: 'Here is the result.' }, + { type: 'tool_use', name: 'Bash', input: { command: 'ls' } } + ] + } + })) + + expect(formatted).toContain('Here is the result.') + expect(formatted).toContain('Claude Code is using Bash') + }) }) describe('formatNewMessages', () => { diff --git a/web/src/realtime/hooks/contextFormatters.ts b/web/src/realtime/hooks/contextFormatters.ts index 9165a3d3f5..ebfeea66e0 100644 --- a/web/src/realtime/hooks/contextFormatters.ts +++ b/web/src/realtime/hooks/contextFormatters.ts @@ -100,7 +100,7 @@ export function formatMessage(message: DecryptedMessage): string | null { return null } - const speakable = extractSpeakableFromContent(content) + const speakable = !isContentArray(content) ? extractSpeakableFromContent(content) : null if (speakable) { const roleForFormat = normalizedRole === 'user' ? 'user' : 'assistant' return formatPlainText(roleForFormat, speakable) From 09e9b554e140b04eb7284e01940da0bbb73fdad5 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 24 May 2026 23:58:17 +0100 Subject: [PATCH 4/4] fix(web): narrow extractSpeakableFromContent to codex type only The helper matched any object with a string type and a data property, so sendSessionEvent({ type: 'message', message }) events (which arrive as { type: 'event', data: { type: 'message', message } }) were falsely formatted as speakable assistant text and could be selected as the ready readback. Narrow the Codex path to content.type === 'codex' as the comment already states. Adds regression test: session status event must return null from formatMessage. via [HAPI](https://hapi.run) Co-Authored-By: HAPI --- web/src/realtime/hooks/contextFormatters.test.ts | 15 +++++++++++++++ web/src/realtime/hooks/contextFormatters.ts | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/web/src/realtime/hooks/contextFormatters.test.ts b/web/src/realtime/hooks/contextFormatters.test.ts index 0d912f3b21..c2b4ac7594 100644 --- a/web/src/realtime/hooks/contextFormatters.test.ts +++ b/web/src/realtime/hooks/contextFormatters.test.ts @@ -159,6 +159,21 @@ describe('formatMessage', () => { }))).toBeNull() }) + it('does not treat session status events as speakable assistant text', () => { + expect(formatMessage(msg({ + id: '1', + seq: 1, + content: { + role: 'agent', + content: { + id: 'some-uuid', + type: 'event', + data: { type: 'message', message: 'Aborting task.' } + } + } + }))).toBeNull() + }) + it('preserves tool-call context for mixed text+tool_use content array', () => { const formatted = formatMessage(msg({ id: '1', diff --git a/web/src/realtime/hooks/contextFormatters.ts b/web/src/realtime/hooks/contextFormatters.ts index ebfeea66e0..98768429fb 100644 --- a/web/src/realtime/hooks/contextFormatters.ts +++ b/web/src/realtime/hooks/contextFormatters.ts @@ -149,7 +149,7 @@ function extractSpeakableFromContent(content: unknown): string | null { } // Codex / stream-json agent messages: { type: 'codex', data: { type: 'message', message: '...' } } - if (isObject(content) && typeof content.type === 'string' && isObject(content.data)) { + if (isObject(content) && content.type === 'codex' && isObject(content.data)) { const data = content.data if (data.type === 'message' && typeof data.message === 'string' && data.message.trim()) { return data.message.trim()