diff --git a/web/src/realtime/hooks/contextFormatters.test.ts b/web/src/realtime/hooks/contextFormatters.test.ts
new file mode 100644
index 0000000000..c2b4ac7594
--- /dev/null
+++ b/web/src/realtime/hooks/contextFormatters.test.ts
@@ -0,0 +1,219 @@
+import { describe, expect, it } from 'vitest'
+import type { DecryptedMessage } from '@/types/api'
+import { extractLastAssistantSpeakable, formatMessage, formatNewMessages, formatReadyEvent } from './contextFormatters'
+
+function msg(partial: Pick<DecryptedMessage, 'id' | 'seq' | 'content'>): DecryptedMessage {
+    return {
+        id: partial.id,
+        seq: partial.seq,
+        localId: null,
+        content: partial.content,
+        createdAt: 0,
+        sessionId: 'session-1'
+    } as DecryptedMessage
+}
+
+describe('extractLastAssistantSpeakable', () => {
+    it('returns null for empty history', () => {
+        expect(extractLastAssistantSpeakable([])).toBeNull()
+    })
+
+    it('returns the latest assistant plain string', () => {
+        const messages = [
+            msg({ id: '1', seq: 1, content: { role: 'user', content: 'hello' } }),
+            msg({ id: '2', seq: 2, content: { role: 'assistant', content: 'first reply' } }),
+            msg({ id: '3', seq: 3, content: { role: 'assistant', content: ' latest reply ' } })
+        ]
+        expect(extractLastAssistantSpeakable(messages)).toBe('latest reply')
+    })
+
+    it('skips trailing user messages and reads earlier assistant text', () => {
+        const messages = [
+            msg({ id: '1', seq: 1, content: { role: 'assistant', content: 'done with the refactor' } }),
+            msg({ id: '2', seq: 2, content: { role: 'user', content: 'thanks' } })
+        ]
+        expect(extractLastAssistantSpeakable(messages)).toBe('done with the refactor')
+    })
+
+    it('extracts text blocks from assistant content arrays', () => {
+        const messages = [
+            msg({
+                id: '1',
+                seq: 1,
+                content: {
+                    role: 'assistant',
+                    content: [
+                        { type: 'text', text: 'Part one.' },
+                        { type: 'text', text: 'Part two.' }
+                    ]
+                }
+            })
+        ]
+        expect(extractLastAssistantSpeakable(messages)).toBe('Part one.\n\nPart two.')
+    })
+
+    it('extracts codex stream-json assistant messages', () => {
+        const messages = [
+            msg({
+                id: '1',
+                seq: 1,
+                content: {
+                    role: 'agent',
+                    content: {
+                        type: 'codex',
+                        data: {
+                            type: 'message',
+                            message: 'Indexed 5,018 items in the search database.'
+                        }
+                    }
+                }
+            }),
+            msg({
+                id: '2',
+                seq: 2,
+                content: {
+                    role: 'agent',
+                    content: {
+                        type: 'codex',
+                        data: { type: 'ready' }
+                    }
+                }
+            })
+        ]
+        expect(extractLastAssistantSpeakable(messages)).toBe('Indexed 5,018 items in the search database.')
+    })
+
+    it('unwraps codex-style output envelopes', () => {
+        const messages = [
+            msg({
+                id: '1',
+                seq: 1,
+                content: {
+                    type: 'output',
+                    data: {
+                        type: 'assistant',
+                        message: { content: 'Codex finished the refactor.' }
+                    }
+                }
+            })
+        ]
+        expect(extractLastAssistantSpeakable(messages)).toBe('Codex finished the refactor.')
+    })
+})
+
+describe('formatReadyEvent', () => {
+    const sessionId = '9d04335d-2b90-4941-98a7-eb414823f0e0'
+
+    it('embeds assistant text when provided', () => {
+        const text = 'Added full-text search to the API module.'
+        const event = formatReadyEvent(sessionId, text)
+        expect(event).toContain('coding agent finished working')
+        expect(event).toContain(text)
+        expect(event).not.toContain('Claude Code')
+    })
+
+    it('falls back when assistant text is missing', () => {
+        const event = formatReadyEvent(sessionId, null)
+        expect(event).toContain('Use the latest agent message already present in context')
+        expect(event).not.toContain('Claude Code')
+    })
+
+    it('treats blank assistant text as missing', () => {
+        const event = formatReadyEvent(sessionId, ' ')
+        expect(event).toContain('Use the latest agent message already present in context')
+    })
+})
+
+describe('formatMessage', () => {
+    it('formats codex stream-json assistant messages for voice context', () => {
+        const formatted = formatMessage(msg({
+            id: '1',
+            seq: 1,
+            content: {
+                role: 'agent',
+                content: {
+                    type: 'codex',
+                    data: {
+                        type: 'message',
+                        message: 'Indexed 5,018 items in the search database.'
+                    }
+                }
+            }
+        }))
+
+        expect(formatted).toContain('Claude Code:')
+        expect(formatted).toContain('Indexed 5,018 items in the search database.')
+    })
+
+    it('ignores codex ready and tool-call payloads', () => {
+        for (const type of ['ready', 'tool-call', 'tool-call-result']) {
+            expect(formatMessage(msg({
+                id: '1',
+                seq: 1,
+                content: {
+                    role: 'agent',
+                    content: {
+                        type: 'codex',
+                        data: { type }
+                    }
+                }
+            }))).toBeNull()
+        }
+    })
+
+    it('does not treat session status events as speakable assistant text', () => {
+        expect(formatMessage(msg({
+            id: '1',
+            seq: 1,
+            content: {
+                role: 'agent',
+                content: {
+                    id: 'some-uuid',
+                    type: 'event',
+                    data: { type: 'message', message: 'Aborting task.' }
+                }
+            }
+        }))).toBeNull()
+    })
+
+    it('preserves tool-call context for mixed text+tool_use content array', () => {
+        const formatted = formatMessage(msg({
+            id: '1',
+            seq: 1,
+            content: {
+                role: 'assistant',
+                content: [
+                    { type: 'text', text: 'Here is the result.' },
+                    { type: 'tool_use', name: 'Bash', input: { command: 'ls' } }
+                ]
+            }
+        }))
+
+        expect(formatted).toContain('Here is the result.')
+        expect(formatted).toContain('Claude Code is using Bash')
+    })
+})
+
+describe('formatNewMessages', () => {
+    it('includes codex assistant replies in contextual updates', () => {
+        const update = formatNewMessages('session-1', [
+            msg({
+                id: '1',
+                seq: 1,
+                content: {
+                    role: 'agent',
+                    content: {
+                        type: 'codex',
+                        data: {
+                            type: 'message',
+                            message: 'Local database file size is 2.43 GiB.'
+                        }
+                    }
+                }
+            })
+        ])
+
+        expect(update).toContain('New messages in session: session-1')
+        expect(update).toContain('Local database file size is 2.43 GiB.')
+    })
+})
diff --git a/web/src/realtime/hooks/contextFormatters.ts b/web/src/realtime/hooks/contextFormatters.ts
index a6dba4bb3e..98768429fb 100644
--- a/web/src/realtime/hooks/contextFormatters.ts
+++ b/web/src/realtime/hooks/contextFormatters.ts
@@ -92,21 +92,26 @@ export function formatPermissionRequest(
  * Format a single message for voice context
  */
 export function formatMessage(message: DecryptedMessage): string | null {
-    const lines: string[] = []
     const { role, content: wrappedContent } = unwrapRoleWrappedContent(message)
     const { roleOverride, content } = unwrapOutputContent(wrappedContent)
     const normalizedRole = roleOverride ?? role
 
+    if (isNonSpeakableAgentPayload(wrappedContent) || isNonSpeakableAgentPayload(content)) {
+        return null
+    }
+
+    const speakable = !isContentArray(content) ? extractSpeakableFromContent(content) : null
+    if (speakable) {
+        const roleForFormat = normalizedRole === 'user' ? 'user' : 'assistant'
+        return formatPlainText(roleForFormat, speakable)
+    }
+
     if (!isContentArray(content)) {
-        if (typeof content === 'string') {
-            return formatPlainText(normalizedRole, content)
-        }
-        if (isObject(content) && content.type === 'text' && typeof content.text === 'string') {
-            return formatPlainText(normalizedRole, content.text)
-        }
         return null
     }
 
+    const lines: string[] = []
+
     // Determine message type by checking for tool_use (assistant) vs user content
     const hasToolUse = content.some(item => item.type === 'tool_use')
     const isAssistant = normalizedRole === 'assistant'
@@ -134,6 +139,96 @@ export function formatMessage(message: DecryptedMessage): string | null {
     return lines.join('\n\n')
 }
 
+/**
+ * Pull trimmed, speakable text out of an unwrapped message payload.
+ *
+ * Handles a plain string, a single `{ type: 'text' }` block, a codex
+ * `{ type: 'codex', data: { type: 'message' } }` payload, and content arrays
+ * (text blocks joined by blank lines). Returns null when no non-blank text is found.
+ */
+function extractSpeakableFromContent(content: unknown): string | null {
+    if (typeof content === 'string' && content.trim()) {
+        return content.trim()
+    }
+
+    if (isObject(content) && content.type === 'text' && typeof content.text === 'string' && content.text.trim()) {
+        return content.text.trim()
+    }
+
+    // Codex / stream-json agent messages: { type: 'codex', data: { type: 'message', message: '...' } }
+    if (isObject(content) && content.type === 'codex' && isObject(content.data)) {
+        const data = content.data
+        if (data.type === 'message' && typeof data.message === 'string' && data.message.trim()) {
+            return data.message.trim()
+        }
+    }
+
+    if (!isContentArray(content)) {
+        return null
+    }
+
+    const textParts = content
+        .filter((item) => item.type === 'text' && item.text)
+        .map((item) => (item.text ?? '').trim())
+        .filter(Boolean)
+
+    if (textParts.length > 0) {
+        return textParts.join('\n\n')
+    }
+
+    return null
+}
+
+/**
+ * True for codex payloads that carry no assistant prose: ready, tool-call,
+ * tool-call-result and event.
+ */
+function isNonSpeakableAgentPayload(content: unknown): boolean {
+    if (!isObject(content) || typeof content.type !== 'string') {
+        return false
+    }
+
+    if (content.type === 'codex' && isObject(content.data)) {
+        const eventType = content.data.type
+        return eventType === 'ready'
+            || eventType === 'tool-call'
+            || eventType === 'tool-call-result'
+            || eventType === 'event'
+    }
+
+    return false
+}
+
+/**
+ * Return the speakable text of the most recent non-user message, ordered by
+ * seq (a missing seq sorts as 0), or null when there is none.
+ */
+export function extractLastAssistantSpeakable(messages: DecryptedMessage[]): string | null {
+    const sorted = [...messages].sort((a, b) => (a.seq ?? 0) - (b.seq ?? 0))
+
+    for (let i = sorted.length - 1; i >= 0; i -= 1) {
+        const message = sorted[i]
+        const { role, content: wrappedContent } = unwrapRoleWrappedContent(message)
+        const { roleOverride, content } = unwrapOutputContent(wrappedContent)
+        const normalizedRole = roleOverride ?? role
+
+        if (normalizedRole === 'user') {
+            continue
+        }
+
+        if (isNonSpeakableAgentPayload(wrappedContent) || isNonSpeakableAgentPayload(content)) {
+            continue
+        }
+
+        const speakable = extractSpeakableFromContent(content)
+        if (speakable) {
+            return speakable
+        }
+    }
+
+    return null
+}
+
 export function formatNewSingleMessage(sessionId: string, message: DecryptedMessage): string | null {
     const formatted = formatMessage(message)
     if (!formatted) {
@@ -199,6 +294,16 @@ export function formatSessionFocus(sessionId: string, _metadata?: SessionMetadat
     return `Session became focused: ${sessionId}`
 }
 
-export function formatReadyEvent(sessionId: string): string {
-    return `Claude Code done working in session: ${sessionId}. The previous message(s) are the summary of the work done. Report this to the human immediately.`
+/**
+ * Format the "agent finished" prompt for the voice assistant.
+ *
+ * Embeds lastAssistantText when it is non-blank; otherwise tells the
+ * assistant to rely on the latest agent message already in context.
+ */
+export function formatReadyEvent(sessionId: string, lastAssistantText?: string | null): string {
+    const trimmed = lastAssistantText?.trim()
+    if (trimmed) {
+        return `The coding agent finished working in session: ${sessionId}. Summarize this for the human immediately:\n${trimmed}`
+    }
+    return `The coding agent finished working in session: ${sessionId}. Use the latest agent message already present in context and summarize it for the human immediately.`
 }
diff --git a/web/src/realtime/hooks/voiceHooks.ts b/web/src/realtime/hooks/voiceHooks.ts
index 5f32193752..c6318d3c16 100644
--- a/web/src/realtime/hooks/voiceHooks.ts
+++ b/web/src/realtime/hooks/voiceHooks.ts
@@ -6,7 +6,8 @@ import {
     formatSessionFocus,
     formatSessionFull,
     formatSessionOffline,
-    formatSessionOnline
+    formatSessionOnline,
+    extractLastAssistantSpeakable
 } from './contextFormatters'
 import { VOICE_CONFIG } from '../voiceConfig'
 import type { DecryptedMessage, Session } from '@/types/api'
@@ -147,7 +148,9 @@ export const voiceHooks = {
         if (VOICE_CONFIG.DISABLE_READY_EVENTS) return
 
         reportSession(sessionId)
-        reportTextUpdate(formatReadyEvent(sessionId))
+        const messages = messagesGetter?.(sessionId) ?? []
+        const lastAssistantText = extractLastAssistantSpeakable(messages)
+        reportTextUpdate(formatReadyEvent(sessionId, lastAssistantText))
     },
 
     /**
diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts
index a7fa2fbe99..58b7b229e3 100644
--- a/web/src/realtime/index.ts
+++ b/web/src/realtime/index.ts
@@ -32,7 +32,8 @@ export {
     formatSessionOffline,
     formatSessionFocus,
     formatPermissionRequest,
-    formatReadyEvent
+    formatReadyEvent,
+    extractLastAssistantSpeakable
 } from './hooks/contextFormatters'
 
 // Config