From afa79665f67ab2eb31e478864b51a5d658ed868f Mon Sep 17 00:00:00 2001 From: Aniket Dixit <47004499+dixitaniket@users.noreply.github.com> Date: Thu, 4 Jun 2026 02:13:43 +0530 Subject: [PATCH 1/2] image gen format fixes (#91) * testing image format fix * review fixes * lint fix --- tee_gateway/controllers/chat_controller.py | 4 +--- tee_gateway/llm_backend.py | 25 ++++++++++++++++------ tee_gateway/test/test_tee_core.py | 22 +++++++++++++------ tests/test_structured_outputs.py | 23 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/tee_gateway/controllers/chat_controller.py b/tee_gateway/controllers/chat_controller.py index 291a684..08f058f 100644 --- a/tee_gateway/controllers/chat_controller.py +++ b/tee_gateway/controllers/chat_controller.py @@ -911,9 +911,7 @@ def _chat_request_to_dict(chat_request: CreateChatCompletionRequest) -> dict: messages.append( { "role": "user", - "content": msg.content - if isinstance(msg.content, str) - else str(msg.content), + "content": msg.content, } ) elif isinstance(msg, ChatCompletionRequestAssistantMessage): diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py index 0e80d22..c0da980 100644 --- a/tee_gateway/llm_backend.py +++ b/tee_gateway/llm_backend.py @@ -325,6 +325,17 @@ def generate_images(model: str, prompt: str, n: int = 1) -> tuple[list[str], int return images, len(images) +def _normalize_user_content_parts(content: list) -> list: + """Preserve multimodal user content while tolerating primitive text parts.""" + normalized = [] + for part in content: + if isinstance(part, dict): + normalized.append(part) + else: + normalized.append({"type": "text", "text": str(part)}) + return normalized + + def convert_messages(messages: list) -> List[Any]: """Convert OpenAI-format message objects or dicts to LangChain message objects.""" langchain_messages: List[BaseMessage] = [] @@ -333,13 +344,17 @@ def convert_messages(messages: list) -> List[Any]: # Support both OpenAPI model objects and plain dicts if isinstance(msg, dict): role = msg.get("role", "").lower() - content = msg.get("content", "") or "" + content = msg.get("content", "") + if content is None: + content = "" tool_calls = msg.get("tool_calls") tool_call_id = msg.get("tool_call_id") name = msg.get("name") else: role = getattr(msg, "role", "").lower() - content = getattr(msg, "content", "") or "" + content = getattr(msg, "content", "") + if content is None: + content = "" tool_calls = getattr(msg, "tool_calls", None) tool_call_id = getattr(msg, "tool_call_id", None) name = getattr(msg, "name", None) @@ -348,12 +363,8 @@ def convert_messages(messages: list) -> List[Any]: langchain_messages.append(SystemMessage(content=content)) elif role == "user": - # content may be a string or a list of content parts; handle both if isinstance(content, list): - content = "".join( - part.get("text", "") if isinstance(part, dict) else str(part) - for part in content - ) + content = _normalize_user_content_parts(content) langchain_messages.append(HumanMessage(content=content)) elif role == "assistant": diff --git a/tee_gateway/test/test_tee_core.py b/tee_gateway/test/test_tee_core.py index 41aac33..2411176 100644 --- a/tee_gateway/test/test_tee_core.py +++ b/tee_gateway/test/test_tee_core.py @@ -569,20 +569,30 @@ def test_multi_turn_order_preserved(self): self.assertIsInstance(result[2], AIMessage) def test_user_content_as_list_of_parts(self): - """Multimodal content parts should be concatenated into a single string.""" + """Multimodal content parts should be preserved for vision-capable models.""" + content = [ + {"type": "text", "text": "Hello world"}, + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,abcd"}, + }, + ] result = convert_messages( [ { "role": "user", - "content": [ - {"type": "text", "text": "Hello "}, - {"type": "text", "text": "world"}, - ], + "content": content, } ] ) self.assertIsInstance(result[0], HumanMessage) - self.assertEqual(result[0].content, "Hello world") + self.assertEqual(result[0].content, content) + + def test_empty_user_content_list_is_preserved(self): + """Empty multimodal content lists should not be coerced to empty strings.""" + result = convert_messages([{"role": "user", "content": []}]) + self.assertIsInstance(result[0], HumanMessage) + self.assertEqual(result[0].content, []) def test_full_tool_call_conversation(self): """End-to-end multi-turn with tool use: user → assistant (tool call) → tool result.""" diff --git a/tests/test_structured_outputs.py b/tests/test_structured_outputs.py index 3db565f..71fbd20 100644 --- a/tests/test_structured_outputs.py +++ b/tests/test_structured_outputs.py @@ -9,6 +9,9 @@ from tee_gateway.models.create_chat_completion_request import ( CreateChatCompletionRequest, ) +from tee_gateway.models.chat_completion_request_user_message import ( + ChatCompletionRequestUserMessage, +) class TestResponseFormatParsing(unittest.TestCase): @@ -109,6 +112,26 @@ def test_hash_differs_with_and_without_response_format(self): h2 = json.dumps(chat_request_to_dict(req_json), sort_keys=True) self.assertNotEqual(h1, h2) + def test_hash_dict_preserves_multimodal_user_content(self): + content = [ + {"type": "text", "text": "Describe this image"}, + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,abcd"}, + }, + ] + req = CreateChatCompletionRequest( + model="gpt-4.1", + messages=[ChatCompletionRequestUserMessage(role="user", content=content)], + temperature=1.0, + ) + request_dict = chat_request_to_dict(req) + self.assertEqual(request_dict["messages"][0]["content"], content) + + dumped_once = json.dumps(request_dict, sort_keys=True) + dumped_twice = json.dumps(chat_request_to_dict(req), sort_keys=True) + self.assertEqual(dumped_once, dumped_twice) + class TestResponseFormatModelBinding(unittest.TestCase): """Tests that response_format is bound to the model before invocation.""" From d4d72ea3d8f9e53334021f55634c32146a79d7d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 12:54:30 +0000 Subject: [PATCH 2/2] Minimize attachment handling: keep provider-native image pass-through Revert the bespoke image-conversion path in convert_messages to main's raw pass-through (text/image parts already convert correctly to every provider's native API, so images keep working untouched). Only file/PDF parts are rewritten to LangChain standard file blocks, since Anthropic needs a 'document' block and rejects OpenAI's raw file shape. Capability gating, the per-request size cap, and request-hash canonicalization are retained. Drop the design doc. --- docs/native-attachments-design.md | 231 ------------------------------ tee_gateway/llm_backend.py | 48 ++++--- tee_gateway/test/test_tee_core.py | 30 ++-- 3 files changed, 45 insertions(+), 264 deletions(-) delete mode 100644 docs/native-attachments-design.md diff --git a/docs/native-attachments-design.md b/docs/native-attachments-design.md deleted file mode 100644 index c1a637c..0000000 --- a/docs/native-attachments-design.md +++ /dev/null @@ -1,231 +0,0 @@ -# Design: Native LLM Attachments over the Private (OHTTP) Path - -## Status - -Proposal. Spans three repos: `chat-app` (browser), `chat-api` (relay), `tee-gateway` -(enclave). The bulk of the change lands in `tee-gateway`. - -## Motivation - -Today attachments are handled by **server-side parsing in `chat-api`**: - -- `chat-api/src/core/attachments.py` downloads each attachment and runs PyMuPDF / - python-docx to extract **plain text**, then injects that text into the prompt. -- Images are classified by content-type and passed through as URLs. - -This is the wrong layer to solve the problem: - -1. **It throws away everything the models do natively.** Modern Claude / GPT / - Gemini ingest PDFs and images directly — layout, tables, figures, charts, - handwriting, embedded images. Flattening a PDF to `page.get_text()` loses all - of that and feeds the model a worse input than it could handle itself. -2. **It only works on the non-private path.** The parsing in `attachments.py` is - invoked exclusively from the regular `POST /api/v1/chat` handler. On the - **OHTTP path**, `chat-api` is a dumb relay — it forwards opaque ciphertext to - the enclave and never sees the body — so attachments are simply not processed. - Worse, in the enclave `llm_backend.convert_messages` flattens multimodal - content parts to text only (`"".join(part.get("text", "") ...)`), so any - `image_url` part is **silently dropped** before it reaches the provider. - -Net result: **attachments and privacy are currently mutually exclusive.** -Attachments only work on the route where `chat-api` reads the plaintext, and the -private route drops them. - -## Goal - -Send attachments to the model **natively**, on the **private (OHTTP) path**: - -- No server-side text extraction. The file bytes reach the model as a native - image/document content part. -- `chat-api` and Cloudflare never see attachment plaintext (same trust boundary - as the message text already enjoys on OHTTP). -- The enclave converts the inner request's multimodal content into each - provider's native format via LangChain. - -## Trust boundary (what this does and does not hide) - -- **Hidden from:** the browser→relay transport, `chat-api`, the OHTTP relay, - Cloudflare/R2. They see only HPKE ciphertext. -- **Visible to:** the enclave (it decrypts — that's the trust anchor) and the - **upstream LLM provider** (OpenAI/Anthropic/Google/xAI/ByteDance), which - receives the attachment as part of the completion request. This is identical - to how message *text* is already handled: whatever you send the model, the - model provider sees. Fully provider-blind attachments would require the model - to run inside the TEE and are out of scope here. - -## Transport: how the attachment reaches the enclave - -### Phase 1 — inline base64 (recommended starting point) - -The browser embeds the file directly in the message content as a standard -OpenAI-style content part, inside the HPKE-encrypted OHTTP payload: - -```jsonc -{ - "model": "claude-sonnet-4-6", - "messages": [ - { - "role": "user", - "content": [ - { "type": "text", "text": "Summarize this contract." }, - { "type": "image_url", - "image_url": { "url": "data:image/png;base64,iVBORw0K..." } }, - { "type": "file", - "file": { "filename": "contract.pdf", - "file_data": "data:application/pdf;base64,JVBERi0..." } } - ] - } - ] -} -``` - -- Pros: nothing outside the enclave/provider ever sees the bytes; no R2 round - trip; no presigned-URL machinery; no SSRF surface. -- Cons: base64 inflates ~33%; bounded by request/OHTTP size limits; no - persistence (re-sent each turn). Fine for the common case (a few MB of PDF or - an image). Enforce a hard per-request attachment-bytes cap in the enclave. - -### Phase 2 — encrypted blob in R2 (only if large files / persistence needed) - -Browser client-side-encrypts the file (AES-GCM), uploads **ciphertext** to R2 -(Cloudflare sees only ciphertext), and includes inside the OHTTP payload an R2 -reference plus the AES key **wrapped to the TEE attestation/HPKE public key**. -The enclave fetches the ciphertext and decrypts internally. Defer until Phase 1 -limits become a real constraint. - -> Note: do **not** go back to plaintext-in-R2 + presigned URLs. That reintroduces -> the public-bearer-token leak and the SSRF surface in `attachments.py`. - -## Enclave changes (`tee-gateway`) — the core of the work - -### 1. `convert_messages` must preserve multimodal content - -`llm_backend.py:248-255` currently does: - -```python -elif role == "user": - if isinstance(content, list): - content = "".join( - part.get("text", "") if isinstance(part, dict) else str(part) - for part in content - ) - langchain_messages.append(HumanMessage(content=content)) -``` - -Replace the flattening with a converter that maps the inbound OpenAI-style -content parts to **LangChain v1 standard content blocks** (`langchain_core. -messages.content` — `ImageContentBlock`, `FileContentBlock`). Building the -*standard* blocks (rather than raw OpenAI `image_url`/`file` dicts) is important: -each provider package translates them into its own native API, so one code path -covers Anthropic, OpenAI, Gemini, and xAI uniformly. - -- `text` → `{"type": "text", "text": ...}` -- image (base64 data URI or https) → - `{"type": "image", "base64": ..., "mime_type": "image/png"}` (or `"url": ...`) -- document/PDF (base64) → - `{"type": "file", "base64": ..., "mime_type": "application/pdf", - "filename": ""}` - -Keep a `HumanMessage` with a **list** content when parts are present; only -collapse to a plain string when the message is text-only (preserves current -behavior for the no-attachment case). - -**Verified** against the pinned versions (see "Dependency check" below): a -`HumanMessage` carrying these standard blocks converts correctly outbound — -Anthropic emits `{"type":"document","source":{"type":"base64","media_type": -"application/pdf",...}}`, OpenAI emits `{"type":"file","file":{"file_data": -"data:application/pdf;base64,...","filename":...}}`. **Carry the original -`filename`** on file blocks — OpenAI requires one and otherwise substitutes a -placeholder (`LC_AUTOGENERATED`). - -### 2. No new dependencies (PCR constraint) — confirmed - -Native handoff means the enclave does **not** parse PDFs/DOCX itself — it passes -the bytes to the provider. So we should **not** add PyMuPDF/python-docx to -`tee-gateway`. - -**Dependency check (done).** The currently pinned versions already support -standard image *and* file (PDF) content blocks with base64, across every -provider we route to — so **this change needs no dependency bump and the PCR -measurements stay stable**: - -| Package | Pinned | Native file/image support | -|---|---|---| -| `langchain-core` | 1.2.26 | Defines `ImageContentBlock` / `FileContentBlock` (base64, url, file_id, mime_type) | -| `langchain-anthropic` | 1.4.0 | `file` → `document` (defaults `application/pdf`); image → base64 source | -| `langchain-openai` | 1.1.12 | `file` → `file_data` data-URI / `input_file`; image → `image_url` | -| `langchain-google-genai` | 4.2.1 | document/image blocks supported | -| `langchain-xai` | 1.2.2 | subclass of `BaseChatOpenAI` → inherits OpenAI handling | - -This was verified functionally (not just by reading types) by running the -Anthropic and OpenAI outbound message converters over a multimodal -`HumanMessage`. Per-model *acceptance* of PDFs still depends on the model itself -(see capability gating below). - -### 3. Per-provider capability gating - -Not every model accepts every modality. Extend `model_registry` with capability -flags (e.g. `supports_image`, `supports_pdf`) and reject (clear 4xx inside the -inner request) when a request sends a modality the target model can't handle, -rather than silently dropping it as today. - -### 4. Request signing / hashing - -`chat_controller.py` (~645-651) hashes user content via `str(msg.content)`. With -multimodal content that would hash megabytes of base64 and is not canonical. -Define a stable hashing rule, e.g. hash each attachment as -`sha256(mime_type || raw_bytes)` and include those digests (not the base64) in -the canonical request JSON that feeds `keccak256(requestHash ...)`. This keeps -signatures meaningful and bounded while still committing to the exact attachment -content. - -### 5. Limits & validation - -- Hard cap on total attachment bytes per request (post-decode). -- Allowlist of accepted mime types per modality. -- Reject `image_url` values that are remote `https` URLs on the private path if - we want to guarantee the enclave makes no outbound fetch for user content - (Phase 1 = base64 only). Decide explicitly. - -## `chat-api` changes - -- OHTTP path: **no change needed** to the relay itself — attachments ride inside - the encrypted payload it already forwards opaquely. -- Regular `POST /api/v1/chat` path: stop calling `load_documents` / - `is_image_url` and stop injecting extracted text. Either (a) build native - content parts here too, or (b) deprecate attachment support on the non-private - path and route all attachments through OHTTP. Recommend (b) for a single code - path. -- The presigned-URL / `attachments: string[]` machinery and `attachments.py` - become dead code for inference and can be removed once Phase 1 ships (R2 may - still be used for chat-history storage — that is a separate concern and should - be client-side-encrypted if kept). - -## `chat-app` changes - -- Replace "upload to R2 → store presigned URL → send URL in `attachments`" with: - read the file in the browser, base64-encode, and add a native `image_url` / - `file` content part to the outgoing (to-be-encrypted) message. -- Enforce client-side size/type limits matching the enclave caps; surface a clear - error when a file exceeds them. -- Drop the presigned-upload/download hooks from the send path. - -## Rollout - -1. Enclave: `convert_messages` multimodal support + capability flags + hashing + - limits (behind the existing OHTTP path). Ship and verify PCRs. -2. `chat-app`: send native base64 content parts on the OHTTP path. -3. Remove server-side parsing from `chat-api`; retire `attachments.py` and the - presigned-URL attachment flow. -4. (Optional, later) Phase 2 encrypted-R2-blob for large files. - -## Open questions - -- ~~Pinned `langchain-*` versions: do they already support `file` (PDF) content - blocks?~~ **Resolved:** yes, all five providers — no dep bump / PCR change - needed (see Dependency check above). -- Hard size cap value for inline attachments, and the OHTTP request size ceiling. -- Keep or drop attachment support entirely on the non-private path? -- Source of truth for per-model `supports_image` / `supports_pdf` flags — note - `langchain-*` ships `ModelProfile` data (e.g. `langchain_xai/data/_profiles`) - that may already encode some of this. diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py index b41f612..aa845d3 100644 --- a/tee_gateway/llm_backend.py +++ b/tee_gateway/llm_backend.py @@ -425,27 +425,27 @@ def _convert_content_part(part: Any) -> Optional[Dict[str, Any]]: return {"type": "text", "text": text} if text else None -def _convert_user_content(content: Any) -> Any: - """Convert user-message content into a value accepted by ``HumanMessage``. - - A list of OpenAI content parts becomes a list of LangChain standard content - blocks. When every part is text, it collapses back to a plain string so simple - requests stay simple (and to preserve prior behavior). Non-list content is - returned unchanged. +def _normalize_user_content_parts(content: list) -> list: + """Pass OpenAI content parts through to LangChain mostly unchanged. + + Text and image parts already convert correctly to every provider's native + API in their OpenAI form, so they are forwarded as-is. Only ``file`` / + ``input_file`` parts are rewritten into LangChain standard file blocks: the + raw OpenAI ``{"type": "file", "file": {...}}`` shape is passed straight + through to providers like Anthropic, which expect a ``document`` block and + would otherwise reject it. Primitive (non-dict) parts are wrapped as text. """ - if not isinstance(content, list): - return content - - blocks: List[Dict[str, Any]] = [] + normalized: List[Any] = [] for part in content: - block = _convert_content_part(part) - if block is not None: - blocks.append(block) - - if blocks and all(b["type"] == "text" for b in blocks): - return "".join(b["text"] for b in blocks) - - return blocks + if isinstance(part, dict): + if part.get("type") in ("file", "input_file"): + block = _convert_content_part(part) + normalized.append(block if block is not None else part) + else: + normalized.append(part) + else: + normalized.append({"type": "text", "text": str(part)}) + return normalized class AttachmentValidationError(ValueError): @@ -559,10 +559,12 @@ def convert_messages(messages: list) -> List[Any]: elif role == "user": # content may be a string or a list of multimodal content parts - # (text / image / file); convert to native LangChain content blocks. - langchain_messages.append( - HumanMessage(content=_convert_user_content(content)) - ) + # (text / image / file). Pass parts through as-is (file parts are + # normalized to standard LangChain blocks) so the providers handle + # the native conversion. + if isinstance(content, list): + content = _normalize_user_content_parts(content) + langchain_messages.append(HumanMessage(content=content)) elif role == "assistant": if tool_calls: diff --git a/tee_gateway/test/test_tee_core.py b/tee_gateway/test/test_tee_core.py index 1647e1f..aea5c25 100644 --- a/tee_gateway/test/test_tee_core.py +++ b/tee_gateway/test/test_tee_core.py @@ -577,8 +577,8 @@ def test_multi_turn_order_preserved(self): self.assertIsInstance(result[1], HumanMessage) self.assertIsInstance(result[2], AIMessage) - def test_user_content_text_only_parts_collapse_to_string(self): - """A list of text-only parts collapses back to a plain string.""" + def test_user_content_text_parts_passthrough(self): + """A list of text parts is passed through unchanged for the provider.""" result = convert_messages( [ { @@ -591,7 +591,13 @@ def test_user_content_text_only_parts_collapse_to_string(self): ] ) self.assertIsInstance(result[0], HumanMessage) - self.assertEqual(result[0].content, "Hello world") + self.assertEqual( + result[0].content, + [ + {"type": "text", "text": "Hello "}, + {"type": "text", "text": "world"}, + ], + ) def test_empty_user_content_list_is_preserved(self): """Empty multimodal content lists should not be coerced to empty strings.""" @@ -600,8 +606,8 @@ def test_empty_user_content_list_is_preserved(self): self.assertEqual(result[0].content, []) def test_user_content_with_base64_image(self): - """An image_url data URI becomes a standard image content block, so the - image survives conversion instead of being dropped.""" + """An image_url part is passed through unchanged; each provider converts + it to its native image format at send time.""" result = convert_messages( [ { @@ -624,9 +630,8 @@ def test_user_content_with_base64_image(self): self.assertEqual( content[1], { - "type": "image", - "base64": "iVBORw0KGgoAAAANSUhEUg==", - "mime_type": "image/png", + "type": "image_url", + "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="}, }, ) @@ -663,7 +668,7 @@ def test_user_content_with_base64_pdf(self): ) def test_user_content_image_remote_url(self): - """A non-data-URI image URL is passed through as a url image block.""" + """A remote (non-data-URI) image URL part is passed through unchanged.""" result = convert_messages( [ { @@ -679,7 +684,12 @@ def test_user_content_image_remote_url(self): ) self.assertEqual( result[0].content, - [{"type": "image", "url": "https://example.com/cat.png"}], + [ + { + "type": "image_url", + "image_url": {"url": "https://example.com/cat.png"}, + } + ], ) def test_multimodal_blocks_convert_for_providers(self):