From afa79665f67ab2eb31e478864b51a5d658ed868f Mon Sep 17 00:00:00 2001
From: Aniket Dixit <47004499+dixitaniket@users.noreply.github.com>
Date: Thu, 4 Jun 2026 02:13:43 +0530
Subject: [PATCH 1/2] image gen format fixes (#91)

* testing image format fix

* review fixes

* lint fix
---
 tee_gateway/controllers/chat_controller.py |  4 +---
 tee_gateway/llm_backend.py                 | 25 ++++++++++++++++------
 tee_gateway/test/test_tee_core.py          | 22 +++++++++++++------
 tests/test_structured_outputs.py           | 23 ++++++++++++++++++++
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/tee_gateway/controllers/chat_controller.py b/tee_gateway/controllers/chat_controller.py
index 291a684..08f058f 100644
--- a/tee_gateway/controllers/chat_controller.py
+++ b/tee_gateway/controllers/chat_controller.py
@@ -911,9 +911,7 @@ def _chat_request_to_dict(chat_request: CreateChatCompletionRequest) -> dict:
             messages.append(
                 {
                     "role": "user",
-                    "content": msg.content
-                    if isinstance(msg.content, str)
-                    else str(msg.content),
+                    "content": msg.content,
                 }
             )
         elif isinstance(msg, ChatCompletionRequestAssistantMessage):
diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py
index 0e80d22..c0da980 100644
--- a/tee_gateway/llm_backend.py
+++ b/tee_gateway/llm_backend.py
@@ -325,6 +325,17 @@ def generate_images(model: str, prompt: str, n: int = 1) -> tuple[list[str], int
     return images, len(images)
 
 
+def _normalize_user_content_parts(content: list) -> list:
+    """Preserve multimodal user content while tolerating primitive text parts."""
+    normalized = []
+    for part in content:
+        if isinstance(part, dict):
+            normalized.append(part)
+        else:
+            normalized.append({"type": "text", "text": str(part)})
+    return normalized
+
+
 def convert_messages(messages: list) -> List[Any]:
     """Convert OpenAI-format message objects or dicts to LangChain message objects."""
     langchain_messages: List[BaseMessage] = []
@@ -333,13 +344,17 @@ def convert_messages(messages: list) -> List[Any]:
         # Support both OpenAPI model objects and plain dicts
         if isinstance(msg, dict):
             role = msg.get("role", "").lower()
-            content = msg.get("content", "") or ""
+            content = msg.get("content", "")
+            if content is None:
+                content = ""
             tool_calls = msg.get("tool_calls")
             tool_call_id = msg.get("tool_call_id")
             name = msg.get("name")
         else:
             role = getattr(msg, "role", "").lower()
-            content = getattr(msg, "content", "") or ""
+            content = getattr(msg, "content", "")
+            if content is None:
+                content = ""
             tool_calls = getattr(msg, "tool_calls", None)
             tool_call_id = getattr(msg, "tool_call_id", None)
             name = getattr(msg, "name", None)
@@ -348,12 +363,8 @@ def convert_messages(messages: list) -> List[Any]:
             langchain_messages.append(SystemMessage(content=content))
 
         elif role == "user":
-            # content may be a string or a list of content parts; handle both
             if isinstance(content, list):
-                content = "".join(
-                    part.get("text", "") if isinstance(part, dict) else str(part)
-                    for part in content
-                )
+                content = _normalize_user_content_parts(content)
             langchain_messages.append(HumanMessage(content=content))
 
         elif role == "assistant":
diff --git a/tee_gateway/test/test_tee_core.py b/tee_gateway/test/test_tee_core.py
index 41aac33..2411176 100644
--- a/tee_gateway/test/test_tee_core.py
+++ b/tee_gateway/test/test_tee_core.py
@@ -569,20 +569,30 @@ def test_multi_turn_order_preserved(self):
         self.assertIsInstance(result[2], AIMessage)
 
     def test_user_content_as_list_of_parts(self):
-        """Multimodal content parts should be concatenated into a single string."""
+        """Multimodal content parts should be preserved for vision-capable models."""
+        content = [
+            {"type": "text", "text": "Hello world"},
+            {
+                "type": "image_url",
+                "image_url": {"url": "data:image/png;base64,abcd"},
+            },
+        ]
         result = convert_messages(
             [
                 {
                     "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Hello "},
-                        {"type": "text", "text": "world"},
-                    ],
+                    "content": content,
                 }
             ]
         )
         self.assertIsInstance(result[0], HumanMessage)
-        self.assertEqual(result[0].content, "Hello world")
+        self.assertEqual(result[0].content, content)
+
+    def test_empty_user_content_list_is_preserved(self):
+        """Empty multimodal content lists should not be coerced to empty strings."""
+        result = convert_messages([{"role": "user", "content": []}])
+        self.assertIsInstance(result[0], HumanMessage)
+        self.assertEqual(result[0].content, [])
 
     def test_full_tool_call_conversation(self):
         """End-to-end multi-turn with tool use: user → assistant (tool call) → tool result."""
diff --git a/tests/test_structured_outputs.py b/tests/test_structured_outputs.py
index 3db565f..71fbd20 100644
--- a/tests/test_structured_outputs.py
+++ b/tests/test_structured_outputs.py
@@ -9,6 +9,9 @@
 from tee_gateway.models.create_chat_completion_request import (
     CreateChatCompletionRequest,
 )
+from tee_gateway.models.chat_completion_request_user_message import (
+    ChatCompletionRequestUserMessage,
+)
 
 
 class TestResponseFormatParsing(unittest.TestCase):
@@ -109,6 +112,26 @@ def test_hash_differs_with_and_without_response_format(self):
         h2 = json.dumps(chat_request_to_dict(req_json), sort_keys=True)
         self.assertNotEqual(h1, h2)
 
+    def test_hash_dict_preserves_multimodal_user_content(self):
+        content = [
+            {"type": "text", "text": "Describe this image"},
+            {
+                "type": "image_url",
+                "image_url": {"url": "data:image/png;base64,abcd"},
+            },
+        ]
+        req = CreateChatCompletionRequest(
+            model="gpt-4.1",
+            messages=[ChatCompletionRequestUserMessage(role="user", content=content)],
+            temperature=1.0,
+        )
+        request_dict = chat_request_to_dict(req)
+        self.assertEqual(request_dict["messages"][0]["content"], content)
+
+        dumped_once = json.dumps(request_dict, sort_keys=True)
+        dumped_twice = json.dumps(chat_request_to_dict(req), sort_keys=True)
+        self.assertEqual(dumped_once, dumped_twice)
+
 
 class TestResponseFormatModelBinding(unittest.TestCase):
     """Tests that response_format is bound to the model before invocation."""

From d4d72ea3d8f9e53334021f55634c32146a79d7d8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 12:54:30 +0000
Subject: [PATCH 2/2] Minimize attachment handling: keep provider-native image
 pass-through

Revert the bespoke image-conversion path in convert_messages to main's raw
pass-through (text/image parts already convert correctly to every provider's
native API, so images keep working untouched). Only file/PDF parts are
rewritten to LangChain standard file blocks, since Anthropic needs a
'document' block and rejects OpenAI's raw file shape.

Capability gating, the per-request size cap, and request-hash canonicalization
are retained. Drop the design doc.
---
 docs/native-attachments-design.md | 231 ------------------------------
 tee_gateway/llm_backend.py        |  48 ++++---
 tee_gateway/test/test_tee_core.py |  30 ++--
 3 files changed, 45 insertions(+), 264 deletions(-)
 delete mode 100644 docs/native-attachments-design.md

diff --git a/docs/native-attachments-design.md b/docs/native-attachments-design.md
deleted file mode 100644
index c1a637c..0000000
--- a/docs/native-attachments-design.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# Design: Native LLM Attachments over the Private (OHTTP) Path
-
-## Status
-
-Proposal. Spans three repos: `chat-app` (browser), `chat-api` (relay), `tee-gateway`
-(enclave). The bulk of the change lands in `tee-gateway`.
-
-## Motivation
-
-Today attachments are handled by **server-side parsing in `chat-api`**:
-
-- `chat-api/src/core/attachments.py` downloads each attachment and runs PyMuPDF /
-  python-docx to extract **plain text**, then injects that text into the prompt.
-- Images are classified by content-type and passed through as URLs.
-
-This is the wrong layer to solve the problem:
-
-1. **It throws away everything the models do natively.** Modern Claude / GPT /
-   Gemini ingest PDFs and images directly — layout, tables, figures, charts,
-   handwriting, embedded images. Flattening a PDF to `page.get_text()` loses all
-   of that and feeds the model a worse input than it could handle itself.
-2. **It only works on the non-private path.** The parsing in `attachments.py` is
-   invoked exclusively from the regular `POST /api/v1/chat` handler. On the
-   **OHTTP path**, `chat-api` is a dumb relay — it forwards opaque ciphertext to
-   the enclave and never sees the body — so attachments are simply not processed.
-   Worse, in the enclave `llm_backend.convert_messages` flattens multimodal
-   content parts to text only (`"".join(part.get("text", "") ...)`), so any
-   `image_url` part is **silently dropped** before it reaches the provider.
-
-Net result: **attachments and privacy are currently mutually exclusive.**
-Attachments only work on the route where `chat-api` reads the plaintext, and the
-private route drops them.
-
-## Goal
-
-Send attachments to the model **natively**, on the **private (OHTTP) path**:
-
-- No server-side text extraction. The file bytes reach the model as a native
-  image/document content part.
-- `chat-api` and Cloudflare never see attachment plaintext (same trust boundary
-  as the message text already enjoys on OHTTP).
-- The enclave converts the inner request's multimodal content into each
-  provider's native format via LangChain.
-
-## Trust boundary (what this does and does not hide)
-
-- **Hidden from:** the browser→relay transport, `chat-api`, the OHTTP relay,
-  Cloudflare/R2. They see only HPKE ciphertext.
-- **Visible to:** the enclave (it decrypts — that's the trust anchor) and the
-  **upstream LLM provider** (OpenAI/Anthropic/Google/xAI/ByteDance), which
-  receives the attachment as part of the completion request. This is identical
-  to how message *text* is already handled: whatever you send the model, the
-  model provider sees. Fully provider-blind attachments would require the model
-  to run inside the TEE and are out of scope here.
-
-## Transport: how the attachment reaches the enclave
-
-### Phase 1 — inline base64 (recommended starting point)
-
-The browser embeds the file directly in the message content as a standard
-OpenAI-style content part, inside the HPKE-encrypted OHTTP payload:
-
-```jsonc
-{
-  "model": "claude-sonnet-4-6",
-  "messages": [
-    {
-      "role": "user",
-      "content": [
-        { "type": "text", "text": "Summarize this contract." },
-        { "type": "image_url",
-          "image_url": { "url": "data:image/png;base64,iVBORw0K..." } },
-        { "type": "file",
-          "file": { "filename": "contract.pdf",
-                    "file_data": "data:application/pdf;base64,JVBERi0..." } }
-      ]
-    }
-  ]
-}
-```
-
-- Pros: nothing outside the enclave/provider ever sees the bytes; no R2 round
-  trip; no presigned-URL machinery; no SSRF surface.
-- Cons: base64 inflates ~33%; bounded by request/OHTTP size limits; no
-  persistence (re-sent each turn). Fine for the common case (a few MB of PDF or
-  an image). Enforce a hard per-request attachment-bytes cap in the enclave.
-
-### Phase 2 — encrypted blob in R2 (only if large files / persistence needed)
-
-Browser client-side-encrypts the file (AES-GCM), uploads **ciphertext** to R2
-(Cloudflare sees only ciphertext), and includes inside the OHTTP payload an R2
-reference plus the AES key **wrapped to the TEE attestation/HPKE public key**.
-The enclave fetches the ciphertext and decrypts internally. Defer until Phase 1
-limits become a real constraint.
-
-> Note: do **not** go back to plaintext-in-R2 + presigned URLs. That reintroduces
-> the public-bearer-token leak and the SSRF surface in `attachments.py`.
-
-## Enclave changes (`tee-gateway`) — the core of the work
-
-### 1. `convert_messages` must preserve multimodal content
-
-`llm_backend.py:248-255` currently does:
-
-```python
-elif role == "user":
-    if isinstance(content, list):
-        content = "".join(
-            part.get("text", "") if isinstance(part, dict) else str(part)
-            for part in content
-        )
-    langchain_messages.append(HumanMessage(content=content))
-```
-
-Replace the flattening with a converter that maps the inbound OpenAI-style
-content parts to **LangChain v1 standard content blocks** (`langchain_core.
-messages.content` — `ImageContentBlock`, `FileContentBlock`). Building the
-*standard* blocks (rather than raw OpenAI `image_url`/`file` dicts) is important:
-each provider package translates them into its own native API, so one code path
-covers Anthropic, OpenAI, Gemini, and xAI uniformly.
-
-- `text` → `{"type": "text", "text": ...}`
-- image (base64 data URI or https) →
-  `{"type": "image", "base64": ..., "mime_type": "image/png"}` (or `"url": ...`)
-- document/PDF (base64) →
-  `{"type": "file", "base64": ..., "mime_type": "application/pdf",
-    "filename": "<original name>"}`
-
-Keep a `HumanMessage` with a **list** content when parts are present; only
-collapse to a plain string when the message is text-only (preserves current
-behavior for the no-attachment case).
-
-**Verified** against the pinned versions (see "Dependency check" below): a
-`HumanMessage` carrying these standard blocks converts correctly outbound —
-Anthropic emits `{"type":"document","source":{"type":"base64","media_type":
-"application/pdf",...}}`, OpenAI emits `{"type":"file","file":{"file_data":
-"data:application/pdf;base64,...","filename":...}}`. **Carry the original
-`filename`** on file blocks — OpenAI requires one and otherwise substitutes a
-placeholder (`LC_AUTOGENERATED`).
-
-### 2. No new dependencies (PCR constraint) — confirmed
-
-Native handoff means the enclave does **not** parse PDFs/DOCX itself — it passes
-the bytes to the provider. So we should **not** add PyMuPDF/python-docx to
-`tee-gateway`.
-
-**Dependency check (done).** The currently pinned versions already support
-standard image *and* file (PDF) content blocks with base64, across every
-provider we route to — so **this change needs no dependency bump and the PCR
-measurements stay stable**:
-
-| Package | Pinned | Native file/image support |
-|---|---|---|
-| `langchain-core` | 1.2.26 | Defines `ImageContentBlock` / `FileContentBlock` (base64, url, file_id, mime_type) |
-| `langchain-anthropic` | 1.4.0 | `file` → `document` (defaults `application/pdf`); image → base64 source |
-| `langchain-openai` | 1.1.12 | `file` → `file_data` data-URI / `input_file`; image → `image_url` |
-| `langchain-google-genai` | 4.2.1 | document/image blocks supported |
-| `langchain-xai` | 1.2.2 | subclass of `BaseChatOpenAI` → inherits OpenAI handling |
-
-This was verified functionally (not just by reading types) by running the
-Anthropic and OpenAI outbound message converters over a multimodal
-`HumanMessage`. Per-model *acceptance* of PDFs still depends on the model itself
-(see capability gating below).
-
-### 3. Per-provider capability gating
-
-Not every model accepts every modality. Extend `model_registry` with capability
-flags (e.g. `supports_image`, `supports_pdf`) and reject (clear 4xx inside the
-inner request) when a request sends a modality the target model can't handle,
-rather than silently dropping it as today.
-
-### 4. Request signing / hashing
-
-`chat_controller.py` (~645-651) hashes user content via `str(msg.content)`. With
-multimodal content that would hash megabytes of base64 and is not canonical.
-Define a stable hashing rule, e.g. hash each attachment as
-`sha256(mime_type || raw_bytes)` and include those digests (not the base64) in
-the canonical request JSON that feeds `keccak256(requestHash ...)`. This keeps
-signatures meaningful and bounded while still committing to the exact attachment
-content.
-
-### 5. Limits & validation
-
-- Hard cap on total attachment bytes per request (post-decode).
-- Allowlist of accepted mime types per modality.
-- Reject `image_url` values that are remote `https` URLs on the private path if
-  we want to guarantee the enclave makes no outbound fetch for user content
-  (Phase 1 = base64 only). Decide explicitly.
-
-## `chat-api` changes
-
-- OHTTP path: **no change needed** to the relay itself — attachments ride inside
-  the encrypted payload it already forwards opaquely.
-- Regular `POST /api/v1/chat` path: stop calling `load_documents` /
-  `is_image_url` and stop injecting extracted text. Either (a) build native
-  content parts here too, or (b) deprecate attachment support on the non-private
-  path and route all attachments through OHTTP. Recommend (b) for a single code
-  path.
-- The presigned-URL / `attachments: string[]` machinery and `attachments.py`
-  become dead code for inference and can be removed once Phase 1 ships (R2 may
-  still be used for chat-history storage — that is a separate concern and should
-  be client-side-encrypted if kept).
-
-## `chat-app` changes
-
-- Replace "upload to R2 → store presigned URL → send URL in `attachments`" with:
-  read the file in the browser, base64-encode, and add a native `image_url` /
-  `file` content part to the outgoing (to-be-encrypted) message.
-- Enforce client-side size/type limits matching the enclave caps; surface a clear
-  error when a file exceeds them.
-- Drop the presigned-upload/download hooks from the send path.
-
-## Rollout
-
-1. Enclave: `convert_messages` multimodal support + capability flags + hashing +
-   limits (behind the existing OHTTP path). Ship and verify PCRs.
-2. `chat-app`: send native base64 content parts on the OHTTP path.
-3. Remove server-side parsing from `chat-api`; retire `attachments.py` and the
-   presigned-URL attachment flow.
-4. (Optional, later) Phase 2 encrypted-R2-blob for large files.
-
-## Open questions
-
-- ~~Pinned `langchain-*` versions: do they already support `file` (PDF) content
-  blocks?~~ **Resolved:** yes, all five providers — no dep bump / PCR change
-  needed (see Dependency check above).
-- Hard size cap value for inline attachments, and the OHTTP request size ceiling.
-- Keep or drop attachment support entirely on the non-private path?
-- Source of truth for per-model `supports_image` / `supports_pdf` flags — note
-  `langchain-*` ships `ModelProfile` data (e.g. `langchain_xai/data/_profiles`)
-  that may already encode some of this.
diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py
index b41f612..aa845d3 100644
--- a/tee_gateway/llm_backend.py
+++ b/tee_gateway/llm_backend.py
@@ -425,27 +425,27 @@ def _convert_content_part(part: Any) -> Optional[Dict[str, Any]]:
     return {"type": "text", "text": text} if text else None
 
 
-def _convert_user_content(content: Any) -> Any:
-    """Convert user-message content into a value accepted by ``HumanMessage``.
-
-    A list of OpenAI content parts becomes a list of LangChain standard content
-    blocks. When every part is text, it collapses back to a plain string so simple
-    requests stay simple (and to preserve prior behavior). Non-list content is
-    returned unchanged.
+def _normalize_user_content_parts(content: list) -> list:
+    """Pass OpenAI content parts through to LangChain mostly unchanged.
+
+    Text and image parts already convert correctly to every provider's native
+    API in their OpenAI form, so they are forwarded as-is. Only ``file`` /
+    ``input_file`` parts are rewritten into LangChain standard file blocks: the
+    raw OpenAI ``{"type": "file", "file": {...}}`` shape would otherwise reach
+    providers like Anthropic, which expect a ``document`` block and reject it.
+    Primitive (non-dict) parts are wrapped as text.
     """
-    if not isinstance(content, list):
-        return content
-
-    blocks: List[Dict[str, Any]] = []
+    normalized: List[Any] = []
     for part in content:
-        block = _convert_content_part(part)
-        if block is not None:
-            blocks.append(block)
-
-    if blocks and all(b["type"] == "text" for b in blocks):
-        return "".join(b["text"] for b in blocks)
-
-    return blocks
+        if isinstance(part, dict):
+            if part.get("type") in ("file", "input_file"):
+                block = _convert_content_part(part)
+                normalized.append(block if block is not None else part)
+            else:
+                normalized.append(part)
+        else:
+            normalized.append({"type": "text", "text": str(part)})
+    return normalized
 
 
 class AttachmentValidationError(ValueError):
@@ -559,10 +559,12 @@ def convert_messages(messages: list) -> List[Any]:
 
         elif role == "user":
             # content may be a string or a list of multimodal content parts
-            # (text / image / file); convert to native LangChain content blocks.
-            langchain_messages.append(
-                HumanMessage(content=_convert_user_content(content))
-            )
+            # (text / image / file). Pass parts through as-is (file parts are
+            # normalized to standard LangChain blocks) so the providers handle
+            # the native conversion.
+            if isinstance(content, list):
+                content = _normalize_user_content_parts(content)
+            langchain_messages.append(HumanMessage(content=content))
 
         elif role == "assistant":
             if tool_calls:
diff --git a/tee_gateway/test/test_tee_core.py b/tee_gateway/test/test_tee_core.py
index 1647e1f..aea5c25 100644
--- a/tee_gateway/test/test_tee_core.py
+++ b/tee_gateway/test/test_tee_core.py
@@ -577,8 +577,8 @@ def test_multi_turn_order_preserved(self):
         self.assertIsInstance(result[1], HumanMessage)
         self.assertIsInstance(result[2], AIMessage)
 
-    def test_user_content_text_only_parts_collapse_to_string(self):
-        """A list of text-only parts collapses back to a plain string."""
+    def test_user_content_text_parts_passthrough(self):
+        """A list of text parts is passed through unchanged for the provider."""
         result = convert_messages(
             [
                 {
@@ -591,7 +591,13 @@ def test_user_content_text_only_parts_collapse_to_string(self):
             ]
         )
         self.assertIsInstance(result[0], HumanMessage)
-        self.assertEqual(result[0].content, "Hello world")
+        self.assertEqual(
+            result[0].content,
+            [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        )
 
     def test_empty_user_content_list_is_preserved(self):
         """Empty multimodal content lists should not be coerced to empty strings."""
@@ -600,8 +606,8 @@ def test_empty_user_content_list_is_preserved(self):
         self.assertEqual(result[0].content, [])
 
     def test_user_content_with_base64_image(self):
-        """An image_url data URI becomes a standard image content block, so the
-        image survives conversion instead of being dropped."""
+        """An image_url part is passed through unchanged; each provider converts
+        it to its native image format at send time."""
         result = convert_messages(
             [
                 {
@@ -624,9 +630,8 @@ def test_user_content_with_base64_image(self):
         self.assertEqual(
             content[1],
             {
-                "type": "image",
-                "base64": "iVBORw0KGgoAAAANSUhEUg==",
-                "mime_type": "image/png",
+                "type": "image_url",
+                "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="},
             },
         )
 
@@ -663,7 +668,7 @@ def test_user_content_with_base64_pdf(self):
         )
 
     def test_user_content_image_remote_url(self):
-        """A non-data-URI image URL is passed through as a url image block."""
+        """A remote (non-data-URI) image URL part is passed through unchanged."""
         result = convert_messages(
             [
                 {
@@ -679,7 +684,12 @@ def test_user_content_image_remote_url(self):
         )
         self.assertEqual(
             result[0].content,
-            [{"type": "image", "url": "https://example.com/cat.png"}],
+            [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "https://example.com/cat.png"},
+                }
+            ],
         )
 
     def test_multimodal_blocks_convert_for_providers(self):