From 376a92883d5d707a9bf45f2962ad99701160b5e6 Mon Sep 17 00:00:00 2001
From: Hermetic Ormus <hermeticormus@gmail.com>
Date: Thu, 30 Apr 2026 10:27:47 -0500
Subject: [PATCH] docs(examples): add prompt-caching example covering 3
 patterns

The examples/ directory has agents, batch, streaming, structured
outputs, thinking, and MCP tools but no prompt-caching example. The
feature is heavily documented on platform.claude.com but reading the
SDK examples wouldn't tell you it exists. This adds one runnable file
covering the three patterns most users hit:

1. Cache the system prompt (chatbots, long instructions).
2. Cache system + tool definitions (agent loops).
3. Cache long static context (RAG / Q&A over a fixed document).

Each section has a first call that creates the cache and a second call
that hits it. show_usage() prints input / cache_create / cache_read /
output token counts so the operator can verify the cache hit.
---
 examples/prompt_caching.py | 242 +++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100755 examples/prompt_caching.py

diff --git a/examples/prompt_caching.py b/examples/prompt_caching.py
new file mode 100755
index 000000000..623a26c17
--- /dev/null
+++ b/examples/prompt_caching.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env -S uv run python
+
+"""Prompt caching patterns.
+
+Three patterns, each with a "first call" that creates the cache and a
+"second call" that hits it. Inspect usage.cache_creation_input_tokens and
+usage.cache_read_input_tokens to confirm.
+
+Cache breakpoints are set with cache_control={"type": "ephemeral"} on the
+last block of the prefix you want cached. The cache survives ~5 minutes
+of inactivity. Up to 4 breakpoints per request.
+
+Run: ANTHROPIC_API_KEY=... ./examples/prompt_caching.py
+"""
+
+from __future__ import annotations
+
+import anthropic
+
+MODEL = "claude-sonnet-4-5-20250929"  # model id passed to every request below
+
+client = anthropic.Anthropic()  # no key passed here; the Run line above sets ANTHROPIC_API_KEY
+
+
+def show_usage(label: str, usage: anthropic.types.Usage) -> None:
+    """Print one aligned row of token counters for a single response."""
+    created = usage.cache_creation_input_tokens or 0  # falsy/None prints as 0
+    read = usage.cache_read_input_tokens or 0  # falsy/None prints as 0
+    row = f"  {label:<14} input={usage.input_tokens:<5} cache_create={created:<5} "
+    row += f"cache_read={read:<5} output={usage.output_tokens}"
+    print(row)
+
+
+# ---------------------------------------------------------------------------
+# 1. Cache the system prompt
+# ---------------------------------------------------------------------------
+# Use when: a chatbot or agent reuses the same long instruction set across
+# many user turns. Cache the system prompt once; later turns read it back at
+# the discounted cache-read rate and pay full price only for the new message.
+
+LONG_SYSTEM_PROMPT = (  # review rules plus filler so the prompt is long enough to cache
+    "You are a senior staff software engineer reviewing pull requests. "
+    "Apply these rules in order:\n"
+    "1. Correctness over style. Flag bugs first; suggest cleaner phrasing only after.\n"
+    "2. Prefer concrete suggestions with code over abstract advice.\n"
+    "3. Cite the file and line. Never review code you can't see.\n"
+    "4. If a change introduces a behavior shift, ask for a test.\n"
+    "5. Reject PRs that mix unrelated concerns; ask for a split.\n"
+    "6. Treat tests as first-class — a failing test is a real bug.\n"
+    "7. Don't restate the diff. Tell the author what they can't see.\n"
+    # Pad well past the ~1024-token cache floor; prompts below it are not cached.
+    + ("Keep responses pragmatic, not exhaustive. " * 200)
+)
+
+
+def example_1_system_prompt() -> None:
+    """Ask two review questions behind one byte-identical system prompt.
+
+    The system block carries the cache breakpoint on both requests; a usage
+    row is printed after each call.
+    """
+    print("\n[1] cache the system prompt")
+
+    # Built once so both requests send exactly the same prefix.
+    cached_system: list[anthropic.types.TextBlockParam] = [
+        {
+            "type": "text",
+            "text": LONG_SYSTEM_PROMPT,
+            "cache_control": {"type": "ephemeral"},
+        }
+    ]
+    # Only the user turn differs between the two requests.
+    turns = (
+        ("first call", "Review: `if (x = 5) { ... }`"),
+        ("second call", "Review: `for (let i = 0; i < arr.length; i++)`"),
+    )
+
+    for tag, question in turns:
+        reply = client.messages.create(
+            model=MODEL,
+            max_tokens=256,
+            system=cached_system,
+            messages=[{"role": "user", "content": question}],
+        )
+        show_usage(tag, reply.usage)
+
+
+# ---------------------------------------------------------------------------
+# 2. Cache system prompt + tool definitions
+# ---------------------------------------------------------------------------
+# Use when: an agent loop calls the same model with the same tools turn
+# after turn. Tool definitions are usually larger than the system prompt;
+# caching them is the bigger win. Set the breakpoint on the final tool.
+
+TOOLS: list[anthropic.types.ToolParam] = [  # tool set passed as tools= in example 2
+    {
+        "name": "search_orders",
+        "description": "Search the orders database by customer email, date range, or status.",
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "customer_email": {"type": "string"},
+                "since": {"type": "string", "format": "date"},
+                "until": {"type": "string", "format": "date"},
+                "status": {
+                    "type": "string",
+                    "enum": ["pending", "shipped", "delivered", "cancelled"],
+                },
+            },
+        },
+    },
+    {
+        "name": "get_order_detail",
+        "description": "Fetch the full detail of a single order by its reference number.",
+        "input_schema": {
+            "type": "object",
+            "properties": {"reference_number": {"type": "string"}},
+            "required": ["reference_number"],
+        },
+    },
+    {
+        "name": "issue_refund",
+        "description": "Issue a refund against an order. Requires a reason code.",
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "reference_number": {"type": "string"},
+                "amount_cents": {"type": "integer"},
+                "reason_code": {
+                    "type": "string",
+                    "enum": ["damaged", "wrong_item", "lost_in_transit", "customer_request"],
+                },
+            },
+            "required": ["reference_number", "amount_cents", "reason_code"],
+        },
+        "cache_control": {"type": "ephemeral"},  # tool-level key, not part of the JSON schema
+    },
+]
+
+
+def example_2_system_plus_tools() -> None:
+    """Run two agent-style requests that share the system prompt and TOOLS.
+
+    Both calls pass the same system block and the same tool list; only the
+    user message changes between them. A usage row is printed after each
+    call.
+    """
+    print("\n[2] cache system prompt + tool definitions")
+
+    # One system block reused verbatim by both requests.
+    shared_system: list[anthropic.types.TextBlockParam] = [
+        {
+            "type": "text",
+            "text": LONG_SYSTEM_PROMPT,
+            "cache_control": {"type": "ephemeral"},
+        }
+    ]
+
+    def ask(tag: str, request: str) -> None:
+        # System prompt and tools never vary here; only `request` does.
+        reply = client.messages.create(
+            model=MODEL,
+            max_tokens=256,
+            system=shared_system,
+            tools=TOOLS,
+            messages=[{"role": "user", "content": request}],
+        )
+        show_usage(tag, reply.usage)
+
+    # Two unrelated user turns sent with the same system prompt and tools.
+    ask("first call", "Find all pending orders for alice@example.com")
+    ask("second call", "Refund order #4421 — wrong item shipped, $1240")
+
+
+# ---------------------------------------------------------------------------
+# 3. Cache long static context before user turns
+# ---------------------------------------------------------------------------
+# Use when: Q&A over a fixed document, codebase, or knowledge base. Put the
+# document in a user-message text block with cache_control on it; the
+# follow-up question goes in a second block. Repeated questions on the same
+# document hit the cache.
+
+DOCUMENT = (  # fictional topology; addresses are from the RFC 5737 documentation ranges
+    "RAVEN-001 — Network Topology Reference\n"
+    "===========================================\n\n"
+    "Sun (192.0.2.10) is the primary daily driver.\n"
+    "Moon (Tailscale 198.51.100.11) is the backup laptop.\n"
+    "Mercury (198.51.100.12) is the heavy compute workhorse.\n"
+    "Corvin (198.51.100.13) hosts the Raven AI services.\n"
+    "corvin-server (198.51.100.14) hosts *-api.raven.example.\n"
+    "Venus (198.51.100.15) is the Mac mini.\n"
+    # Pad to exceed the cache floor.
+    + ("Each host advertises its services over Tailscale and the local LAN. " * 200)
+)
+
+
+def example_3_long_context() -> None:
+    """Ask two questions about DOCUMENT, sent as the leading text block.
+
+    Each request is one user message holding two text blocks: the document,
+    which carries the cache breakpoint, followed by the question. A usage
+    row is printed after each call.
+    """
+    print("\n[3] cache a long static document")
+
+    # The same document block, breakpoint included, goes out on both requests.
+    document_block: anthropic.types.TextBlockParam = {
+        "type": "text",
+        "text": DOCUMENT,
+        "cache_control": {"type": "ephemeral"},
+    }
+
+    # Usage-row label -> question; dicts iterate in insertion order.
+    questions = {
+        "first call": "Which host is the daily driver?",
+        "second call": "What hosts the Raven APIs?",
+    }
+
+    for tag, question in questions.items():
+        # No cache_control here: the question follows the breakpoint.
+        question_block: anthropic.types.TextBlockParam = {
+            "type": "text",
+            "text": question,
+        }
+        # One user turn: the document first, then the question.
+        reply = client.messages.create(
+            model=MODEL,
+            max_tokens=256,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [document_block, question_block],
+                }
+            ],
+        )
+        show_usage(tag, reply.usage)
+
+
+if __name__ == "__main__":
+    # Run the three demos in order; each prints its own header and usage rows.
+    for demo in (example_1_system_prompt, example_2_system_plus_tools, example_3_long_context):
+        demo()