Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 242 additions & 0 deletions examples/prompt_caching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#!/usr/bin/env -S uv run python

"""Prompt caching patterns.

Three patterns, each with a "first call" that creates the cache and a
"second call" that hits it. Inspect usage.cache_creation_input_tokens and
usage.cache_read_input_tokens to confirm.

Cache breakpoints are set with cache_control={"type": "ephemeral"} on the
last block of the prefix you want cached. The cache survives ~5 minutes
of inactivity. Up to 4 breakpoints per request.

Run: ANTHROPIC_API_KEY=... ./examples/prompt_caching.py
"""

from __future__ import annotations

import anthropic

MODEL = "claude-sonnet-4-5-20250929"

client = anthropic.Anthropic()


def show_usage(label: str, usage: anthropic.types.Usage) -> None:
print(
f" {label:<14} input={usage.input_tokens:<5} "
f"cache_create={usage.cache_creation_input_tokens or 0:<5} "
f"cache_read={usage.cache_read_input_tokens or 0:<5} "
f"output={usage.output_tokens}"
)


# ---------------------------------------------------------------------------
# 1. Cache the system prompt
# ---------------------------------------------------------------------------
# Use when: a chatbot or agent reuses the same long instruction set across
# many user turns. Cache the system prompt once; every subsequent turn pays
# only the user message tokens.

LONG_SYSTEM_PROMPT = (
"You are a senior staff software engineer reviewing pull requests. "
"Apply these rules in order:\n"
"1. Correctness over style. Flag bugs first; suggest cleaner phrasing only after.\n"
"2. Prefer concrete suggestions with code over abstract advice.\n"
"3. Cite the file and line. Never review code you can't see.\n"
"4. If a change introduces a behavior shift, ask for a test.\n"
"5. Reject PRs that mix unrelated concerns; ask for a split.\n"
"6. Treat tests as first-class — a failing test is a real bug.\n"
"7. Don't restate the diff. Tell the author what they can't see.\n"
# Pad to ~1024 tokens minimum — caches must exceed the per-model floor.
+ ("Keep responses pragmatic, not exhaustive. " * 80)
)


def example_1_system_prompt() -> None:
print("\n[1] cache the system prompt")

first = client.messages.create(
model=MODEL,
max_tokens=256,
system=[
{
"type": "text",
"text": LONG_SYSTEM_PROMPT,
"cache_control": {"type": "ephemeral"},
}
],
messages=[{"role": "user", "content": "Review: `if (x = 5) { ... }`"}],
)
show_usage("first call", first.usage)

second = client.messages.create(
model=MODEL,
max_tokens=256,
system=[
{
"type": "text",
"text": LONG_SYSTEM_PROMPT,
"cache_control": {"type": "ephemeral"},
}
],
messages=[{"role": "user", "content": "Review: `for (let i = 0; i < arr.length; i++)`"}],
)
show_usage("second call", second.usage)


# ---------------------------------------------------------------------------
# 2. Cache system prompt + tool definitions
# ---------------------------------------------------------------------------
# Use when: an agent loop calls the same model with the same tools turn
# after turn. Tool definitions are usually larger than the system prompt;
# caching them is the bigger win. Set the breakpoint on the final tool.

TOOLS: list[anthropic.types.ToolParam] = [
{
"name": "search_orders",
"description": "Search the orders database by customer email, date range, or status.",
"input_schema": {
"type": "object",
"properties": {
"customer_email": {"type": "string"},
"since": {"type": "string", "format": "date"},
"until": {"type": "string", "format": "date"},
"status": {
"type": "string",
"enum": ["pending", "shipped", "delivered", "cancelled"],
},
},
},
},
{
"name": "get_order_detail",
"description": "Fetch the full detail of a single order by its reference number.",
"input_schema": {
"type": "object",
"properties": {"reference_number": {"type": "string"}},
"required": ["reference_number"],
},
},
{
"name": "issue_refund",
"description": "Issue a refund against an order. Requires a reason code.",
"input_schema": {
"type": "object",
"properties": {
"reference_number": {"type": "string"},
"amount_cents": {"type": "integer"},
"reason_code": {
"type": "string",
"enum": ["damaged", "wrong_item", "lost_in_transit", "customer_request"],
},
},
"required": ["reference_number", "amount_cents", "reason_code"],
"cache_control": {"type": "ephemeral"},
},
},
]


def example_2_system_plus_tools() -> None:
print("\n[2] cache system prompt + tool definitions")

first = client.messages.create(
model=MODEL,
max_tokens=256,
system=[
{
"type": "text",
"text": LONG_SYSTEM_PROMPT,
"cache_control": {"type": "ephemeral"},
}
],
tools=TOOLS,
messages=[{"role": "user", "content": "Find all pending orders for alice@example.com"}],
)
show_usage("first call", first.usage)

second = client.messages.create(
model=MODEL,
max_tokens=256,
system=[
{
"type": "text",
"text": LONG_SYSTEM_PROMPT,
"cache_control": {"type": "ephemeral"},
}
],
tools=TOOLS,
messages=[{"role": "user", "content": "Refund order #4421 — wrong item shipped, $1240"}],
)
show_usage("second call", second.usage)


# ---------------------------------------------------------------------------
# 3. Cache long static context before user turns
# ---------------------------------------------------------------------------
# Use when: Q&A over a fixed document, codebase, or knowledge base. Put the
# document in a user-message text block with cache_control on it; the
# follow-up question goes in a second block. Repeated questions on the same
# document hit the cache.

DOCUMENT = (
"RAVEN-001 — Network Topology Reference\n"
"===========================================\n\n"
"Sun (10.10.100.235) is the primary daily driver.\n"
"Moon (Tailscale 100.101.247.79) is the backup laptop.\n"
"Mercury (100.88.76.14) is the heavy compute workhorse.\n"
"Corvin (100.120.236.120) hosts the Raven AI services.\n"
"corvin-server (100.114.209.83) hosts *-api.raven-cargo.app.\n"
"Venus (100.107.135.17) is the Mac mini.\n"
# Pad to exceed the cache floor.
+ ("Each host advertises its services over Tailscale and the local LAN. " * 200)
)


def example_3_long_context() -> None:
print("\n[3] cache a long static document")

first = client.messages.create(
model=MODEL,
max_tokens=256,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": DOCUMENT,
"cache_control": {"type": "ephemeral"},
},
{"type": "text", "text": "Which host is the daily driver?"},
],
}
],
)
show_usage("first call", first.usage)

second = client.messages.create(
model=MODEL,
max_tokens=256,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": DOCUMENT,
"cache_control": {"type": "ephemeral"},
},
{"type": "text", "text": "What hosts the Raven APIs?"},
],
}
],
)
show_usage("second call", second.usage)


if __name__ == "__main__":
example_1_system_prompt()
example_2_system_plus_tools()
example_3_long_context()