Tutorial — intermediate
Speculative prompt caching May 2025 • Responses Reduce time-to-first-token by warming cache speculatively while users formulate their queries.
cookbook
View original on cookbook. Speculative prompt caching is a technique that reduces time-to-first-token (TTFT) by warming up Claude's cache while users are still formulating their queries. Instead of waiting until a user submits their question to load context into the cache, the system begins cache warming immediately when the user starts typing. This cookbook demonstrates the pattern using SQLite source code as example context, showing how to implement it with the Anthropic Python SDK using async operations and prompt caching controls.
Key Points
- •Speculative caching begins cache warming in the background while users type, eliminating the cache-loading delay from TTFT
- •Use `sample_one_token()` with `max_tokens=1` to warm the cache speculatively without generating unnecessary output
- •Apply `cache_control: {type: 'ephemeral'}` to context blocks to enable prompt caching on large documents
- •Reuse the exact same initial message structure between cache warming and final query to ensure cache hits
- •Include timestamps in cached content to prevent unintended cache sharing across different runs
- •Measure TTFT improvements by comparing standard caching (TTFT ~20.87s) vs speculative caching (significantly reduced)
- •Use `AsyncAnthropic` client with `messages.stream()` for non-blocking cache warming and response generation
- •Monitor cache statistics via `response.usage` attributes: `cache_read_input_tokens` and `cache_creation_input_tokens`
- •Simulate user typing delays with `asyncio.sleep()` to test realistic scenarios where cache warming completes before submission
- •Combine `asyncio.create_task()` for background cache warming with `await cache_task` to ensure completion before final request
Found this useful? Add it to a playbook for a step-by-step implementation guide.
Workflow Diagram
Start Process
Step A
Step B
Step C
Complete
Concepts
Artifacts (4)
speculative_caching_setup.py (Python script)
import asyncio
import copy
import datetime
import time
import httpx
from anthropic import AsyncAnthropic
# Configuration constants

# Model used for both the cache-warming request and the final query; the
# model must match exactly for a prompt-cache hit.
MODEL = "claude-sonnet-4-6"

# Pinned raw-source URLs for the SQLite B-tree layer, used as a large,
# realistic context document for the caching demos.
SQLITE_SOURCES = {
    "btree.h": "https://sqlite.org/src/raw/18e5e7b2124c23426a283523e5f31a4bff029131b795bb82391f9d2f3136fc50?at=btree.h",
    "btree.c": "https://sqlite.org/src/raw/63ca6b647342e8cef643863cd0962a542f133e1069460725ba4461dcda92b03c?at=btree.c",
}

# Shared request arguments so every API call (warm-up and final) sends an
# identical prompt prefix; temperature 0 keeps outputs deterministic.
DEFAULT_CLIENT_ARGS = {
    "system": "You are an expert systems programmer helping analyze database internals.",
    "max_tokens": 4096,
    "temperature": 0,
}
async def get_sqlite_sources() -> dict[str, str]:
    """Download all SQLite source files concurrently.

    Returns:
        Mapping of filename -> file contents for each entry in SQLITE_SOURCES.

    Raises:
        httpx.HTTPStatusError: if any download returns a non-2xx status.
    """
    print("Downloading SQLite source files...")
    start_time = time.time()
    async with httpx.AsyncClient(timeout=30.0) as client:
        async def download_file(filename: str, url: str) -> tuple[str, str]:
            # The sqlite.org /raw/ endpoints may redirect to the content URL.
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            # Fix: report which file finished (was a garbled placeholder).
            print(f"Successfully downloaded {filename}")
            return filename, response.text

        # Fan out one task per file and download them all in parallel.
        results = await asyncio.gather(
            *(download_file(name, url) for name, url in SQLITE_SOURCES.items())
        )
    source_files = dict(results)
    duration = time.time() - start_time
    print(f"Downloaded {len(source_files)} files in {duration:.2f} seconds")
    return source_files
async def create_initial_message():
    """Build the cacheable user message holding the SQLite source context.

    The message contains one text block carrying a timestamp plus both
    source files, marked with ephemeral cache_control so the API caches it
    as a prompt prefix. The timestamp keeps separate runs from sharing a
    cache entry.
    """
    sources = await get_sqlite_sources()
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    context_text = (
        f"Current time: {timestamp}\n"
        "Source to Analyze:\n"
        "btree.h:\n"
        "```c\n"
        f"{sources['btree.h']}\n"
        "```\n"
        "btree.c:\n"
        "```c\n"
        f"{sources['btree.c']}\n"
        "```"
    )
    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": context_text,
                # Ephemeral caching lets the warm-up call prime this block.
                "cache_control": {"type": "ephemeral"},
            }
        ],
    }
async def sample_one_token(client, messages: list):
    """Send a single-token request to warm up the cache"""
    # Same arguments as the real query, but capped at one output token:
    # the response is discarded — the call exists only to populate the cache.
    warm_args = {**copy.deepcopy(DEFAULT_CLIENT_ARGS), "max_tokens": 1}
    await client.messages.create(messages=messages, model=MODEL, **warm_args)
def print_query_statistics(response, query_type: str) -> None:
    """Print token-usage statistics for an API response.

    Args:
        response: A Messages API response object exposing a `usage` attribute.
        query_type: Label printed before the stats (e.g. "Standard Caching").

    The cache counters may be absent on older SDK versions, so they fall
    back to '---' via getattr rather than raising AttributeError.
    """
    print(f"\n{query_type} query statistics:")
    print(f"\tInput tokens: {response.usage.input_tokens}")
    print(f"\tOutput tokens: {response.usage.output_tokens}")
    print(f"\tCache read input tokens: {getattr(response.usage, 'cache_read_input_tokens', '---')}")
    print(f"\tCache creation input tokens: {getattr(response.usage, 'cache_creation_input_tokens', '---')}")
async def standard_prompt_caching_demo():
    """Baseline demo: send context + question only after the user finishes typing.

    Nothing is cached before the request, so TTFT includes the full cost of
    ingesting the large SQLite context.

    Returns:
        Tuple of (first_token_time, total_time) in seconds; first_token_time
        is None if the stream produced no non-whitespace text.
    """
    client = AsyncAnthropic()
    # Prepare the large context (downloads the SQLite sources).
    initial_message = await create_initial_message()

    # Simulate user typing time — no API work happens during this window.
    print("User is typing their question...")
    await asyncio.sleep(3)  # Simulate 3 seconds of typing
    user_question = "What is the purpose of the BtShared structure?"
    print(f"User submitted: {user_question}")

    # Append the question to a copy so the context block (and its
    # cache_control marker) stays byte-identical to the speculative demo.
    full_message = copy.deepcopy(initial_message)
    full_message["content"].append({
        "type": "text",
        "text": f"Answer the user's question: {user_question}"
    })

    print("\nSending request to API...")
    start_time = time.time()
    first_token_time = None
    async with client.messages.stream(
        messages=[full_message],
        model=MODEL,
        **DEFAULT_CLIENT_ARGS,
    ) as stream:
        async for text in stream.text_stream:
            # Record TTFT at the first non-whitespace chunk, then stop reading.
            if first_token_time is None and text.strip():
                first_token_time = time.time() - start_time
                print(f"\n🕐 Time to first token: {first_token_time:.2f} seconds")
                break
        response = await stream.get_final_message()
    total_time = time.time() - start_time
    print(f"Total response time: {total_time:.2f} seconds")
    print_query_statistics(response, "Standard Caching")
    # Fix: strip the extraction artifact that was fused onto this return.
    return first_token_time, total_time
async def speculative_prompt_caching_demo():
    """Speculative demo: warm the cache in the background while the user types.

    The one-token warm-up request runs concurrently with the simulated typing
    delay, so by submission time the context prefix is already cached and
    TTFT covers only the short question.

    Returns:
        Tuple of (first_token_time, total_time) in seconds.
    """
    client = AsyncAnthropic()
    initial_message = await create_initial_message()

    print("User is typing their question...")
    print("🔥 Starting cache warming in background...")
    # Kick off the warm-up concurrently with the typing delay below.
    cache_task = asyncio.create_task(sample_one_token(client, [initial_message]))

    await asyncio.sleep(3)  # Simulate 3 seconds of typing
    user_question = "What is the purpose of the BtShared structure?"
    print(f"User submitted: {user_question}")

    # Block until warm-up finishes so the real request is guaranteed a hit.
    await cache_task
    print("✅ Cache warming completed!")

    # Same initial structure as the warm-up request — required for a cache hit.
    cached_message = copy.deepcopy(initial_message)
    cached_message["content"].append({
        "type": "text",
        "text": f"Answer the user's question: {user_question}"
    })

    print("\nSending request to API (with warm cache)...")
    start_time = time.time()
    first_token_time = None
    async with client.messages.stream(
        messages=[cached_message],
        model=MODEL,
        **DEFAULT_CLIENT_ARGS,
    ) as stream:
        async for chunk in stream.text_stream:
            if first_token_time is None and chunk.strip():
                first_token_time = time.time() - start_time
                print(f"\n🚀 Time to first token: {first_token_time:.2f} seconds")
                break
        response = await stream.get_final_message()
    total_time = time.time() - start_time
    print(f"Total response time: {total_time:.2f} seconds")
    print_query_statistics(response, "Speculative Caching")
    return first_token_time, total_time