Agent Daily
tutorial · intermediate

Speculative prompt caching
May 2025 • Reduce time-to-first-token by warming the cache speculatively while users formulate their queries.


Speculative prompt caching is a technique that reduces time-to-first-token (TTFT) by warming up Claude's cache while users are still formulating their queries. Instead of waiting until a user submits their question to load context into the cache, the system begins cache warming immediately when the user starts typing. This cookbook demonstrates the pattern using SQLite source code as example context, showing how to implement it with the Anthropic Python SDK using async operations and prompt caching controls.

Key Points

  • Speculative caching begins cache warming in the background while users type, eliminating the cache-loading delay from TTFT
  • Use `sample_one_token()` with `max_tokens=1` to warm the cache speculatively without generating unnecessary output
  • Apply `cache_control: {type: 'ephemeral'}` to context blocks to enable prompt caching on large documents
  • Reuse the exact same initial message structure between cache warming and final query to ensure cache hits
  • Include timestamps in cached content to prevent unintended cache sharing across different runs
  • Measure TTFT improvements by comparing standard caching (TTFT ~20.87s) vs speculative caching (significantly reduced)
  • Use `AsyncAnthropic` client with `messages.stream()` for non-blocking cache warming and response generation
  • Monitor cache statistics via `response.usage` attributes: `cache_read_input_tokens` and `cache_creation_input_tokens`
  • Simulate user typing delays with `asyncio.sleep()` to test realistic scenarios where cache warming completes before submission
  • Combine `asyncio.create_task()` for background cache warming with `await cache_task` to ensure completion before final request
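Taken together, the points above reduce to a small piece of asyncio choreography: start warming as a background task the moment the user begins typing, and await it before the real request. A minimal sketch of that timing logic, with the API call replaced by a stub so it runs standalone (the event log and delays are illustrative, not from the cookbook):

```python
import asyncio

events: list[str] = []

async def warm_cache() -> None:
    # Stand-in for sample_one_token(): a max_tokens=1 request that
    # loads the large context into the prompt cache.
    events.append("warming started")
    await asyncio.sleep(0.05)  # pretend the warming request takes 50 ms
    events.append("warming finished")

async def handle_user() -> None:
    # Kick off cache warming as soon as the user starts typing...
    cache_task = asyncio.create_task(warm_cache())
    await asyncio.sleep(0.1)   # ...while they spend 100 ms composing a question
    events.append("question submitted")
    await cache_task           # ensure the cache is warm before the real request
    events.append("final request sent with warm cache")

asyncio.run(handle_user())
print(events)
```

Because warming (50 ms here) finishes before the simulated typing (100 ms) ends, `await cache_task` returns immediately and the final request always hits a warm cache.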




Artifacts (4)

speculative_caching_setup.py
import asyncio
import copy
import datetime
import time
import httpx
from anthropic import AsyncAnthropic

# Configuration constants
MODEL = "claude-sonnet-4-6"
SQLITE_SOURCES = {
    "btree.h": "https://sqlite.org/src/raw/18e5e7b2124c23426a283523e5f31a4bff029131b795bb82391f9d2f3136fc50?at=btree.h",
    "btree.c": "https://sqlite.org/src/raw/63ca6b647342e8cef643863cd0962a542f133e1069460725ba4461dcda92b03c?at=btree.c",
}

DEFAULT_CLIENT_ARGS = {
    "system": "You are an expert systems programmer helping analyze database internals.",
    "max_tokens": 4096,
    "temperature": 0,
}
helper_functions.py
async def get_sqlite_sources() -> dict[str, str]:
    print("Downloading SQLite source files...")
    source_files = {}
    start_time = time.time()
    async with httpx.AsyncClient(timeout=30.0) as client:
        tasks = []
        async def download_file(filename: str, url: str) -> tuple[str, str]:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            print(f"Successfully downloaded {filename}")
            return filename, response.text
        
        for filename, url in SQLITE_SOURCES.items():
            tasks.append(download_file(filename, url))
        
        results = await asyncio.gather(*tasks)
        source_files = dict(results)
    
    duration = time.time() - start_time
    print(f"Downloaded {len(source_files)} files in {duration:.2f} seconds")
    return source_files

async def create_initial_message():
    sources = await get_sqlite_sources()
    initial_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": f"""Current time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Source to Analyze:
btree.h:
```c
{sources['btree.h']}
```
btree.c:
```c
{sources['btree.c']}
```""",
                "cache_control": {"type": "ephemeral"},
            }
        ],
    }
    return initial_message

async def sample_one_token(client, messages: list):
    """Send a single-token request to warm up the cache"""
    args = copy.deepcopy(DEFAULT_CLIENT_ARGS)
    args["max_tokens"] = 1
    await client.messages.create(
        messages=messages,
        model=MODEL,
        **args,
    )

def print_query_statistics(response, query_type: str) -> None:
    print(f"\n{query_type} query statistics:")
    print(f"\tInput tokens: {response.usage.input_tokens}")
    print(f"\tOutput tokens: {response.usage.output_tokens}")
    print(f"\tCache read input tokens: {getattr(response.usage, 'cache_read_input_tokens', '---')}")
    print(f"\tCache creation input tokens: {getattr(response.usage, 'cache_creation_input_tokens', '---')}")
standard_caching_demo.py
async def standard_prompt_caching_demo():
    client = AsyncAnthropic()
    
    # Prepare the large context
    initial_message = await create_initial_message()
    
    # Simulate user typing time
    print("User is typing their question...")
    await asyncio.sleep(3)  # Simulate 3 seconds of typing
    
    user_question = "What is the purpose of the BtShared structure?"
    print(f"User submitted: {user_question}")
    
    # Send the full request (context + question)
    full_message = copy.deepcopy(initial_message)
    full_message["content"].append({
        "type": "text",
        "text": f"Answer the user's question: {user_question}"
    })
    
    print("\nSending request to API...")
    start_time = time.time()
    first_token_time = None
    
    async with client.messages.stream(
        messages=[full_message],
        model=MODEL,
        **DEFAULT_CLIENT_ARGS,
    ) as stream:
        async for text in stream.text_stream:
            if first_token_time is None and text.strip():
                first_token_time = time.time() - start_time
                print(f"\n🕐 Time to first token: {first_token_time:.2f} seconds")
                break
        
        response = await stream.get_final_message()
    
    total_time = time.time() - start_time
    print(f"Total response time: {total_time:.2f} seconds")
    print_query_statistics(response, "Standard Caching")
    
    return first_token_time, total_time
speculative_caching_demo.py
async def speculative_prompt_caching_demo():
    client = AsyncAnthropic()
    
    # Prepare the large context
    initial_message = await create_initial_message()
    
    # Start speculative caching while user is typing
    print("User is typing their question...")
    print("🔥 Starting cache warming in background...")
    
    # Warm up cache while user types
    cache_task = asyncio.create_task(sample_one_token(client, [initial_message]))
    
    # Simulate user typing time
    await asyncio.sleep(3)  # Simulate 3 seconds of typing
    
    user_question = "What is the purpose of the BtShared structure?"
    print(f"User submitted: {user_question}")
    
    # Ensure cache warming is complete
    await cache_task
    print("✅ Cache warming completed!")
    
    # Prepare messages for cached query
    cached_message = copy.deepcopy(initial_message)
    cached_message["content"].append({
        "type": "text",
        "text": f"Answer the user's question: {user_question}"
    })
    
    print("\nSending request to API (with warm cache)...")
    start_time = time.time()
    first_token_time = None
    
    async with client.messages.stream(
        messages=[cached_message],
        model=MODEL,
        **DEFAULT_CLIENT_ARGS,
    ) as stream:
        async for text in stream.text_stream:
            if first_token_time is None and text.strip():
                first_token_time = time.time() - start_time
                print(f"\n🚀 Time to first token: {first_token_time:.2f} seconds")
                break
        
        response = await stream.get_final_message()
    
    total_time = time.time() - start_time
    print(f"Total response time: {total_time:.2f} seconds")
    print_query_statistics(response, "Speculative Caching")
    
    return first_token_time, total_time
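The cookbook's artifacts define the two demos but no entry point. A possible runner that executes both and reports the TTFT difference could look like this; `main()` and `summarize_speedup()` are hypothetical glue, not part of the original cookbook:

```python
import asyncio

def summarize_speedup(standard_ttft: float, speculative_ttft: float) -> str:
    """Hypothetical helper: render the TTFT comparison as a one-line summary."""
    saved = standard_ttft - speculative_ttft
    pct = 100 * saved / standard_ttft
    return (f"TTFT improved from {standard_ttft:.2f}s to "
            f"{speculative_ttft:.2f}s ({pct:.0f}% faster)")

async def main() -> None:
    # Run the standard demo first, then the speculative one, and compare TTFT.
    std_ttft, _ = await standard_prompt_caching_demo()
    spec_ttft, _ = await speculative_prompt_caching_demo()
    print(summarize_speedup(std_ttft, spec_ttft))

if __name__ == "__main__":
    asyncio.run(main())
```

Note that running the demos back to back means the second run may also benefit from the cache created by the first (within the ~5-minute ephemeral cache lifetime), so for a clean comparison vary the timestamp in the cached content between runs, as the helper functions already do.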