tutorial · intermediate
Giving Claude a crop tool for better image analysis — give Claude a crop tool to zoom into image regions for detailed analysis of charts, documents, and diagrams.
cookbook
View original on cookbook. This cookbook demonstrates how to build a crop tool for Claude to analyze images with greater detail by zooming into specific regions. The tool uses normalized coordinates (0-1) to let Claude request cropped sections of charts, documents, and diagrams without needing to know pixel dimensions. An agentic loop handles tool calls iteratively until Claude provides a final answer, enabling precise analysis of small text, chart comparisons, and technical details.
Key Points
- Crop tools are most useful for charts/graphs (comparing close values), documents (reading small text), technical diagrams (following connections), and dense images with fine details
- Use normalized coordinates (0-1) where (0,0) is top-left and (1,1) is bottom-right to make the tool dimension-agnostic
- Define the crop tool with x1, y1, x2, y2 parameters in the Anthropic API tools schema for Claude to call
- Convert PIL images to base64 PNG format for transmission to Claude's API
- Implement an agentic loop that sends the image with the crop tool available, processes tool_use responses, executes crops, and continues until stop_reason is not 'tool_use'
- Validate crop coordinates to ensure 0 ≤ values ≤ 1 and x1 < x2, y1 < y2 before executing
- Return cropped results as both text confirmation and base64-encoded image for Claude to analyze iteratively
- Claude can autonomously decide when to crop specific regions (e.g., legend, axes, data points) to answer questions accurately
- The FigureQA dataset provides example charts for testing and demonstrating the crop tool's effectiveness
- This approach significantly improves Claude's ability to answer detailed questions about complex visual content by enabling focused analysis
Found this useful? Add it to a playbook for a step-by-step implementation guide.
Workflow Diagram
Start Process
Step A
Step B
Step C
Complete
Concepts
Artifacts (4)
crop_tool_definition (python · config)
# Tool schema passed to the Anthropic Messages API "tools" parameter.
# Claude calls "crop_image" with a bounding box given in normalized 0-1
# coordinates ((0,0) = top-left, (1,1) = bottom-right), so the model never
# needs to know the image's pixel dimensions.
CROP_TOOL = {
"name": "crop_image",
"description": "Crop an image by specifying a bounding box.",
"input_schema": {
"type": "object",
"properties": {
"x1": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "Left edge of bounding box as normalized 0-1 value"
},
"y1": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "Top edge of bounding box as normalized 0-1 value"
},
"x2": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "Right edge of bounding box as normalized 0-1 value"
},
"y2": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "Bottom edge of bounding box as normalized 0-1 value"
}
},
"required": ["x1", "y1", "x2", "y2"]
}
}crop_handler_functionpythonscript
def handle_crop(image: PILImage.Image, x1: float, y1: float, x2: float, y2: float) -> list:
"""Execute the crop and return the result for Claude.

Coordinates are normalized to 0-1 ((0,0) = top-left, (1,1) = bottom-right).
Returns a list of Anthropic content blocks: on success, a text confirmation
plus the cropped region as a base64 PNG image block; on invalid input, a
single text block describing the error (no exception is raised, so the
model can see and correct its mistake).
"""
# Validate: all four coordinates must lie within the normalized 0-1 range.
if not all(0 <= c <= 1 for c in [x1, y1, x2, y2]):
return [{"type": "text", "text": "Error: Coordinates must be between 0 and 1"}]
# Reject inverted or zero-area boxes before touching the image.
if x1 >= x2 or y1 >= y2:
return [{"type": "text", "text": "Error: Invalid bounding box (need x1 < x2 and y1 < y2)"}]
# Crop: scale normalized coordinates to pixel space (int() truncates).
w, h = image.size
cropped = image.crop((int(x1 * w), int(y1 * h), int(x2 * w), int(y2 * h)))
return [
{
"type": "text",
"text": f"Cropped to ({x1:.2f},{y1:.2f})-({x2:.2f},{y2:.2f}): {cropped.width}x{cropped.height}px"
},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": pil_to_base64(cropped)
}
}
]agentic_loop_functionpythonscript
def ask_with_crop_tool(image: PILImage.Image, question: str) -> str:
"""Ask Claude a question about an image, with the crop tool available.

Runs an agentic loop: send the question and full image with CROP_TOOL
available, execute any crop_image calls via handle_crop, feed the results
back as tool_result blocks, and repeat until stop_reason != "tool_use".

Relies on module-level `client` (Anthropic client), `MODEL`, CROP_TOOL,
handle_crop, and pil_to_base64.

NOTE(review): annotated -> str, but the final `return` carries no value,
so this actually returns None; the answer is only print()ed. Confirm
whether any caller uses the return value.
"""
# Initial user turn: question text, the full image, and a nudge to crop.
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": f"Answer the following question about this image.\n\nThe question is: {question}\n\n"},
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": pil_to_base64(image)}},
{"type": "text", "text": "\n\nUse your crop_image tool to examine specific regions including legends and axes."}
]
}
]
# Agentic loop: keep calling the API until Claude stops requesting crops.
while True:
response = client.messages.create(
model=MODEL,
max_tokens=1024,
tools=[CROP_TOOL],
messages=messages
)
# Log the assistant's text and any tool calls for visibility.
for block in response.content:
if hasattr(block, "text"):
print(f"[Assistant] {block.text}")
elif block.type == "tool_use":
print(f"[Tool] crop_image({block.input})")
# If Claude is done (no more tool calls), return.
if response.stop_reason != "tool_use":
return
# Echo the assistant turn back verbatim, then answer every tool_use
# block with a matching tool_result (paired via tool_use_id).
messages.append({"role": "assistant", "content": response.content})
tool_results = []
for block in response.content:
if block.type == "tool_use":
result = handle_crop(image, **block.input)
tool_results.append({
"type": "tool_result",
"tool_use_id": block.id,
"content": result
})
messages.append({"role": "user", "content": tool_results})image_conversion_helperpythonscript
def pil_to_base64(image: PILImage.Image) -> str:
    """Serialize *image* to a base64-encoded PNG string.

    Palette ("P") and alpha ("RGBA") images are flattened to RGB first
    (note: this drops transparency), then the PNG bytes are base64-encoded
    for embedding in an Anthropic API image block.
    """
    if image.mode in ("RGBA", "P"):
        image = image.convert("RGB")
    buf = BytesIO()
    image.save(buf, format="PNG")
    encoded = base64.standard_b64encode(buf.getvalue())
    return encoded.decode("utf-8")
def get_pil_image(img) -> PILImage.Image:
    """Coerce a dataset image record into a PIL Image.

    Accepts either an already-loaded PIL image (returned unchanged) or a
    dict carrying encoded image data under the "bytes" key.

    Raises:
        ValueError: for any other input type.
    """
    if isinstance(img, PILImage.Image):
        return img
    # Guard clause: anything that is not a {"bytes": ...} record is rejected.
    if not (isinstance(img, dict) and "bytes" in img):
        raise ValueError(f"Cannot convert {type(img)}")
    return PILImage.open(BytesIO(img["bytes"]))