1. Error Recovery Patterns
Agents encounter errors frequently — tools fail, APIs return errors, files don’t exist. The key distinction for the CCA: errors should be fed back to the model as tool_result with is_error: true, not raised as exceptions in your application code.
import anthropic
import json
client = anthropic.Anthropic()
def execute_tool_with_recovery(tool_name: str, tool_input: dict) -> dict:
    """Run ``execute_tool`` and report the outcome as a ``tool_result`` dict.

    Nothing raised by the tool escapes this function: every exception is
    converted into a result whose ``is_error`` flag is ``True`` and whose
    ``content`` is a JSON string describing the failure.

    Args:
        tool_name: Name of the tool, passed through to ``execute_tool``.
        tool_input: Arguments for the tool, passed through unchanged.

    Returns:
        A dict with the keys ``type`` (always ``"tool_result"``), ``content``
        and ``is_error``. On success ``content`` is whatever ``execute_tool``
        returned. On failure it is a JSON string with ``error`` and
        ``message`` keys plus, for the three recognised failure kinds, a
        ``suggestion``.
    """

    def failure(details: dict) -> dict:
        # Failure details are serialised so "content" is a plain string.
        return {
            "type": "tool_result",
            "content": json.dumps(details),
            "is_error": True,
        }

    try:
        outcome = execute_tool(tool_name, tool_input)
    except FileNotFoundError as exc:
        # Recoverable: report that the file doesn't exist
        return failure({
            "error": "file_not_found",
            "message": str(exc),
            "suggestion": "Check the file path or list the directory contents first",
        })
    except PermissionError as exc:
        # Recoverable: a different approach may work
        return failure({
            "error": "permission_denied",
            "message": str(exc),
            "suggestion": "This file is read-only. Consider creating a new file instead.",
        })
    except TimeoutError:
        # Recoverable: the message text is fixed, not taken from the exception
        return failure({
            "error": "timeout",
            "message": "Command timed out after 30 seconds",
            "suggestion": "Try a simpler command or break into smaller steps",
        })
    except Exception as exc:
        # Anything else is still reported as a result rather than raised
        return failure({
            "error": "unknown",
            "message": f"Unexpected error: {type(exc).__name__}: {str(exc)}",
        })
    else:
        return {"type": "tool_result", "content": outcome, "is_error": False}
2. Escalation Policies
import json
from enum import Enum
class EscalationLevel(Enum):
    """How much human involvement an agent action requires.

    Members are declared from least to most restrictive.
    """

    AUTO = "auto"  # Agent handles fully
    NOTIFY = "notify"  # Agent handles, human notified
    APPROVE = "approve"  # Agent proposes, human must approve
    BLOCK = "block"  # Agent cannot proceed, human required


class EscalationPolicy:
    """Define when an agent should escalate to a human.

    ``evaluate`` applies four built-in checks (amount, destructive action
    type, confidence, retry count). Rules registered through ``add_rule``
    are stored in ``rules`` but are not consulted by ``evaluate``.
    """

    # Strictness ranking: when several checks match, the highest rank wins.
    _STRICTNESS = {
        EscalationLevel.AUTO: 0,
        EscalationLevel.NOTIFY: 1,
        EscalationLevel.APPROVE: 2,
        EscalationLevel.BLOCK: 3,
    }

    def __init__(self):
        # List of {"condition", "level", "reason"} dicts, in insertion order.
        self.rules = []

    def add_rule(self, condition: str, level: EscalationLevel, reason: str):
        """Record a rule description; it is appended to ``rules`` as a dict."""
        self.rules.append({"condition": condition, "level": level, "reason": reason})

    def evaluate(self, action: dict) -> EscalationLevel:
        """Return the strictest escalation level triggered by ``action``.

        Every check is applied and the most restrictive matching level is
        returned, so a BLOCK condition can never be masked by an earlier,
        milder match (for example low confidence combined with repeated
        failures).

        Args:
            action: Dict that may carry ``amount``, ``type``, ``confidence``
                and ``retry_count``. Missing keys fall back to values that
                trigger nothing.

        Returns:
            ``EscalationLevel.AUTO`` when no check matches, otherwise the
            strictest level among the matching checks.
        """
        triggered = [EscalationLevel.AUTO]

        # Amount-based escalation
        if action.get("amount", 0) > 500:
            triggered.append(EscalationLevel.APPROVE)
        # Destructive action escalation
        if action.get("type") in ("delete", "cancel", "terminate"):
            triggered.append(EscalationLevel.APPROVE)
        # Confidence-based escalation
        if action.get("confidence", 1.0) < 0.7:
            triggered.append(EscalationLevel.NOTIFY)
        # Repeated failures
        if action.get("retry_count", 0) >= 3:
            triggered.append(EscalationLevel.BLOCK)

        return max(triggered, key=self._STRICTNESS.__getitem__)
# Usage in agentic loop
policy = EscalationPolicy()


def handle_tool_call_with_escalation(tool_name: str, tool_input: dict) -> dict:
    """Check the escalation policy before executing a tool.

    The policy is evaluated on ``tool_input`` with ``"type"`` set to the
    tool name. If ``tool_input`` carries its own ``"type"`` key, that reading
    is evaluated as well and the stricter verdict is used, so neither the
    tool name nor an input-supplied type can hide the other from the
    destructive-action check.

    Args:
        tool_name: Name of the tool being requested.
        tool_input: Arguments for the tool; also supplies the policy inputs.

    Returns:
        Whatever ``execute_tool`` returns when the call is allowed, or a dict
        with a single ``"error"`` key when it is rejected or blocked.
    """
    # Least to most restrictive; used to pick the stricter of two verdicts.
    strictness = [
        EscalationLevel.AUTO,
        EscalationLevel.NOTIFY,
        EscalationLevel.APPROVE,
        EscalationLevel.BLOCK,
    ]

    # Later keys win in a dict display, so the tool name is placed last to
    # stop an input "type" key from silently replacing it.
    candidates = [{**tool_input, "type": tool_name}]
    if "type" in tool_input:
        candidates.append(dict(tool_input))
    level = max((policy.evaluate(c) for c in candidates), key=strictness.index)

    if level == EscalationLevel.AUTO:
        return execute_tool(tool_name, tool_input)
    elif level == EscalationLevel.NOTIFY:
        # Execute first, then tell a human what was done.
        result = execute_tool(tool_name, tool_input)
        notify_human(f"Agent executed {tool_name} with input {tool_input}")
        return result
    elif level == EscalationLevel.APPROVE:
        approval = request_human_approval(tool_name, tool_input)
        if approval:
            return execute_tool(tool_name, tool_input)
        return {"error": "Action rejected by human reviewer"}
    else:  # BLOCK
        return {"error": "Action blocked by policy. Human intervention required."}
E-Commerce Agent Reliability
An online marketplace’s agent handles 10,000 customer interactions daily. Their error strategy: automatic retry for API timeouts (fixes 80% of issues), fallback responses from a knowledge base when tools fail (catches 15%), and structured human escalation for the remaining 5%. Result: 99.2% resolution rate without human intervention.
3. Confidence-Based Routing
import anthropic
import json
client = anthropic.Anthropic()
def route_by_confidence(query: str, threshold: float = 0.8) -> dict:
    """Route to autonomous vs human-assisted based on self-reported confidence.

    The model is forced to call the ``assess_and_respond`` tool, whose input
    carries a confidence value and a response. The confidence is compared
    with ``threshold`` to choose the route.

    Args:
        query: User message sent to the model.
        threshold: Minimum confidence for the autonomous route. The same
            value is quoted in the tool schema so the model is asked for an
            uncertainty reason at the cut-off that is actually applied.

    Returns:
        ``{"route": "autonomous", "response": ...}`` when confidence is at
        least ``threshold``; otherwise a ``"human_review"`` dict with
        ``draft_response``, ``uncertainty`` and ``confidence``.

    Raises:
        StopIteration: If the response contains no ``tool_use`` block.
    """
    assess_tool = {
        "name": "assess_and_respond",
        "description": "Assess confidence and provide response",
        "input_schema": {
            "type": "object",
            "properties": {
                "confidence": {
                    "type": "number",
                    # Same 0-1 bounds as the per-field confidence schema
                    # used for invoice extraction in this file.
                    "minimum": 0,
                    "maximum": 1,
                    "description": "How confident you are in your response (0.0-1.0)",
                },
                "response": {"type": "string"},
                "uncertainty_reason": {
                    "type": "string",
                    # Quote the caller's threshold instead of a fixed 0.8.
                    "description": f"If confidence < {threshold}, explain what you're uncertain about",
                },
            },
            "required": ["confidence", "response"],
        },
    }

    # Ask Claude to assess its own confidence
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        tools=[assess_tool],
        tool_choice={"type": "tool", "name": "assess_and_respond"},
        messages=[{"role": "user", "content": query}],
    )
    tool_block = next(b for b in response.content if b.type == "tool_use")
    result = tool_block.input

    if result["confidence"] >= threshold:
        return {"route": "autonomous", "response": result["response"]}
    return {
        "route": "human_review",
        "draft_response": result["response"],
        # The reason is optional in the schema, so default to empty.
        "uncertainty": result.get("uncertainty_reason", ""),
        "confidence": result["confidence"],
    }
4. Approval Workflows
Key takeaways for the CCA: (1) tool errors should be returned to the model as is_error: true tool results, not thrown as application exceptions, (2) the model can often self-recover when given structured error information, (3) escalation policies define WHEN to involve humans (amount thresholds, destructive actions, low confidence), (4) hooks provide programmatic enforcement of escalation rules (not prompt-based).
5. Define Success & Build Evals (CCA 3.1)
How do you know if your agent is actually working? You need evaluation criteria — measurable definitions of success — and eval pipelines that test your system against them. Without evals, you’re flying blind: every prompt change might improve one case while breaking ten others.
Analogy: Evals are like unit tests for AI systems. Just as you wouldn’t deploy code without tests, you shouldn’t deploy prompts without evals. The difference: AI evals often use LLMs themselves as judges (since human evaluation doesn’t scale).
5.1 Eval Types: Human, LLM-Graded, and Code
import anthropic
import json
client = anthropic.Anthropic()
# THREE types of evaluation (CCA tests all three):
# 1. CODE-BASED EVAL — Deterministic, cheapest, fastest
# Use for: exact match, regex patterns, JSON schema validation, keyword presence
def eval_code_based(output: str, expected: dict) -> dict:
    """Evaluate output using deterministic code checks.

    Args:
        output: Text produced by the model; expected to be a JSON object.
        expected: Check configuration. ``required_fields`` lists keys that
            must be present; ``allowed_categories`` lists acceptable values
            for the ``category`` key when the output has one.

    Returns:
        If ``output`` is not valid JSON:
        ``{"pass": False, "results": {"valid_json": False}}``.
        If it is valid JSON but not an object:
        ``{"valid_json": True, "is_object": False, "pass": False}``.
        Otherwise a flat dict with one boolean per check performed plus
        ``"pass"``, which is true only when every check passed.
    """
    results = {}

    # Check JSON validity
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError:
        results["valid_json"] = False
        return {"pass": False, "results": results}
    results["valid_json"] = True

    # Field checks only make sense on an object. Scalars, strings and arrays
    # would otherwise raise TypeError or be mis-read by the "in" tests below.
    if not isinstance(parsed, dict):
        results["is_object"] = False
        results["pass"] = False
        return results

    # Check required fields exist
    required = expected.get("required_fields", [])
    results["has_required_fields"] = all(f in parsed for f in required)

    # Check enum constraints
    if "category" in parsed and "allowed_categories" in expected:
        results["valid_category"] = parsed["category"] in expected["allowed_categories"]

    results["pass"] = all(results.values())
    return results
# 2. LLM-GRADED EVAL — Flexible, handles nuance, moderate cost
# Use for: quality assessment, relevance scoring, tone evaluation
def _parse_judge_score(judge_text: str) -> "int | float | None":
    """Return the 1-5 ``score`` from the judge's JSON reply, or ``None``."""
    start, end = judge_text.find("{"), judge_text.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        verdict = json.loads(judge_text[start:end + 1])
    except json.JSONDecodeError:
        return None
    score = verdict.get("score") if isinstance(verdict, dict) else None
    # bool is a subclass of int; a true/false "score" is not a rating.
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        return None
    return score if 1 <= score <= 5 else None


def eval_llm_graded(output: str, criteria: str, pass_threshold: int = 4) -> dict:
    """Use Claude as a judge to evaluate output quality.

    The judge is asked for a JSON verdict so that a numeric score can be
    extracted and turned into an explicit pass/fail. Without a ``"pass"``
    key, a caller that reads ``result.get("pass", True)`` treats every
    LLM-graded case as passing.

    Args:
        output: Text to be judged.
        criteria: Description of what a good output looks like.
        pass_threshold: Lowest score (1-5) counted as a pass. The default of
            4 is a starting point; tune it against human-labelled examples.

    Returns:
        Dict with ``judge_response`` (the raw judge text), ``score`` (number
        from 1 to 5, or ``None`` when no valid score could be read) and
        ``pass`` (``True`` only when a score was read and meets the
        threshold).
    """
    verdict_format = (
        "Respond with only a JSON object of the form "
        '{"score": <integer 1-5>, "explanation": "<one or two sentences>"}.'
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=200,
        temperature=0,
        system="You are an evaluation judge. Score the output on the given criteria. Return JSON.",
        messages=[{
            "role": "user",
            "content": (
                f"Criteria: {criteria}\n\nOutput to evaluate:\n{output}\n\n"
                "Score 1-5 and explain. " + verdict_format
            ),
        }],
    )
    judge_text = response.content[0].text
    score = _parse_judge_score(judge_text)
    return {
        "judge_response": judge_text,
        "score": score,
        # An unreadable verdict is a failure, not a silent pass.
        "pass": score is not None and score >= pass_threshold,
    }
# 3. HUMAN EVAL — Gold standard, expensive, slow
# Use for: building golden datasets, validating LLM judges, edge cases
# Pattern: Sample 5-10% of outputs for human review; use results to calibrate LLM judges
# Golden dataset structure:
golden_dataset = [
    # Structured output: can be validated with deterministic code checks.
    {
        "input": "I was charged twice for my subscription",
        "expected_output": {"category": "billing", "priority": "high"},
        "eval_type": "code",
        "criteria": {
            "required_fields": ["category", "priority"],
            "allowed_categories": ["billing", "technical", "account"],
        },
    },
    # Free-form output: quality has to be judged by an LLM.
    {
        "input": "Can you explain how your API rate limits work?",
        "expected_output": "A clear explanation of rate limiting...",
        "eval_type": "llm",
        "criteria": "Response should be accurate, mention specific limits, and be under 200 words",
    },
]

case_count = len(golden_dataset)
print(f"Golden dataset: {case_count} test cases")
print("Mix of code-based (fast) and LLM-graded (nuanced) evals")
5.2 Regression Testing & Eval Coverage
import anthropic
import json
from datetime import datetime
client = anthropic.Anthropic()
def run_eval_suite(golden_dataset: list, system_prompt: str) -> dict:
    """Run every golden test case against ``system_prompt`` and tally results.

    Args:
        golden_dataset: Test cases, each a dict with ``input``, ``eval_type``
            (``"code"`` selects ``eval_code_based``; anything else selects
            ``eval_llm_graded``) and ``criteria``.
        system_prompt: System prompt under evaluation.

    Returns:
        Dict with ``pass`` and ``fail`` counts, ``errors`` (one entry per
        failing case holding its index, the model output and the eval
        result), ``timestamp`` (timezone-aware UTC, ISO 8601) and ``score``
        (pass fraction, or 0 for an empty dataset).

    Note:
        An eval result that has no ``"pass"`` key is counted as a pass.
    """
    # Imported here so the function does not depend on the file's import
    # block providing ``timezone``.
    from datetime import timezone

    results = {
        "pass": 0,
        "fail": 0,
        "errors": [],
        # datetime.utcnow() is deprecated and returns a naive value; use an
        # aware UTC timestamp so the offset is explicit in the ISO string.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    for i, test_case in enumerate(golden_dataset):
        # Run the agent/prompt against this test case
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=200,
            temperature=0,
            system=system_prompt,
            messages=[{"role": "user", "content": test_case["input"]}],
        )
        output = response.content[0].text

        # Evaluate based on type
        if test_case["eval_type"] == "code":
            eval_result = eval_code_based(output, test_case["criteria"])
        else:
            eval_result = eval_llm_graded(output, test_case["criteria"])

        if eval_result.get("pass", True):
            results["pass"] += 1
        else:
            results["fail"] += 1
            results["errors"].append({"case": i, "output": output, "eval": eval_result})

    results["score"] = results["pass"] / len(golden_dataset) if golden_dataset else 0
    return results
# Run eval and compare with baseline
# baseline_score = 0.85 # Previous version
# current = run_eval_suite(golden_dataset, new_system_prompt)
# if current["score"] < baseline_score:
# print(f"REGRESSION: score dropped from {baseline_score} to {current['score']}")
# print(f"Failed cases: {current['errors']}")
# Summary of the regression-testing pattern, one line per message.
print(
    "Pattern: Run evals before AND after every prompt change",
    "Track score over time — any decrease = regression",
    sep="\n",
)
6. Files API (CCA 5.3)
The Files API lets agents upload, list, download, and delete files attached to sessions. This is how agents process PDFs, images, spreadsheets, and other documents — critical for document processing workflows like legal summarization or invoice extraction.
6.1 File CRUD Operations
import anthropic
import json
import base64
client = anthropic.Anthropic()
# UPLOAD a file
# POST /v1/files
# file = client.files.upload(
# file=open("contract.pdf", "rb"),
# purpose="session_attachment", # or "batch_input", "eval_dataset"
# metadata={"customer_id": "cust_123", "document_type": "contract"}
# )
# print(f"Uploaded: {file.id}, Size: {file.size_bytes} bytes")
# LIST files
# GET /v1/files?purpose=session_attachment
# files = client.files.list(purpose="session_attachment")
# GET file metadata
# GET /v1/files/{file_id}
# file_info = client.files.get("file_abc123")
# DOWNLOAD file content
# GET /v1/files/{file_id}/content
# content = client.files.download("file_abc123")
# DELETE a file
# DELETE /v1/files/{file_id}
# client.files.delete("file_abc123")
# USE the uploaded file in a later request
# Keep the file ID in your application state and pass the file back through the
# API surface that supports documents/images for the workflow you are building.
# Summary of the file lifecycle and supported formats, one line per message.
print(
    "File lifecycle: upload → reference in a later request → agent processes → download results",
    "Supported: PDF, images (PNG/JPG), text, CSV, JSON, code files",
    sep="\n",
)
6.2 PDF & Vision Support
import anthropic
import base64
client = anthropic.Anthropic()
# PDF processing — Claude can read PDFs natively via the Messages API.
# The PDF bytes are base64-encoded and sent as a "document" content block,
# followed by a text block carrying the extraction instruction.
# Method 1: Inline PDF in message
with open("contract.pdf", "rb") as f:
    raw_pdf = f.read()
pdf_base64 = base64.standard_b64encode(raw_pdf).decode("utf-8")

pdf_block = {
    "type": "document",
    "source": {
        "type": "base64",
        "media_type": "application/pdf",
        "data": pdf_base64,
    },
}
instruction_block = {
    "type": "text",
    "text": "Extract: parties, effective date, termination clause, and payment terms.",
}

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=2000,
    messages=[{"role": "user", "content": [pdf_block, instruction_block]}],
)
# The text of the first content block of the reply is printed.
print(response.content[0].text)
# Method 2: Image/Vision (screenshots, diagrams, photos)
# with open("architecture-diagram.png", "rb") as f:
# img_base64 = base64.standard_b64encode(f.read()).decode("utf-8")
#
# response = client.messages.create(
# model="claude-sonnet-4-6",
# max_tokens=1000,
# messages=[{
# "role": "user",
# "content": [
# {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_base64}},
# {"type": "text", "text": "Describe this architecture diagram and identify potential bottlenecks."}
# ]
# }]
# )
# Summary of the content block types for PDFs and images, one line per message.
print(
    "PDF: use 'document' content block with base64 encoding",
    "Images: use 'image' content block — supports PNG, JPG, GIF, WebP",
    "Vision: Claude can read text in images, analyze diagrams, describe photos",
    sep="\n",
)
7. Human Review & Confidence Scoring (CCA 12.4)
Production extraction systems can’t blindly trust AI output — they need to know WHICH fields are reliable and which need human verification. Field-level confidence scoring lets you route only uncertain extractions to human review, reducing cost while maintaining accuracy.
import anthropic
import json
client = anthropic.Anthropic()
# Field-level confidence — each extracted field gets its own confidence score
# This enables selective human review (only review uncertain fields)
def _scored_field(value_type, source_description=None) -> dict:
    """Build the schema for one extracted field: value, confidence, source.

    All three keys are marked required because the review-routing code in
    this file reads each of them from every field.
    """
    source_schema = {"type": "string"}
    if source_description is not None:
        source_schema["description"] = source_description
    return {
        "type": "object",
        "properties": {
            "value": {"type": value_type},
            "confidence": {"type": "number", "minimum": 0, "maximum": 1},
            "source": source_schema,
        },
        "required": ["value", "confidence", "source"],
    }


extraction_with_confidence = {
    "name": "extract_invoice",
    "description": "Extract invoice fields with per-field confidence scores.",
    "input_schema": {
        "type": "object",
        "properties": {
            "vendor_name": _scored_field("string", "Where in document this was found"),
            "total_amount": _scored_field("number"),
            # The date may be absent from the document, hence nullable.
            "invoice_date": _scored_field(["string", "null"]),
        },
        "required": ["vendor_name", "total_amount", "invoice_date"],
    },
}
def route_for_review(extraction: dict, threshold: float = 0.85) -> dict:
    """Route low-confidence fields to human review.

    Only entries that are dicts containing a ``"confidence"`` key are treated
    as extracted fields; everything else in ``extraction`` is ignored.

    Args:
        extraction: Mapping of field name to a dict with ``confidence`` and,
            normally, ``value`` and ``source``.
        threshold: Fields with confidence below this value need review.

    Returns:
        Dict with ``auto_accepted`` (list of ``{"field", "value"}``),
        ``needs_review`` (list of ``{"field", "value", "confidence",
        "source"}``) and ``review_required`` (``True`` when anything needs
        review). A missing ``source`` is reported as ``""``. A field with no
        ``value`` is always sent to review, with ``value`` reported as
        ``None``.
    """
    needs_review = []
    auto_accepted = []

    for field, data in extraction.items():
        if not (isinstance(data, dict) and "confidence" in data):
            continue

        confidence = data["confidence"]
        # A field with nothing extracted cannot be accepted automatically,
        # however confident the score claims to be.
        if confidence < threshold or "value" not in data:
            needs_review.append({
                "field": field,
                "value": data.get("value"),
                "confidence": confidence,
                # "source" may be omitted; do not fail the whole routing.
                "source": data.get("source", ""),
            })
        else:
            auto_accepted.append({"field": field, "value": data["value"]})

    return {
        "auto_accepted": auto_accepted,
        "needs_review": needs_review,
        "review_required": len(needs_review) > 0,
    }
# Example: two fields clear the default threshold, one does not.
sample = {
    "vendor_name": {
        "value": "Acme Corp",
        "confidence": 0.98,
        "source": "Header, page 1",
    },
    "total_amount": {
        "value": 1250.00,
        "confidence": 0.92,
        "source": "Footer, page 2",
    },
    "invoice_date": {
        "value": "2024-03-15",
        "confidence": 0.65,
        "source": "Body text, unclear format",
    },
}

routing = route_for_review(sample)

# Report how many fields landed in each bucket.
print(f"Auto-accepted: {len(routing['auto_accepted'])} fields")
print(f"Needs review: {len(routing['needs_review'])} fields")

# List each field that needs review with its value, confidence and source.
for field in routing["needs_review"]:
    print(f" {field['field']}: '{field['value']}' (confidence: {field['confidence']}, source: {field['source']})")
Always include a source field alongside each extracted value. This creates a provenance trail — humans reviewing uncertain fields can quickly check the original document location instead of re-reading everything. The CCA calls this “claim-source mappings.”