Back to AI App Dev Series

Anthropic SDK Track Part 13: Escalation, Errors & Human Review

May 22, 2026 Wasil Zafar 35 min read

Design robust error recovery for agentic systems, implement human-in-the-loop escalation policies, build confidence-based routing, and create approval workflows that balance autonomy with safety.

CCA Domain 5 · 15% Tasks 5.1, 5.2

Table of Contents

  1. Error Recovery Patterns
  2. Escalation Policies
  3. Confidence-Based Routing
  4. Approval Workflows
  5. Define Success & Build Evals (CCA 3.1)
  6. Files API (CCA 5.3)
  7. Human Review & Confidence Scoring (CCA 12.4)
What You’ll Learn: Every production agent encounters errors — APIs fail, tools time out, users ask impossible questions. The difference between a good agent and a great one is how it handles these situations. This article teaches graceful degradation: when to retry, when to escalate to a human, and how to communicate uncertainty honestly. Think of it like a call center agent who knows when to put you on hold vs. transfer you to a supervisor.

1. Error Recovery Patterns

Agents encounter errors frequently — tools fail, APIs return errors, files don’t exist. The key distinction for the CCA: errors should be fed back to the model as tool_result with is_error: true, not raised as exceptions in your application code.

import anthropic
import json

client = anthropic.Anthropic()

def _tool_result(content, is_error: bool, tool_use_id: "str | None" = None) -> dict:
    """Build a ``tool_result`` block, attaching ``tool_use_id`` only when given."""
    block = {"type": "tool_result", "content": content, "is_error": is_error}
    if tool_use_id is not None:
        block["tool_use_id"] = tool_use_id
    return block


def _error_payload(code: str, message: str, suggestion: "str | None" = None) -> str:
    """Serialize a structured error (code, message, optional hint) as JSON."""
    payload = {"error": code, "message": message}
    if suggestion is not None:
        payload["suggestion"] = suggestion
    return json.dumps(payload)


def execute_tool_with_recovery(tool_name: str, tool_input: dict,
                               tool_use_id: "str | None" = None) -> dict:
    """Execute a tool call and report the outcome as a ``tool_result`` dict.

    Exceptions raised by ``execute_tool`` are never propagated: each one is
    converted into a JSON error payload with ``is_error`` set to ``True`` so
    the model can read the failure and choose a different approach.

    Args:
        tool_name: Name of the tool to run; passed through to ``execute_tool``.
        tool_input: Arguments for the tool; passed through to ``execute_tool``.
        tool_use_id: ID of the ``tool_use`` block this result answers. When
            provided it is included in the returned dict so the result can be
            sent back to the API as-is. When omitted, the returned dict has
            exactly the keys ``type``, ``content`` and ``is_error``.

    Returns:
        A dict with ``type == "tool_result"``. On success ``content`` is
        whatever ``execute_tool`` returned and ``is_error`` is ``False``; on
        failure ``content`` is a JSON string describing the error and
        ``is_error`` is ``True``.
    """
    try:
        result = execute_tool(tool_name, tool_input)
    except FileNotFoundError as e:
        # Recoverable: tell the model the file doesn't exist
        content = _error_payload(
            "file_not_found",
            str(e),
            "Check the file path or list the directory contents first",
        )
    except PermissionError as e:
        # Recoverable: model can try a different approach
        content = _error_payload(
            "permission_denied",
            str(e),
            "This file is read-only. Consider creating a new file instead.",
        )
    except TimeoutError:
        # Recoverable: model can retry with simpler command
        content = _error_payload(
            "timeout",
            "Command timed out after 30 seconds",
            "Try a simpler command or break into smaller steps",
        )
    except Exception as e:
        # Unknown error — still feed back to model (no recovery hint available)
        content = _error_payload(
            "unknown",
            f"Unexpected error: {type(e).__name__}: {str(e)}",
        )
    else:
        return _tool_result(result, False, tool_use_id)

    return _tool_result(content, True, tool_use_id)

2. Escalation Policies

import json
from enum import Enum

class EscalationLevel(Enum):
    """Degree of human involvement an action requires.

    Members are declared from least to most restrictive.
    """

    AUTO = "auto"       # Agent handles fully
    NOTIFY = "notify"   # Agent handles, human notified
    APPROVE = "approve" # Agent proposes, human must approve
    BLOCK = "block"     # Agent cannot proceed, human required


class EscalationPolicy:
    """Define when an agent should escalate to a human.

    The built-in checks in ``evaluate`` are driven by the thresholds given to
    the constructor. Rules registered with ``add_rule`` are stored in
    ``self.rules`` but are not consulted by ``evaluate``.
    """

    def __init__(self, *, amount_threshold: float = 500,
                 min_confidence: float = 0.7, max_retries: int = 3,
                 destructive_types=("delete", "cancel", "terminate")):
        """Create a policy.

        Args:
            amount_threshold: Actions whose ``amount`` is strictly greater
                than this need approval.
            min_confidence: Actions whose ``confidence`` is strictly below
                this trigger a notification.
            max_retries: Actions whose ``retry_count`` has reached this value
                are blocked.
            destructive_types: Action ``type`` values that need approval.
        """
        self.rules = []
        self.amount_threshold = amount_threshold
        self.min_confidence = min_confidence
        self.max_retries = max_retries
        self.destructive_types = tuple(destructive_types)

    def add_rule(self, condition: str, level: EscalationLevel, reason: str):
        """Record a descriptive rule (condition text, level and reason)."""
        self.rules.append({"condition": condition, "level": level, "reason": reason})

    def evaluate(self, action: dict) -> EscalationLevel:
        """Return the most restrictive escalation level the action triggers.

        Checks run from most to least restrictive so that a stricter outcome
        is never masked by a weaker one (for example, an action that has
        already failed repeatedly stays blocked even if it also has low
        confidence or a large amount).

        Args:
            action: Dict describing the action. Recognised optional keys are
                ``retry_count``, ``amount``, ``type`` and ``confidence``.

        Returns:
            The applicable ``EscalationLevel``; ``AUTO`` when nothing triggers.
        """
        # Repeated failures: the agent must stop and hand over to a human
        if action.get("retry_count", 0) >= self.max_retries:
            return EscalationLevel.BLOCK

        # Amount-based escalation
        if action.get("amount", 0) > self.amount_threshold:
            return EscalationLevel.APPROVE

        # Destructive action escalation
        if action.get("type") in self.destructive_types:
            return EscalationLevel.APPROVE

        # Confidence-based escalation
        if action.get("confidence", 1.0) < self.min_confidence:
            return EscalationLevel.NOTIFY

        return EscalationLevel.AUTO

# Usage in agentic loop
policy = EscalationPolicy()

def handle_tool_call_with_escalation(tool_name: str, tool_input: dict) -> dict:
    """Check the module-level escalation policy before executing a tool.

    Args:
        tool_name: Name of the tool the model wants to call. It is exposed to
            the policy as the action's ``type``.
        tool_input: Arguments for the tool; also exposed to the policy so it
            can inspect keys such as ``amount`` or ``confidence``.

    Returns:
        The result of ``execute_tool`` when the action is allowed (directly,
        after notifying a human, or after approval), otherwise a dict with an
        ``error`` message explaining that the action was rejected or blocked.
    """
    # The tool name is applied last so that a "type" key inside the
    # model-supplied input can never overwrite it and dodge the policy's
    # action-type check.
    action = {**tool_input, "type": tool_name}
    level = policy.evaluate(action)

    if level == EscalationLevel.AUTO:
        return execute_tool(tool_name, tool_input)

    if level == EscalationLevel.NOTIFY:
        # Execute first, then tell a human what was done
        result = execute_tool(tool_name, tool_input)
        notify_human(f"Agent executed {tool_name} with input {tool_input}")
        return result

    if level == EscalationLevel.APPROVE:
        # Nothing runs until the reviewer says yes
        if request_human_approval(tool_name, tool_input):
            return execute_tool(tool_name, tool_input)
        return {"error": "Action rejected by human reviewer"}

    # BLOCK
    return {"error": "Action blocked by policy. Human intervention required."}
Real-World Application

E-Commerce Agent Reliability

An online marketplace’s agent handles 10,000 customer interactions daily. Their error strategy: automatic retry for API timeouts (fixes 80% of issues), fallback responses from a knowledge base when tools fail (catches 15%), and structured human escalation for the remaining 5%. Result: 99.2% resolution rate without human intervention.

Error Handling · Reliability

3. Confidence-Based Routing

import anthropic
import json

client = anthropic.Anthropic()

def route_by_confidence(query: str, threshold: float = 0.8) -> dict:
    """Route a query to autonomous or human-assisted handling by confidence.

    The model is forced to call the ``assess_and_respond`` tool, which makes
    it return a self-reported confidence together with its answer. The
    confidence is then compared against ``threshold``.

    Args:
        query: The user message to answer.
        threshold: Minimum self-reported confidence for an autonomous answer.
            The same value is quoted in the tool schema so the model is told
            the cut-off that is actually applied.

    Returns:
        ``{"route": "autonomous", "response": ...}`` when confidence is at
        least ``threshold``; otherwise a ``"human_review"`` dict carrying the
        draft response, the stated uncertainty (empty string if the model
        gave none) and the confidence value.
    """
    assessment_tool = {
        "name": "assess_and_respond",
        "description": "Assess confidence and provide response",
        "input_schema": {
            "type": "object",
            "properties": {
                "confidence": {
                    "type": "number",
                    "description": "How confident you are in your response (0.0-1.0)"
                },
                "response": {"type": "string"},
                "uncertainty_reason": {
                    "type": "string",
                    # Follows the caller's threshold instead of a fixed 0.8
                    "description": f"If confidence < {threshold}, explain what you're uncertain about"
                }
            },
            "required": ["confidence", "response"]
        }
    }

    # Ask Claude to assess its own confidence
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        tools=[assessment_tool],
        tool_choice={"type": "tool", "name": "assess_and_respond"},
        messages=[{"role": "user", "content": query}]
    )

    tool_block = next(b for b in response.content if b.type == "tool_use")
    result = tool_block.input

    if result["confidence"] >= threshold:
        return {"route": "autonomous", "response": result["response"]}

    return {
        "route": "human_review",
        "draft_response": result["response"],
        "uncertainty": result.get("uncertainty_reason", ""),
        "confidence": result["confidence"]
    }

4. Approval Workflows

CCA Tasks 5.1 & 5.2: Key exam concepts: (1) errors should be returned to the model as is_error: true tool results, not thrown as application exceptions, (2) the model can often self-recover when given structured error information, (3) escalation policies define WHEN to involve humans (amount thresholds, destructive actions, low confidence), (4) hooks provide programmatic enforcement of escalation rules (not prompt-based).
Safety Principle: The CCA emphasizes that agents should default to asking rather than guessing when uncertainty is high. A “confident but wrong” autonomous action is worse than an escalation that takes longer but produces a correct result.
Try It Yourself: Build an agent with a 3-tier error handling strategy: (1) Retry with backoff for transient errors (network timeouts), (2) Graceful fallback for tool failures (e.g., if web_search fails, use cached data), (3) Human escalation for policy violations (refund > $500). Test each tier by simulating the failure condition.

5. Define Success & Build Evals (CCA 3.1)

How do you know if your agent is actually working? You need evaluation criteria — measurable definitions of success — and eval pipelines that test your system against them. Without evals, you’re flying blind: every prompt change might improve one case while breaking ten others.

Analogy: Evals are like unit tests for AI systems. Just as you wouldn’t deploy code without tests, you shouldn’t deploy prompts without evals. The difference: AI evals often use LLMs themselves as judges (since human evaluation doesn’t scale).

5.1 Eval Types: Human, LLM-Graded, and Code

import anthropic
import json

client = anthropic.Anthropic()

# THREE types of evaluation (CCA tests all three):

# 1. CODE-BASED EVAL — Deterministic, cheapest, fastest
# Use for: exact match, regex patterns, JSON schema validation, keyword presence
def eval_code_based(output: str, expected: dict) -> dict:
    """Evaluate output using deterministic code checks.

    Args:
        output: Raw model output, expected to be a JSON document.
        expected: Check configuration. Recognised optional keys are
            ``required_fields`` (list of keys that must be present) and
            ``allowed_categories`` (permitted values for ``category``).

    Returns:
        If ``output`` is not valid JSON:
        ``{"pass": False, "results": {"valid_json": False}}``.
        Otherwise a flat dict of individual check results (``valid_json``,
        ``has_required_fields`` and, when applicable, ``valid_category``)
        plus ``pass``, which is ``True`` only if every check succeeded.
    """
    results = {}

    # Check JSON validity
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError:
        results["valid_json"] = False
        return {"pass": False, "results": results}
    results["valid_json"] = True

    # Only a JSON object has fields. Scalars, null and arrays are valid JSON
    # but must not be probed with `in` / indexing as if they were dicts.
    is_object = isinstance(parsed, dict)

    # Check required fields exist
    required = expected.get("required_fields", [])
    results["has_required_fields"] = all(is_object and f in parsed for f in required)

    # Check enum constraints
    if is_object and "category" in parsed and "allowed_categories" in expected:
        results["valid_category"] = parsed["category"] in expected["allowed_categories"]

    results["pass"] = all(results.values())
    return results

# 2. LLM-GRADED EVAL — Flexible, handles nuance, moderate cost
# Use for: quality assessment, relevance scoring, tone evaluation
def eval_llm_graded(output: str, criteria: str) -> dict:
    """Ask Claude to act as a judge and grade ``output`` against ``criteria``.

    Args:
        output: The text being evaluated.
        criteria: Natural-language description of what a good output is.

    Returns:
        ``{"judge_response": <text of the judge's first content block>}``.
        The judge's reply is returned verbatim; it is not parsed, so the
        result contains no numeric score and no ``"pass"`` key.
    """
    judge_prompt = f"Criteria: {criteria}\n\nOutput to evaluate:\n{output}\n\nScore 1-5 and explain."
    judge_request = {"role": "user", "content": judge_prompt}

    verdict = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=200,
        temperature=0,  # keep grading as repeatable as possible
        system="You are an evaluation judge. Score the output on the given criteria. Return JSON.",
        messages=[judge_request],
    )

    first_block = verdict.content[0]
    return {"judge_response": first_block.text}

# 3. HUMAN EVAL — Gold standard, expensive, slow
# Use for: building golden datasets, validating LLM judges, edge cases
# Pattern: Sample 5-10% of outputs for human review; use results to calibrate LLM judges

# Golden dataset structure: one dict per test case, holding the user input,
# the expected output, which evaluator to use ("code" or "llm"), and the
# criteria handed to that evaluator.
golden_dataset = [
    # Structured classification — can validate with exact match
    dict(
        input="I was charged twice for my subscription",
        expected_output=dict(category="billing", priority="high"),
        eval_type="code",
        criteria=dict(
            required_fields=["category", "priority"],
            allowed_categories=["billing", "technical", "account"],
        ),
    ),
    # Free-text explanation — need LLM to judge quality
    dict(
        input="Can you explain how your API rate limits work?",
        expected_output="A clear explanation of rate limiting...",
        eval_type="llm",
        criteria="Response should be accurate, mention specific limits, and be under 200 words",
    ),
]

print(f"Golden dataset: {len(golden_dataset)} test cases")
print("Mix of code-based (fast) and LLM-graded (nuanced) evals")

5.2 Regression Testing & Eval Coverage

import anthropic
import json
from datetime import datetime

client = anthropic.Anthropic()

def run_eval_suite(golden_dataset: list, system_prompt: str) -> dict:
    """Run every golden test case against a system prompt and tally results.

    Args:
        golden_dataset: List of test-case dicts. Each must have ``input``,
            ``eval_type`` (``"code"`` selects ``eval_code_based``; anything
            else selects ``eval_llm_graded``) and ``criteria``.
        system_prompt: System prompt under evaluation.

    Returns:
        A dict with:
          * ``pass`` / ``fail`` — counts of passing and failing cases;
          * ``errors`` — one entry per failing case (index, output, eval);
          * ``ungraded`` — indices of cases whose evaluator returned no
            ``"pass"`` verdict. These are still counted as passes, so a
            non-empty list means ``score`` is partly unverified;
          * ``timestamp`` — timezone-aware UTC time in ISO 8601 format;
          * ``score`` — ``pass / len(golden_dataset)``, or 0 when empty.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    results = {
        "pass": 0,
        "fail": 0,
        "errors": [],
        "ungraded": [],
        # utcnow() is deprecated and returns a naive datetime; use an aware one
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    for i, test_case in enumerate(golden_dataset):
        # Run the agent/prompt against this test case
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=200,
            temperature=0,
            system=system_prompt,
            messages=[{"role": "user", "content": test_case["input"]}]
        )
        output = response.content[0].text

        # Evaluate based on type
        if test_case["eval_type"] == "code":
            eval_result = eval_code_based(output, test_case["criteria"])
        else:
            eval_result = eval_llm_graded(output, test_case["criteria"])

        # A result without a verdict keeps counting as a pass, but is made
        # visible instead of being silently folded into the score.
        if "pass" not in eval_result:
            results["ungraded"].append(i)

        if eval_result.get("pass", True):
            results["pass"] += 1
        else:
            results["fail"] += 1
            results["errors"].append({"case": i, "output": output, "eval": eval_result})

    results["score"] = results["pass"] / len(golden_dataset) if golden_dataset else 0
    return results

# Run eval and compare with baseline
# (Illustrative only — left commented out because it needs a stored baseline
# score and a `new_system_prompt` that this snippet does not define.)
# baseline_score = 0.85  # Previous version
# current = run_eval_suite(golden_dataset, new_system_prompt)
# if current["score"] < baseline_score:
#     print(f"REGRESSION: score dropped from {baseline_score} to {current['score']}")
#     print(f"Failed cases: {current['errors']}")

# Reminder lines printed when the snippet is run
print("Pattern: Run evals before AND after every prompt change")
print("Track score over time — any decrease = regression")
CCA Exam Pattern (3.1): Questions test: (1) Three eval types: code-based (deterministic), LLM-graded (flexible), human (gold standard). (2) Golden datasets are curated test cases with expected outputs. (3) Regression testing: score must not decrease after prompt changes. (4) Eval coverage: test all intent categories, edge cases, and failure modes.

6. Files API (CCA 5.3)

The Files API lets agents upload, list, download, and delete files attached to sessions. This is how agents process PDFs, images, spreadsheets, and other documents — critical for document processing workflows like legal summarization or invoice extraction.

6.1 File CRUD Operations

import anthropic
import json
import base64

client = anthropic.Anthropic()

# The calls below are commented out, so running this snippet only executes the
# two print statements at the end.
# NOTE(review): the method names and parameters shown here (client.files.*,
# `purpose=`, `metadata=`) should be confirmed against the current Anthropic
# SDK reference before use — the SDK appears to expose the Files API under
# `client.beta.files` (upload / list / retrieve_metadata / download / delete).

# UPLOAD a file
# POST /v1/files
# file = client.files.upload(
#     file=open("contract.pdf", "rb"),
#     purpose="session_attachment",   # or "batch_input", "eval_dataset"
#     metadata={"customer_id": "cust_123", "document_type": "contract"}
# )
# print(f"Uploaded: {file.id}, Size: {file.size_bytes} bytes")

# LIST files
# GET /v1/files?purpose=session_attachment
# files = client.files.list(purpose="session_attachment")

# GET file metadata
# GET /v1/files/{file_id}
# file_info = client.files.get("file_abc123")

# DOWNLOAD file content
# GET /v1/files/{file_id}/content
# content = client.files.download("file_abc123")

# DELETE a file
# DELETE /v1/files/{file_id}
# client.files.delete("file_abc123")

# USE the uploaded file in a later request
# Keep the file ID in your application state and pass the file back through the
# API surface that supports documents/images for the workflow you are building.

# Summary lines printed when the snippet is run
print("File lifecycle: upload → reference in a later request → agent processes → download results")
print("Supported: PDF, images (PNG/JPG), text, CSV, JSON, code files")

6.2 PDF & Vision Support

import anthropic
import base64

client = anthropic.Anthropic()

# PDF processing — Claude can read PDFs natively via the Messages API.
# The PDF bytes travel inside the request as a base64-encoded document block.

# Method 1: Inline PDF in message
with open("contract.pdf", "rb") as f:
    raw_pdf = f.read()
pdf_base64 = base64.standard_b64encode(raw_pdf).decode("utf-8")

# The document comes first, followed by the instruction that refers to it
pdf_block = {
    "type": "document",
    "source": {
        "type": "base64",
        "media_type": "application/pdf",
        "data": pdf_base64,
    },
}
instruction_block = {
    "type": "text",
    "text": "Extract: parties, effective date, termination clause, and payment terms.",
}

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=2000,
    messages=[{"role": "user", "content": [pdf_block, instruction_block]}],
)
print(response.content[0].text)

# Method 2: Image/Vision (screenshots, diagrams, photos)
# with open("architecture-diagram.png", "rb") as f:
#     img_base64 = base64.standard_b64encode(f.read()).decode("utf-8")
#
# response = client.messages.create(
#     model="claude-sonnet-4-6",
#     max_tokens=1000,
#     messages=[{
#         "role": "user",
#         "content": [
#             {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_base64}},
#             {"type": "text", "text": "Describe this architecture diagram and identify potential bottlenecks."}
#         ]
#     }]
# )

print("PDF: use 'document' content block with base64 encoding")
print("Images: use 'image' content block — supports PNG, JPG, GIF, WebP")
print("Vision: Claude can read text in images, analyze diagrams, describe photos")
CCA Exam Pattern (5.3): Questions test: (1) Files have purposes: session_attachment, batch_input, eval_dataset. (2) PDFs use the “document” content block type (not “text”). (3) Images use the “image” content block type. (4) Files attached to sessions are accessible throughout the conversation. (5) File metadata supports custom fields for organization.

7. Human Review & Confidence Scoring (CCA 12.4)

Production extraction systems can’t blindly trust AI output — they need to know WHICH fields are reliable and which need human verification. Field-level confidence scoring lets you route only uncertain extractions to human review, reducing cost while maintaining accuracy.

import anthropic
import json

client = anthropic.Anthropic()

# Field-level confidence — each extracted field gets its own confidence score
# This enables selective human review (only review uncertain fields)

def _scored_field(value_schema: dict, source_description: "str | None" = None) -> dict:
    """Build the schema for one extracted field: value, confidence and source.

    ``value`` and ``confidence`` are marked required so that a response cannot
    satisfy the schema while leaving out the score that review routing
    depends on. ``source`` stays optional.
    """
    source_schema = {"type": "string"}
    if source_description is not None:
        source_schema["description"] = source_description
    return {
        "type": "object",
        "properties": {
            "value": value_schema,
            "confidence": {"type": "number", "minimum": 0, "maximum": 1},
            "source": source_schema,
        },
        "required": ["value", "confidence"],
    }


# Tool definition: every invoice field is an object carrying its own
# confidence score (0-1) and the document location it was read from.
extraction_with_confidence = {
    "name": "extract_invoice",
    "description": "Extract invoice fields with per-field confidence scores.",
    "input_schema": {
        "type": "object",
        "properties": {
            "vendor_name": _scored_field(
                {"type": "string"},
                source_description="Where in document this was found",
            ),
            "total_amount": _scored_field({"type": "number"}),
            # The date value may be null, but the field object itself is required
            "invoice_date": _scored_field({"type": ["string", "null"]}),
        },
        "required": ["vendor_name", "total_amount", "invoice_date"]
    }
}

def route_for_review(extraction: dict, threshold: float = 0.85) -> dict:
    """Route low-confidence fields to human review.

    Args:
        extraction: Mapping of field name to a dict holding ``confidence``
            and, optionally, ``value`` and ``source``. Entries that are not
            dicts or that have no ``confidence`` key are ignored.
        threshold: Fields with confidence strictly below this need review.

    Returns:
        A dict with:
          * ``auto_accepted`` — ``{"field", "value"}`` entries at or above
            the threshold;
          * ``needs_review`` — ``{"field", "value", "confidence", "source"}``
            entries below it;
          * ``review_required`` — ``True`` if any field needs review.
        A missing ``value`` or ``source`` is reported as ``None``.
    """
    needs_review = []
    auto_accepted = []

    for field, data in extraction.items():
        if not (isinstance(data, dict) and "confidence" in data):
            continue

        # `value` and `source` may legitimately be absent from a response;
        # read them defensively so one incomplete field cannot abort routing.
        value = data.get("value")
        confidence = data["confidence"]

        if confidence < threshold:
            needs_review.append({
                "field": field,
                "value": value,
                "confidence": confidence,
                "source": data.get("source"),
            })
        else:
            auto_accepted.append({"field": field, "value": value})

    return {
        "auto_accepted": auto_accepted,
        "needs_review": needs_review,
        "review_required": bool(needs_review)
    }

# Example result: a hand-written extraction in the shape route_for_review
# expects (field name -> value / confidence / source).
sample = {
    "vendor_name": {"value": "Acme Corp", "confidence": 0.98, "source": "Header, page 1"},
    "total_amount": {"value": 1250.00, "confidence": 0.92, "source": "Footer, page 2"},
    "invoice_date": {"value": "2024-03-15", "confidence": 0.65, "source": "Body text, unclear format"}
}

# With the default threshold of 0.85, only invoice_date (0.65) falls below
# the cut-off; the other two fields are auto-accepted.
routing = route_for_review(sample)
print(f"Auto-accepted: {len(routing['auto_accepted'])} fields")
print(f"Needs review: {len(routing['needs_review'])} fields")
# List each field a human has to check, with where it was found
for field in routing["needs_review"]:
    print(f"  {field['field']}: '{field['value']}' (confidence: {field['confidence']}, source: {field['source']})")
Key Pattern: Include a source field alongside each extracted value. This creates a provenance trail — humans reviewing uncertain fields can quickly check the original document location instead of re-reading everything. The CCA calls this “claim-source mappings.”