1. Token Management
Claude models have a 200K token context window. In agentic loops, context grows rapidly — each tool call adds input/output tokens. Without management, you’ll hit the limit and the agent will fail mid-task.
import anthropic
client = anthropic.Anthropic()
# Token counting via the API: send one request, then read the token counts
# reported on the response's `usage` attribute.
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=1024,
messages=[{"role": "user", "content": "Hello, world!"}]
)
# Usage info returned with every response
print(f"Input tokens: {response.usage.input_tokens}")
print(f"Output tokens: {response.usage.output_tokens}")
# The total printed here is the sum of the two counters for this single call.
print(f"Total: {response.usage.input_tokens + response.usage.output_tokens}")
# For planning: estimate tokens before sending
# Rule of thumb: 1 token ≈ 4 characters (English text)
# Code is denser: 1 token ≈ 3 characters
def estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """Rough token estimate for planning purposes.

    Args:
        text: The text whose token count should be approximated.
        chars_per_token: Assumed average number of characters per token.
            The default of 4 matches the English-prose rule of thumb; pass 3
            for source code, which is denser.

    Returns:
        ``len(text)`` floor-divided by ``chars_per_token``. Short inputs can
        therefore estimate to 0.

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    return len(text) // chars_per_token
# Track cumulative usage in agentic loop
class TokenTracker:
    """Accumulate token usage across API calls and compare it to a budget.

    Each call to :meth:`record` adds that response's input and output token
    counts to running totals. :attr:`should_compact` turns True once less
    than 30% of the budget remains. Call :meth:`reset` after compacting the
    conversation; otherwise the totals only ever grow and
    :attr:`should_compact` stays True for the rest of the run.
    """

    def __init__(self, budget: int = 150000):
        """Create a tracker.

        Args:
            budget: Total number of tokens (input + output) allowed before
                the tracker reports that nothing remains.
        """
        self.budget = budget
        self.total_input = 0
        self.total_output = 0

    def record(self, usage):
        """Add one response's usage to the running totals.

        Args:
            usage: Any object exposing integer ``input_tokens`` and
                ``output_tokens`` attributes.
        """
        self.total_input += usage.input_tokens
        self.total_output += usage.output_tokens

    def reset(self) -> None:
        """Zero both counters, e.g. right after the history was compacted."""
        self.total_input = 0
        self.total_output = 0

    @property
    def used(self) -> int:
        """Tokens recorded so far (input plus output)."""
        return self.total_input + self.total_output

    @property
    def remaining(self) -> int:
        """Budget left; negative once the budget has been exceeded."""
        return self.budget - self.used

    @property
    def should_compact(self) -> bool:
        """Trigger compaction when 70% of budget is used."""
        return self.remaining < self.budget * 0.3
2. Summarization Strategies
import anthropic
import json
client = anthropic.Anthropic()
def _message_has_tool_result(message) -> bool:
    """Return True when a message's content list contains a tool_result block."""
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, list):
        return False
    for block in content:
        if isinstance(block, dict):
            block_type = block.get("type")
        else:
            block_type = getattr(block, "type", None)
        if block_type == "tool_result":
            return True
    return False


def summarize_conversation(messages: list, keep_last_n: int = 4) -> list:
    """Compact conversation history by summarizing older messages.

    The oldest messages are replaced by a two-message summary exchange
    (a user message carrying the summary and an assistant acknowledgement);
    the most recent messages are kept verbatim.

    Args:
        messages: Conversation history as a list of role/content dicts.
        keep_last_n: How many trailing messages to keep verbatim. More may be
            kept when needed so a tool_result is never separated from the
            assistant message that requested it.

    Returns:
        The original list, unchanged, when two or fewer messages would be
        summarized (the summary exchange itself costs two messages).
        Otherwise a new, shorter list.

    Raises:
        ValueError: If ``keep_last_n`` is negative.
    """
    if keep_last_n < 0:
        raise ValueError("keep_last_n must be non-negative")

    # Split by index rather than negative slicing: messages[:-0] is empty,
    # which would summarize nothing when keep_last_n == 0.
    split = len(messages) - keep_last_n

    # A tool_result has to directly follow the assistant turn holding its
    # tool_use. If the kept tail would start with one, move the boundary back
    # so the pair stays together.
    while 0 < split < len(messages) and _message_has_tool_result(messages[split]):
        split -= 1

    if split <= 2:
        return messages  # Too short to summarize

    # Split: old messages to summarize, recent to keep verbatim
    to_summarize = messages[:split]
    to_keep = messages[split:]

    # Generate summary of older conversation.
    # default=str keeps non-JSON content blocks (e.g. SDK objects) from
    # raising; the dump only feeds a prompt, so a string form is sufficient.
    summary_response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system="Summarize this conversation history concisely. Preserve: (1) key decisions made, (2) files modified, (3) current task state, (4) any constraints or requirements discovered.",
        messages=[{
            "role": "user",
            "content": f"Summarize this conversation:\n\n{json.dumps(to_summarize, indent=2, default=str)}"
        }]
    )
    summary_text = summary_response.content[0].text

    # Reconstruct: summary as first message + recent messages
    compacted = [
        {"role": "user", "content": f"[Conversation history summary]\n{summary_text}"},
        {"role": "assistant", "content": "Understood. I have the context from our previous conversation. Let me continue."}
    ] + to_keep
    return compacted
Legal Document Analysis at Scale
A law firm processes 500-page contracts using Claude. Their solution: chunk documents into sections, process each with a focused prompt, maintain a running summary of findings, and use the summary as context for cross-referencing. Result: 95% accuracy on clause identification across 500+ pages.
3. Sliding Window Pattern
import anthropic
client = anthropic.Anthropic()
class SlidingWindowAgent:
    """Agent that maintains a fixed-size context window.

    Messages are appended through :meth:`add_message`. Once the list grows
    past ``max_messages`` the middle of the history is dropped: the first
    messages (task setup) and the most recent ones are kept, and a short
    digest of the dropped part is appended to ``pinned_context``.
    """

    def __init__(self, max_messages: int = 20, system: str = ""):
        """Create an agent.

        Args:
            max_messages: Message count above which compaction is triggered.
            system: System prompt text; stored but not otherwise used here.
        """
        self.max_messages = max_messages
        self.system = system
        self.messages = []
        self.pinned_context = ""  # Critical info always included

    def add_message(self, role: str, content: str):
        """Append a message and compact when the window is exceeded."""
        self.messages.append({"role": role, "content": content})
        # Trim when exceeding window
        if len(self.messages) > self.max_messages:
            self._compact()

    def _compact(self):
        """Keep the first messages (task setup) + the last N messages.

        For windows of five or more this keeps the first 2 messages and the
        last ``max_messages - 4`` (two slots are left free for summary
        context). For smaller windows the counts are clamped so at least the
        newest message survives and the list actually shrinks.
        """
        keep_first = min(2, max(self.max_messages - 1, 0))
        keep_last = max(self.max_messages - keep_first - 2, 1)  # Leave room for summary

        # Use an explicit index: a negative slice with keep_last == 0 would
        # select the whole list instead of nothing.
        tail_start = len(self.messages) - keep_last
        if tail_start <= keep_first:
            return  # Nothing sits between the head and the tail

        old_middle = self.messages[keep_first:tail_start]

        # Extract key facts from discarded messages; skip empty digests so
        # pinned_context is not padded with blank entries.
        facts = self._extract_facts(old_middle)
        if facts:
            self.pinned_context += f"\n{facts}"

        # Rebuild: first messages + recent messages
        self.messages = self.messages[:keep_first] + self.messages[tail_start:]

    def _extract_facts(self, messages: list) -> str:
        """Extract critical facts from messages being discarded.

        Returns:
            A "Previous actions: ..." line built from up to five messages
            mentioning "modified" or "created" (each cut to 100 characters),
            or an empty string when no message matches.
        """
        # In production, use Claude to summarize
        decisions = []
        for msg in messages:
            content = msg.get("content", "")
            if not isinstance(content, str):
                continue  # Keyword matching only applies to plain-text content
            lowered = content.lower()
            if "modified" in lowered or "created" in lowered:
                decisions.append(content[:100])
        if not decisions:
            return ""
        return f"Previous actions: {'; '.join(decisions[:5])}"

    def get_effective_messages(self) -> list:
        """Messages with pinned context prepended."""
        if self.pinned_context:
            context_msg = {"role": "user", "content": f"[Context from earlier]: {self.pinned_context}"}
            return [context_msg] + self.messages
        return self.messages
4. Memory Architectures
4.1 Automatic Compaction in the Agent SDK
The Claude Agent SDK handles context window management automatically. When the context approaches its limit, the SDK compacts the conversation: it summarizes older history to free space, keeping recent exchanges and key decisions intact. You can also trigger compaction manually.
# Agent SDK Automatic Compaction
# Docs: https://code.claude.com/docs/en/agent-sdk/agent-loop#automatic-compaction
import asyncio
from claude_agent_sdk import (
query,
ClaudeAgentOptions,
SystemMessage,
ResultMessage,
)
async def agent_with_compaction_awareness():
    """The SDK compacts automatically — here's how to detect and customize it.

    Streams messages from ``query`` for a refactoring prompt, prints a notice
    whenever a system message with subtype ``compact_boundary`` arrives, and
    prints the total cost once the result message is received.
    """
    async for message in query(
        prompt="Refactor the entire auth module and update all tests",
        options=ClaudeAgentOptions(
            allowed_tools=["Read", "Edit", "Bash", "Grep", "Glob"],
        ),
    ):
        # Detect compaction events
        if isinstance(message, SystemMessage):
            if message.subtype == "compact_boundary":
                # SDK just summarized older context to free space
                print("[COMPACTION] Context was summarized to fit window")
                # Older messages are now condensed into a summary
                # Recent messages + key decisions preserved
        if isinstance(message, ResultMessage):
            # total_cost_usd may be None; formatting None with ":.4f" would
            # raise TypeError, so report the missing value explicitly.
            cost = message.total_cost_usd
            if cost is None:
                print("Done. Total cost: unavailable")
            else:
                print(f"Done. Total cost: ${cost:.4f}")
# --- Manual compaction: trigger with /compact ---
async def manual_compact():
    """Force compaction on demand by sending ``/compact`` as the prompt.

    Prints a confirmation when a system message with subtype
    ``compact_boundary`` is received.
    """
    resume_options = ClaudeAgentOptions(
        continue_conversation=True,  # Must be continuing a session
    )
    # Slash command triggers compaction
    async for event in query(prompt="/compact", options=resume_options):
        if not isinstance(event, SystemMessage):
            continue
        if event.subtype == "compact_boundary":
            print("Manual compaction complete")
# Entry point for this snippet: runs the automatic-compaction demo.
# manual_compact() is defined above but is not invoked in this snippet.
asyncio.run(agent_with_compaction_awareness())
What compaction preserves vs. loses:
| Preserved | May Be Lost |
|---|---|
| Recent messages (last few turns) | Exact wording from early messages |
| Key decisions and reasoning | Intermediate tool call details |
| Files modified (paths + what changed) | Full file contents read early on |
| Current task objective | Specific error messages from early turns |
| CLAUDE.md content (re-injected every request) | Verbose tool output from early exploration |
Tip: Put persistent rules in CLAUDE.md (loaded via settingSources), not in the initial prompt. CLAUDE.md content is re-injected on every request and survives compaction. Initial prompt instructions may be summarized away.
# Customize compaction in your CLAUDE.md:
# Summary instructions
When summarizing this conversation, always preserve:
- The current task objective and acceptance criteria
- File paths that have been read or modified
- Test results and error messages (exact text)
- Decisions made and the reasoning behind them
- Any constraints or requirements discovered during exploration
# The compactor reads this section and respects it during auto-summarization.
# This is NOT a magic header — the compactor matches on intent, not exact text.
4.2 Multi-Tier Memory Architecture
flowchart TD
A["Working Memory
(Current messages)"] --> B["Short-Term Memory
(Summarized history)"]
B --> C["Long-Term Memory
(Persistent store)"]
A -->|"Exceeds window"| D["Compaction"]
D -->|"Key facts"| B
D -->|"Decisions & patterns"| C
C -->|"Retrieve relevant"| A
5. Durable Memory Patterns
The current Agent SDK does not expose a managed memory_stores CRUD API. In practice, durable memory comes from three places: filesystem-backed memory Claude already loads (CLAUDE.md and auto memory), your application’s own database or vector store, and session transcripts mirrored via SessionStore when you need resumability across hosts.
5.1 Filesystem Memory: CLAUDE.md and Auto Memory
# Filesystem-backed memory in the Agent SDK
from claude_agent_sdk import ClaudeAgentOptions
# Default SDK behavior loads project/user settings and memory files from disk.
# This options object sets a working directory and names the "user" and
# "project" setting sources explicitly.
project_context = ClaudeAgentOptions(
cwd="/srv/support-agent",
setting_sources=["user", "project"],
)
# In multi-tenant environments, disable filesystem memory when sessions
# must not inherit context from other tenants or prior local runs.
# Here that is expressed as: a tenant-specific cwd, an empty setting_sources
# list, and the CLAUDE_CODE_DISABLE_AUTO_MEMORY environment variable set to "1".
isolated_context = ClaudeAgentOptions(
cwd="/srv/tenant-123",
setting_sources=[],
env={"CLAUDE_CODE_DISABLE_AUTO_MEMORY": "1"},
)
# Neither options object is passed to query() in this snippet; the prints
# below only summarize the two configurations for the reader.
print("CLAUDE.md: durable project/user instructions loaded from the filesystem")
print("Auto memory: local Claude Code memory files loaded unless explicitly disabled")
print("Use per-tenant cwd + setting_sources=[] for strict isolation")
5.2 Application-Managed Durable Memory
# Application-managed memory store
# Your app, not the Agent SDK, owns long-term memory records.
from typing import List, Dict
def retrieve_customer_facts(customer_id: str, query_text: str) -> List[Dict[str, str]]:
    """Return the facts known about a customer that are relevant to a query.

    This is a stub: both arguments are ignored and the same two sample
    records are returned on every call. Each record carries a ``content``
    string and the ``source`` system it came from.
    """
    # Replace with Postgres, Redis, Elasticsearch, pgvector, Pinecone, etc.
    preference_fact = {"content": "Customer prefers concise responses", "source": "crm"}
    plan_fact = {"content": "Customer is on Enterprise plan, 100 seats", "source": "billing"}
    return [preference_fact, plan_fact]
def save_customer_fact(customer_id: str, content: str, source: str) -> None:
    """Record a new durable fact for a customer in application storage.

    This is a stub: it only prints a confirmation line naming the customer,
    the fact, and its source.
    """
    # Example: INSERT INTO customer_memory (...)
    confirmation = f"Saved memory for {customer_id}: {content} ({source})"
    print(confirmation)
# Summary lines for the reader; they perform no storage work themselves.
print("Durable memory lives in your application store, not a Claude-managed API")
print("SessionStore mirrors transcripts only; it does not replace long-term memory")
5.3 Using Retrieved Memory in Agents
import asyncio
from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
def build_memory_context(facts):
    """Render facts as a newline-separated bullet list.

    Each fact must provide ``content`` and ``source`` keys and becomes one
    ``- <content> (source: <source>)`` line. An empty input yields an empty
    string.
    """
    bullet_lines = []
    for entry in facts:
        bullet_lines.append(f"- {entry['content']} (source: {entry['source']})")
    return "\n".join(bullet_lines)
async def support_agent(user_message: str, customer_id: str):
    """Inject retrieved facts into the current run, then persist new learnings in app code.

    Looks up the customer's stored facts, renders them into the prompt ahead
    of the user's request, and streams the agent run with read-only tools.
    On a successful result the answer is printed and a fact is saved through
    the application's own storage function.
    """
    known_facts = retrieve_customer_facts(customer_id, user_message)
    memory_context = build_memory_context(known_facts)
    enhanced_prompt = f"""Customer context:
{memory_context}
User request:
{user_message}
Use the known context when answering. If you learn a durable new fact, mention it explicitly so the host application can store it."""
    read_only_options = ClaudeAgentOptions(
        allowed_tools=["Read", "Glob", "Grep"],
    )
    async for event in query(prompt=enhanced_prompt, options=read_only_options):
        succeeded = isinstance(event, ResultMessage) and event.subtype == "success"
        if not succeeded:
            continue
        print(event.result)
        # Host application logic decides what to persist.
        # NOTE(review): assumed to run only on a successful result — confirm the intended nesting.
        save_customer_fact(customer_id, "Upgraded to 100 seats", "conversation")
# Demo entry point: runs support_agent once with a sample message and customer id.
asyncio.run(support_agent("I just upgraded to 100 seats", "user_123"))
Key takeaways: (1) The current Agent SDK does not expose a managed memory_stores API. (2) Durable memory usually comes from CLAUDE.md, auto memory, or your own application store. (3) SessionStore mirrors transcripts for resumption but is not a long-term knowledge database. (4) Multi-tenant apps should isolate cwd, control setting_sources, and disable auto memory when needed.
6. Advanced Context Patterns
Beyond basic sliding windows, production agents need sophisticated techniques for managing their context window: the /compact command for on-demand compression, scratchpad files for intermediate state, and awareness of the “lost-in-the-middle” problem where information in the center of long contexts gets less attention.
6.1 /compact Command
import anthropic
import json
client = anthropic.Anthropic()
# /compact — On-demand context compression in Claude Code
# When context gets too large, /compact summarizes the conversation
# while preserving critical details (tool results, decisions made)
# Simulating what /compact does internally:
def _starts_with_tool_result(message) -> bool:
    """Return True when a message's content list contains a tool_result block."""
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, list):
        return False
    for block in content:
        if isinstance(block, dict):
            block_type = block.get("type")
        else:
            block_type = getattr(block, "type", None)
        if block_type == "tool_result":
            return True
    return False


def compact_context(messages: list, max_tokens: int = 2000, preserve_last: int = 4) -> list:
    """Compress conversation history while preserving key information.

    Older messages are replaced by a two-message summary exchange; the most
    recent messages are kept verbatim.

    Args:
        messages: Conversation history as a list of role/content dicts.
        max_tokens: ``max_tokens`` value passed to the summarization request.
        preserve_last: How many trailing messages to keep verbatim. More may
            be kept when needed so a tool_result is never separated from the
            assistant message that requested it.

    Returns:
        The original list, unchanged, when there is nothing to summarize.
        Otherwise a new list: summary message, acknowledgement, then the
        preserved recent messages.

    Raises:
        ValueError: If ``preserve_last`` is negative.
    """
    # Identify what MUST be preserved:
    # - System prompt (never compressed; it is not part of `messages`)
    # - Last N messages (recent context)
    # - Tool results that haven't been acted on yet
    # - Explicit decisions and commitments made
    if preserve_last < 0:
        raise ValueError("preserve_last must be non-negative")

    # Split by index rather than negative slicing: messages[:-0] is empty,
    # which would make preserve_last == 0 summarize nothing.
    split = max(len(messages) - preserve_last, 0)

    # A tool_result has to directly follow the assistant turn holding its
    # tool_use. If the preserved tail would start with one, move the boundary
    # back so the pair stays together.
    while 0 < split < len(messages) and _starts_with_tool_result(messages[split]):
        split -= 1

    to_summarize = messages[:split]
    preserved = messages[split:]
    if not to_summarize:
        return messages  # Nothing to compact

    # Generate a structured summary of older messages.
    # default=str keeps non-JSON content blocks (e.g. SDK objects) from
    # raising; the dump only feeds a prompt, so a string form is sufficient.
    summary_prompt = f"""Summarize this conversation history into a concise briefing.
MUST include:
- Key decisions made
- Important facts learned (from tool results)
- Current task state and next steps
- Any constraints or rules established
Conversation to summarize:
{json.dumps(to_summarize, indent=2, default=str)}"""
    summary = client.messages.create(
        model="claude-haiku-4-5",  # Use cheap model for summarization
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": summary_prompt}]
    )

    # Replace old messages with summary + preserved recent messages
    compacted = [
        {"role": "user", "content": f"[Context Summary]\n{summary.content[0].text}"},
        {"role": "assistant", "content": "Understood. I have the context from our previous conversation."}
    ] + preserved
    return compacted
# Example: 50-message conversation compressed to ~6 messages
# (the 2-message summary exchange plus the last 4 original messages).
# Every sample message here uses the "user" role; running this calls
# compact_context, which issues a summarization request through `client`.
long_conversation = [{"role": "user", "content": f"Message {i}"} for i in range(50)]
compacted = compact_context(long_conversation)
print(f"Before: {len(long_conversation)} messages → After: {len(compacted)} messages")
6.2 Lost-in-the-Middle Problem
import anthropic
client = anthropic.Anthropic()
# The "lost-in-the-middle" problem:
# Claude pays MORE attention to content at the START and END of the context window
# Content buried in the MIDDLE gets less attention (lower recall)
# ❌ BAD: Important instruction buried in middle of long context
# (illustrative only — bad_messages is not sent anywhere in this snippet)
bad_messages = [
{"role": "user", "content": "[... 5000 words of context ...]"},
{"role": "user", "content": "IMPORTANT: Always cite sources."}, # BURIED
{"role": "user", "content": "[... 5000 more words ...]"},
{"role": "user", "content": "Now summarize everything above."}
]
# ✅ GOOD: Put critical instructions at START (system prompt) or END (latest message)
# (illustrative only — good_system and good_messages are not sent in this snippet)
good_system = "CRITICAL: Always cite sources with [Source: name, page] format."
good_messages = [
{"role": "user", "content": "[... 10000 words of context ...]"},
{"role": "user", "content": "Summarize the above. Remember: cite all sources with [Source: name, page] format."}
]
# ✅ BEST: Use structured "case facts" blocks that Claude recognizes as important
# The prompt below has three labelled sections: CASE FACTS, EVIDENCE, TASK.
structured_context = """## CASE FACTS (refer back to these throughout your response)
- Customer: TechCorp (Enterprise, 500 seats)
- Issue: Data export failing since 2024-03-10
- Previous attempts: restart, cache clear, API key rotation
- SLA deadline: 24 hours remaining
## EVIDENCE
[... supporting documentation ...]
## TASK
Diagnose the root cause and provide a fix. Reference CASE FACTS in your answer."""
# Only the structured prompt is actually sent, as a single user message.
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=1000,
messages=[{"role": "user", "content": structured_context}]
)
# Print just the first 200 characters of the first content block's text.
print(response.content[0].text[:200])
6.3 Scratchpad Files Pattern
import anthropic
import json
client = anthropic.Anthropic()
# Scratchpad pattern: Agent writes intermediate state to files
# instead of holding everything in the context window.
# This prevents context overflow during complex multi-step tasks.
# Example: Research agent investigating a complex question
# Instead of keeping all findings in context, it writes to a scratchpad:
# System prompt describing the working method and naming the two tools below.
scratchpad_system = """You are a research agent with a scratchpad file.
## Working Method
1. For each research step, write findings to the scratchpad (write_scratchpad tool)
2. Before synthesizing, read the full scratchpad (read_scratchpad tool)
3. The scratchpad persists across context compactions
4. Use sections: ## Findings, ## Open Questions, ## Conclusions
This lets you handle complex research without context overflow."""
# Tool definitions: one appends to a named section, the other reads everything back.
# NOTE(review): the section enum uses "questions" while the prompt names the
# section "## Open Questions" — confirm these are meant to map to each other.
scratchpad_tools = [
{
"name": "write_scratchpad",
"description": "Append findings to the persistent scratchpad file.",
"input_schema": {
"type": "object",
"properties": {
"section": {"type": "string", "enum": ["findings", "questions", "conclusions"]},
"content": {"type": "string"}
},
"required": ["section", "content"]
}
},
{
"name": "read_scratchpad",
"description": "Read the current scratchpad contents.",
"input_schema": {"type": "object", "properties": {}}
}
]
# The scratchpad lives outside the context window (as a file or external store)
# Even after /compact, the agent can read_scratchpad to recover its findings
# scratchpad_system and scratchpad_tools are only defined here; this snippet
# does not pass them to an API call or implement the tool handlers.
print("Scratchpad pattern: externalizes state to survive context compaction")
print("Key benefit: agent can handle 100+ step research without context overflow")