1. Context Caching
Context caching allows you to reuse large prefixes (system instructions, documents, few-shot examples) across multiple requests without re-processing them each time. This dramatically reduces both cost and latency for applications with shared context.
1.1 Implicit Caching (Automatic)
Starting with Gemini 2.5+, implicit caching is enabled by default. When consecutive requests share the same prefix (system instruction + initial content), the API automatically caches and reuses processed tokens:
Check response.usage_metadata.cached_content_token_count to verify cache hits.
from google import genai
from google.genai import types
client = genai.Client()

# Both requests send the same system instruction prefix, so the second
# request is expected to hit the implicit cache.
system_instruction = "You are a legal expert specializing in EU GDPR compliance. " * 100  # Large prefix
shared_config = types.GenerateContentConfig(system_instruction=system_instruction)

response1 = client.models.generate_content(
    model="gemini-3.5-flash",
    contents="What are the key principles of GDPR?",
    config=shared_config,
)
print(f"Request 1 - Cached tokens: {response1.usage_metadata.cached_content_token_count}")

response2 = client.models.generate_content(
    model="gemini-3.5-flash",
    contents="What are the penalties for GDPR non-compliance?",
    config=shared_config,  # Same prefix = cache hit
)
print(f"Request 2 - Cached tokens: {response2.usage_metadata.cached_content_token_count}")
1.2 Explicit Caching (Deliberate)
For maximum control, create named caches that persist across sessions. This is ideal for large documents or few-shot example sets that multiple users or requests reference:
from google import genai
from google.genai import types
client = genai.Client()

# Step 1: upload the large document that will be cached.
uploaded_file = client.files.upload(file="company-handbook-500pages.pdf")

# Step 2: describe the cache contents, then create the explicit cache.
handbook_part = types.Part.from_uri(
    file_uri=uploaded_file.uri,
    mime_type="application/pdf",
)
cache_config = types.CreateCachedContentConfig(
    display_name="company-handbook-cache",
    contents=[types.Content(role="user", parts=[handbook_part])],
    system_instruction="You are an HR assistant with access to the company handbook. Answer employee questions based on this document.",
    ttl="3600s",  # Cache lives for 1 hour
)
cache = client.caches.create(model="gemini-3.5-flash", config=cache_config)

# Report the identifiers needed to reuse and manage the cache later.
print(f"Cache created: {cache.name}")
print(f"Token count: {cache.usage_metadata.total_token_count}")
print(f"Expires: {cache.expire_time}")
Use the cache in subsequent requests:
from google import genai
from google.genai import types
client = genai.Client()

# The cache is referenced by the name returned at creation time.
cache_name = "cachedContents/abc123"  # From cache creation
request_config = types.GenerateContentConfig(cached_content=cache_name)

response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents="What is the company policy on remote work?",
    config=request_config,
)
print(response.text)
print(f"Cached tokens used: {response.usage_metadata.cached_content_token_count}")
# Cached tokens are billed at ~75% discount
1.3 TTL Management & Cleanup
from google import genai
client = genai.Client()

# Name of the cache that is extended and then removed below.
target_cache = "cachedContents/abc123"

# List all active caches with their size and expiry.
for entry in client.caches.list():
    token_total = entry.usage_metadata.total_token_count
    print(f" {entry.name} | Tokens: {token_total} | Expires: {entry.expire_time}")

# Update TTL (extend cache lifetime to 2 hours).
client.caches.update(name=target_cache, config={"ttl": "7200s"})

# Delete the cache once it is no longer needed.
client.caches.delete(name=target_cache)
print("Cache deleted.")
2. Flex & Priority Inference
Google offers two inference modes to balance cost and latency based on your workload requirements:
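Flex Inference: Cost-optimized, lower-priority processing — ideal for background and batch workloads with no real-time requirement.
Priority Inference: Guaranteed low latency — ideal for user-facing applications where response time directly impacts UX.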
from google import genai
from google.genai import types
client = genai.Client()

# Flex Inference: cost-optimized (for background/batch workloads).
flex_config = types.GenerateContentConfig(
    routing_config=types.RoutingConfig(routing_mode="FLEX"),  # Lower priority, lower cost
)
flex_response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents="Summarize the key points of this research paper...",
    config=flex_config,
)
flex_preview = flex_response.text[:100]
print(f"Flex response: {flex_preview}...")

# Priority Inference: latency-optimized (for user-facing requests).
priority_config = types.GenerateContentConfig(
    routing_config=types.RoutingConfig(routing_mode="PRIORITY"),  # Guaranteed low latency
)
priority_response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents="What time is the next train to London?",
    config=priority_config,
)
priority_preview = priority_response.text[:100]
print(f"Priority response: {priority_preview}...")
2.1 When to Use Each Mode
| Use Case | Mode | Rationale |
|---|---|---|
| Chatbot responses | Priority | Users expect <2s response |
| Bulk document classification | Flex | No real-time requirement |
| Real-time autocomplete | Priority | Latency-critical UX |
| Nightly report generation | Flex | Scheduled, no urgency |
| Live customer support | Priority | Agent waiting for response |
| Content moderation queue | Flex | Minutes-level SLA acceptable |
Continuous Quality Monitoring
A customer support company monitors their Gemini chatbot 24/7: every 100th response is evaluated by a secondary LLM for accuracy and tone, daily reports track quality trends, and automated alerts fire if hallucination rate exceeds 5%. They caught a quality regression within 2 hours of a prompt change.
3. Batch API
The Batch API allows you to submit large sets of requests for asynchronous processing at reduced cost. Results are delivered when all requests complete:
from google import genai
from google.genai import types
client = genai.Client()

# Review texts to classify; each becomes one request in the batch.
reviews = [
    "Great product, fast shipping!",
    "Terrible quality, broke after a day.",
    "Average, nothing special.",
    "Exceeded all my expectations!",
    "Waste of money, do not buy.",
]
batch_requests = [
    {"contents": f"Classify this review as positive/negative: '{review}'"}
    for review in reviews
]

# Submit the whole set as a single batch job.
batch_config = types.BatchConfig(display_name="review-classification-batch")
batch_job = client.batches.create(
    model="gemini-3.5-flash",
    requests=batch_requests,
    config=batch_config,
)

print(f"Batch submitted: {batch_job.name}")
print(f"Status: {batch_job.state}")
print(f"Total requests: {len(batch_requests)}")
3.1 Retrieving Batch Results
import time
from google import genai
client = genai.Client()
batch_name = "batches/batch-abc123"  # From batch creation

POLL_INTERVAL_SECONDS = 10
# Upper bound on total waiting time. Without it the loop never ends if the
# job finishes in any state other than the two handled below.
MAX_WAIT_SECONDS = 60 * 60

# Poll for completion until the job succeeds, fails, or the deadline passes.
# NOTE(review): state values are compared as the plain strings used in the
# original snippet — confirm them against the SDK's job-state values.
deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    batch = client.batches.get(name=batch_name)
    print(f"Status: {batch.state} | Completed: {batch.completed_count}/{batch.total_count}")
    if batch.state == "SUCCEEDED":
        # Retrieve results
        for position, result in enumerate(batch.results, start=1):
            print(f" Request {position}: {result.response.text}")
        break
    if batch.state == "FAILED":
        print(f"Batch failed: {batch.error}")
        break
    if time.monotonic() >= deadline:
        print(f"Timed out after {MAX_WAIT_SECONDS}s waiting for {batch_name} (last state: {batch.state})")
        break
    time.sleep(POLL_INTERVAL_SECONDS)
4. Webhooks (Preview)
Instead of polling for async results (batch jobs, deep research), configure webhooks to receive push notifications when operations complete:
from google import genai
from google.genai import types
client = genai.Client()

# Completion is pushed to this endpoint instead of being polled for.
# The URL and secret are placeholders to replace with your own values.
webhook = types.WebhookConfig(
    url="https://your-server.com/api/webhooks/gemini",
    secret="your-webhook-secret-key",
)
summary_requests = [
    {"contents": "Summarize: The quarterly earnings exceeded expectations..."},
    {"contents": "Summarize: Supply chain disruptions impacted margins..."},
]

# Submit a batch with webhook notification.
batch_job = client.batches.create(
    model="gemini-3.5-flash",
    requests=summary_requests,
    config=types.BatchConfig(
        display_name="earnings-summaries",
        webhook_config=webhook,
    ),
)
print(f"Batch submitted with webhook: {batch_job.name}")
print("Results will be POSTed to your webhook URL on completion.")
Your webhook endpoint receives a payload like:
{
"event_type": "batch.completed",
"batch_name": "batches/batch-abc123",
"state": "SUCCEEDED",
"completed_count": 2,
"total_count": 2,
"timestamp": "2026-05-24T10:30:00Z"
}
5. Data Logging & Datasets
Opt-in data logging captures model interactions for quality evaluation, fine-tuning dataset creation, and compliance auditing:
5.1 PII Management
import re
from google import genai
from google.genai import types
client = genai.Client()
# Patterns are compiled once at import time rather than on every call.
_EMAIL_RE = re.compile(r'[\w.-]+@[\w.-]+\.\w+')
_CREDIT_CARD_RE = re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b')
_SSN_RE = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')
# 10-digit phone numbers: 5551234567, 555-123-4567, 555.123.4567,
# 555 123 4567, (555) 123-4567 and (555)123-4567.
_PHONE_RE = re.compile(r'(?:\(\d{3}\) ?|\b\d{3}[-. ]?)\d{3}[-. ]?\d{4}\b')


def sanitize_pii(text: str) -> str:
    """Replace common PII patterns in *text* with redaction placeholders.

    Args:
        text: Free-form text that may contain e-mail addresses, card
            numbers, SSNs in ``ddd-dd-dddd`` form, or 10-digit phone numbers.

    Returns:
        The text with each match replaced by ``[EMAIL_REDACTED]``,
        ``[CC_REDACTED]``, ``[SSN_REDACTED]`` or ``[PHONE_REDACTED]``.
        Text without matches (including the empty string) is returned
        unchanged.
    """
    # Email addresses
    text = _EMAIL_RE.sub('[EMAIL_REDACTED]', text)
    # The longer, more specific digit patterns run before the phone pattern
    # so the looser phone match never sees part of a card number or SSN.
    text = _CREDIT_CARD_RE.sub('[CC_REDACTED]', text)
    text = _SSN_RE.sub('[SSN_REDACTED]', text)
    # Phone numbers (separators: dash, dot, space; optional parenthesised area code)
    text = _PHONE_RE.sub('[PHONE_REDACTED]', text)
    return text
# Redact PII first so that only the sanitized text is sent with the request.
user_input = "My email is john@example.com and my phone is 555-123-4567"
sanitized_input = sanitize_pii(user_input)

# Labels attached to the request: opt in to logging and name the dataset.
logging_labels = {"logging": "enabled", "dataset": "customer-support-v2"}
response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=sanitized_input,
    config=types.GenerateContentConfig(labels=logging_labels),
)

# Show the raw input, what was actually sent, and the model's answer.
report = (
    ("Original", user_input),
    ("Logged as", sanitized_input),
    ("Response", response.text),
)
for label, value in report:
    print(f"{label}: {value}")
from google import genai
client = genai.Client()
# Describe which logged interactions to archive for quality monitoring.
# The config selects interactions by label and by date range; it is only
# printed here — exporting is done through AI Studio or the API.
dataset_config = dict(
    name="customer-support-eval-may-2026",
    description="Logged interactions for monthly quality review",
    filter={"labels.dataset": "customer-support-v2"},
    date_range=dict(start="2026-05-01", end="2026-05-31"),
)
dataset_name = dataset_config["name"]
print(f"Dataset configured: {dataset_name}")
print("Use AI Studio or the API to export and evaluate logged interactions.")
6. Rate Limits & Billing
Understanding and managing rate limits is critical for production stability:
| Model | RPM (Requests/min) | TPM (Tokens/min) | RPD (Requests/day) |
|---|---|---|---|
| Gemini 3.5 Flash | 2,000 | 4,000,000 | Unlimited |
| Gemini 3.1 Pro | 1,000 | 4,000,000 | Unlimited |
| Gemini 3.1 Flash Lite | 4,000 | 4,000,000 | Unlimited |
import time
import random
from google import genai
client = genai.Client()
def generate_with_backoff(prompt: str, max_retries: int = 5) -> str:
    """Generate content, retrying rate-limited calls with exponential backoff.

    Args:
        prompt: Text sent to the model as ``contents``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The ``text`` of the first successful response.

    Raises:
        RuntimeError: If every attempt was rate limited. The last
            rate-limit error is chained as the cause.
        Exception: Any error that is not a rate-limit error is re-raised
            unchanged on the attempt where it occurs.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model="gemini-3.5-flash",
                contents=prompt,
            )
        except Exception as e:
            # Rate limiting is detected from the error text.
            error_str = str(e)
            if "429" not in error_str and "RESOURCE_EXHAUSTED" not in error_str:
                raise  # Re-raise non-rate-limit errors
            last_error = e
            if attempt == max_retries - 1:
                # Final attempt: sleeping again would only delay the failure.
                break
            # Exponential delay (1s, 2s, 4s, ...) plus up to 1s of jitter.
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"Rate limited. Retrying in {wait_time:.1f}s (attempt {attempt + 1}/{max_retries})")
            time.sleep(wait_time)
        else:
            return response.text
    raise RuntimeError("Max retries exceeded for rate limiting") from last_error
# Usage: send one prompt through the retrying helper and show the answer.
prompt = "What is the capital of France?"
result = generate_with_backoff(prompt)
print(result)
import asyncio
from collections import deque
from google import genai
client = genai.Client()
class RequestQueue:
    """Sliding-window rate limiter for Gemini API requests.

    Keeps the timestamps of the requests admitted during the last 60 seconds
    and makes callers wait once ``max_rpm`` of them are recorded.
    """

    def __init__(self, max_rpm: int = 1500):
        """Create a limiter that admits at most ``max_rpm`` requests per 60 s.

        Raises:
            ValueError: If ``max_rpm`` is smaller than 1.
        """
        if max_rpm < 1:
            raise ValueError("max_rpm must be at least 1")
        self.max_rpm = max_rpm
        # Event-loop timestamps of admitted requests, oldest first.
        self.request_times = deque()

    async def wait_for_slot(self):
        """Wait until a request slot is available, then claim it."""
        loop = asyncio.get_running_loop()
        while True:
            now = loop.time()
            # Remove timestamps that have aged out of the 60-second window.
            while self.request_times and (now - self.request_times[0]) >= 60:
                self.request_times.popleft()
            if len(self.request_times) < self.max_rpm:
                # Check and append run without an intervening await, so
                # coroutines on the same loop cannot claim the same slot.
                self.request_times.append(now)
                return
            # At capacity: sleep until the oldest request ages out, then
            # re-check with a fresh clock reading instead of assuming a slot
            # is free and recording the stale pre-sleep timestamp.
            await asyncio.sleep(max(0, 60 - (now - self.request_times[0])))

    async def generate(self, prompt: str) -> str:
        """Generate with rate limiting and return the response text."""
        await self.wait_for_slot()
        # Use the SDK's async client so the event loop is not blocked while
        # the request is in flight.
        response = await client.aio.models.generate_content(
            model="gemini-3.5-flash",
            contents=prompt,
        )
        return response.text
# Usage: cap the queue at 1500 requests per minute, leaving headroom
# below the 2000 RPM limit.
queue = RequestQueue(max_rpm=1500) # Stay under 2000 RPM limit
# Example call, left commented out so the snippet makes no API request:
# result = asyncio.run(queue.generate("Hello!"))
Use response.usage_metadata to audit every request’s token breakdown.
Next in the Gemini SDK Track
In Part 14: Enterprise Migration & Framework Ecosystem, we’ll migrate from generateContent to the Interactions API, set up GCP OAuth service accounts, and integrate Gemini with LangChain, LangGraph, CrewAI, LlamaIndex, and the Vercel AI SDK.