1. Planning
Planning enables crews to decompose complex tasks into structured steps before execution begins. When enabled, a dedicated planning LLM analyzes all tasks and creates an execution plan that agents follow, improving coherence and reducing wasted iterations.
1.1 Crew with Planning Enabled
from crewai import Agent, Task, Crew, Process

# --- Agents: one specialist per stage of the research -> analysis -> report flow ---
researcher = Agent(
    role="Market Researcher",
    goal="Gather comprehensive market data",
    backstory="Expert at finding and analyzing market information",
    verbose=True,
)

analyst = Agent(
    role="Data Analyst",
    goal="Transform raw data into actionable insights",
    backstory="Statistical expert who finds patterns in data",
    verbose=True,
)

writer = Agent(
    role="Report Writer",
    goal="Create clear, compelling business reports",
    backstory="Technical writer specializing in executive communications",
    verbose=True,
)

# --- Tasks: each later task lists the previous one in `context` ---
research_task = Task(
    description="Research the AI SaaS market size, growth rate, and top 5 competitors",
    expected_output="Market data with sources: size, CAGR, competitor list with revenues",
    agent=researcher,
)

analysis_task = Task(
    description="Analyze the market data to identify gaps and opportunities",
    expected_output="SWOT analysis with 3 specific market entry opportunities",
    agent=analyst,
    context=[research_task],
)

report_task = Task(
    description="Write an executive summary report with recommendations",
    expected_output="1-page executive summary with top 3 actionable recommendations",
    agent=writer,
    context=[analysis_task],
)

# --- Crew: planning=True asks the crew to draft an execution plan before it starts ---
crew = Crew(
    agents=[researcher, analyst, writer],
    tasks=[research_task, analysis_task, report_task],
    process=Process.sequential,
    planning=True,  # the only switch needed to turn planning on
    verbose=True,
)

result = crew.kickoff()
print(result.raw)
from crewai import Agent, Task, Crew, Process

# Custom planning LLM — use a different model for planning vs execution.
# Planning is driven by the crew itself: `planning=True` switches it on and
# `planning_llm` selects the model used for that step. No separate "planner"
# agent has to be defined or added to `agents`, so only the worker is created.
worker = Agent(
    role="Task Executor",
    goal="Execute tasks according to the plan",
    backstory="Efficient worker who follows plans precisely",
    verbose=True,
)

task = Task(
    description="Build a complete REST API specification for a todo app with auth",
    expected_output="OpenAPI 3.0 spec with all endpoints, schemas, and auth flows",
    agent=worker,
)

# Planning with custom LLM configuration
crew = Crew(
    agents=[worker],
    tasks=[task],
    planning=True,
    planning_llm="gpt-4o",  # model used only for the planning step
)

result = crew.kickoff()
print(result.raw)
2. Reasoning
Reasoning capabilities allow agents to perform multi-step logical analysis before acting. This is particularly valuable for tasks involving complex decisions, trade-offs, or ambiguous requirements.
2.1 Reasoning Agent
from crewai import Agent, Task, Crew

# Agent configured for deep reasoning.
# `reasoning=True` is what actually enables the agent's reason-then-act step;
# the backstory and iteration budget only shape how it goes about the work.
architect = Agent(
    role="Solutions Architect",
    goal="Design optimal system architectures with clear reasoning",
    backstory="""Senior solutions architect with 20 years experience.
You think through problems methodically, considering trade-offs,
constraints, and long-term implications before recommending solutions.
You always explain your reasoning chain.""",
    verbose=True,
    reasoning=True,  # Reflect and plan before executing the task
    max_iter=15,  # Allow more iterations for complex reasoning
    allow_delegation=False,  # Focus on own reasoning
)

# Complex task requiring reasoning: explicit requirements plus the
# trade-offs the agent is expected to argue through.
architecture_task = Task(
    description="""Design a system architecture for a real-time collaborative
document editor (like Google Docs) with these requirements:
- 10M concurrent users
- Sub-200ms latency for character-level edits
- Strong eventual consistency
- Offline support with conflict resolution
- End-to-end encryption for enterprise tier
Reason through the trade-offs between:
1. OT (Operational Transform) vs CRDT approaches
2. WebSocket vs WebTransport for real-time sync
3. Regional vs global consistency models""",
    expected_output="""Architecture decision document with:
- Chosen approach with explicit reasoning
- Trade-off analysis for each decision
- Component diagram description
- Scalability analysis""",
    agent=architect,
)

crew = Crew(agents=[architect], tasks=[architecture_task])
result = crew.kickoff()
print(result.raw)
3. Testing Crews
CrewAI provides CLI-based testing and supports programmatic test suites. Testing ensures your crews produce consistent, high-quality results across runs.
# Run crew tests via CLI
crewai test
# Run with specific number of iterations (-n sets how many times the crew is executed)
crewai test -n 3
# This executes your crew multiple times and evaluates:
# - Output quality consistency
# - Task completion rates
# - Token usage efficiency
# NOTE(review): the metrics actually reported depend on the installed CrewAI
# version — confirm against `crewai test --help`.
3.1 Unit Testing Agents and Tasks
import pytest
from crewai import Agent, Task, Crew
class TestResearchCrew:
    """Pytest suite covering agent, task, and crew basics for the research crew."""

    def setup_method(self):
        """Build a fresh, non-verbose researcher agent before every test."""
        self.researcher = Agent(
            role="Test Researcher",
            goal="Find information accurately",
            backstory="Research expert for testing",
            verbose=False,  # keep test output quiet
        )

    def _kickoff_single_task(self, description, expected_output):
        """Run a one-task, non-verbose crew with the fixture agent; return its result."""
        single_task = Task(
            description=description,
            expected_output=expected_output,
            agent=self.researcher,
        )
        single_crew = Crew(
            agents=[self.researcher],
            tasks=[single_task],
            verbose=False,
        )
        return single_crew.kickoff()

    def test_agent_creation(self):
        """The fixture agent exposes the role and goal it was built with."""
        agent = self.researcher
        assert agent.role == "Test Researcher"
        assert agent.goal == "Find information accurately"

    def test_task_creation(self):
        """A task keeps the description and agent passed to its constructor."""
        tokyo_task = Task(
            description="Find the population of Tokyo",
            expected_output="Population number with source",
            agent=self.researcher,
        )
        assert tokyo_task.description == "Find the population of Tokyo"
        assert tokyo_task.agent == self.researcher

    def test_crew_execution(self):
        """Kicking off a crew yields a non-empty raw output."""
        result = self._kickoff_single_task(
            description="What is 2 + 2?",
            expected_output="The number 4",
        )
        assert result.raw is not None
        assert len(result.raw) > 0

    def test_output_contains_expected_content(self):
        """The raw output mentions at least one of the expected colors."""
        result = self._kickoff_single_task(
            description="List exactly 3 primary colors",
            expected_output="Red, Blue, Yellow",
        )
        lowered = result.raw.lower()
        assert any(color in lowered for color in ("red", "blue"))
# Run with: pytest test_crew.py -v
3.2 Mocking LLM Calls
import pytest
from unittest.mock import patch, MagicMock
from crewai import Agent, Task, Crew
def test_crew_with_mocked_llm():
    """Test code that consumes a crew result without making LLM API calls.

    ``Crew.kickoff`` is replaced with a mock that returns a canned result, so
    nothing is sent to an LLM provider. Note that this stubs the whole
    kickoff rather than the underlying LLM: it verifies how the result is
    consumed, not how the crew produces it.
    """
    researcher = Agent(
        role="Researcher",
        goal="Research topics",
        backstory="Expert researcher",
        verbose=False,
    )
    task = Task(
        description="Research AI trends",
        expected_output="List of trends",
        agent=researcher,
    )
    crew = Crew(
        agents=[researcher],
        tasks=[task],
        verbose=False,
    )

    # Canned result standing in for a real crew output.
    mock_result = MagicMock()
    mock_result.raw = "1. Large Language Models\n2. AI Agents\n3. Multimodal AI"

    # Patch on the class, not the instance: Crew is a pydantic model, and
    # pydantic models generally reject assigning non-field attributes on an
    # instance, which is what patch.object(crew, ...) would attempt.
    with patch.object(Crew, "kickoff", return_value=mock_result) as mock_kickoff:
        result = crew.kickoff()

    assert "AI Agents" in result.raw
    assert mock_kickoff.called
def test_task_output_format():
    """Check the structured-output schema on its own, with no LLM involved."""
    # Local imports keep this schema check self-contained.
    from pydantic import BaseModel
    from typing import List

    class TrendReport(BaseModel):
        # Shape a structured trend report is expected to have.
        trends: List[str]
        confidence: float

    # Build the model directly; only the schema is under test here.
    sample = TrendReport(trends=["AI Agents", "RAG"], confidence=0.85)

    assert len(sample.trends) == 2
    assert 0 <= sample.confidence <= 1
Strategic Planning Automation
A startup accelerator uses CrewAI with planning enabled for its quarterly strategy reviews: the crew decomposes “evaluate our portfolio performance” into specific research tasks, assigns them to specialist agents, and synthesizes the findings into actionable recommendations. Planning ensures no important angle is missed. Result: strategy documents that used to take founders 2 days to produce are now generated in 45 minutes.
4. Replay Tasks
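Replay allows you to re-execute a crew from a specific task of the most recent kickoff without re-running the entire crew: execution resumes at the task you select, using the context retained from the tasks that ran before it. This is invaluable for debugging and iterating on individual task configurations.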
4.1 Replay Workflow
# List the tasks (and their task IDs) recorded for the latest kickoff
crewai log-tasks-outputs
# Replay from a specific task by its ID
crewai replay -t task_id_here
# This re-runs the crew starting at that task, reusing the context
# retained from the original run, allowing you to:
# - Test prompt changes without full re-runs
# - Debug a single failing task
# - Iterate on expected_output formatting
from crewai import Agent, Task, Crew

# A single analyst handles both tasks in this replay demonstration.
analyst = Agent(
    role="Financial Analyst",
    goal="Produce accurate financial analysis",
    backstory="CFA-certified analyst with 10 years experience",
    verbose=True,
)

# Task 1 — data gathering: the expensive step we would rather not repeat.
gather_task = Task(
    description="Gather Q1 2026 revenue data for FAANG companies",
    expected_output="Revenue figures with YoY growth percentages",
    agent=analyst,
)

# Task 2 — analysis: the step we want to iterate on; it takes task 1 as context.
analysis_task = Task(
    description="Analyze the revenue trends and predict Q2 outlook",
    expected_output="Trend analysis with Q2 predictions and confidence intervals",
    agent=analyst,
    context=[gather_task],
)

crew = Crew(agents=[analyst], tasks=[gather_task, analysis_task])

# First run — executes both tasks.
result = crew.kickoff()
print(f"First run: {result.raw[:100]}...")

# After reviewing the output, replay just the analysis task from the CLI,
# passing the ID of analysis_task:
#   crewai replay -t <task_id>
# The replay reuses the stored gather_task output as context.
5. Crew Evaluation
Evaluate crew performance systematically by measuring quality, speed, and cost. Use evaluation results to iterate on agent definitions, task configurations, and process settings.
5.1 Evaluation Framework
import time
from crewai import Agent, Task, Crew
def evaluate_crew(crew, test_inputs, expected_outputs, n_runs=3):
    """Evaluate a crew's performance across multiple runs.

    Each test input is passed to ``crew.kickoff(inputs=...)`` ``n_runs``
    times. Every run is timed and scored by the share of expected keywords
    found (case-insensitively, as substrings) in ``result.raw``.

    Args:
        crew: The Crew instance to evaluate. Only ``kickoff(inputs=...)`` is
            called on it, and the returned object must expose a string
            ``raw`` attribute (``token_usage`` is read if present).
        test_inputs: List of input dictionaries for crew.kickoff(inputs=...)
        expected_outputs: List of expected keyword lists, one per test input.
        n_runs: Number of evaluation runs per input; must be at least 1.

    Returns:
        A list with one dict per test case containing ``test_case``
        (1-based index), ``input``, ``runs`` (per-run ``run``,
        ``time_seconds``, ``quality_pct``, ``output_length``,
        ``token_usage``), ``avg_quality`` and ``avg_time``.

    Raises:
        ValueError: If ``n_runs`` is below 1, or if ``test_inputs`` and
            ``expected_outputs`` differ in length.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    # Materialize so lengths can be compared; zip() alone would silently
    # drop the unmatched test cases.
    test_inputs = list(test_inputs)
    expected_outputs = list(expected_outputs)
    if len(test_inputs) != len(expected_outputs):
        raise ValueError(
            "test_inputs and expected_outputs must have the same length "
            f"({len(test_inputs)} != {len(expected_outputs)})"
        )

    results = []
    for case_no, (test_input, expected) in enumerate(
        zip(test_inputs, expected_outputs), start=1
    ):
        # Lower-case the keywords once per test case, not once per run.
        keywords = [keyword.lower() for keyword in expected]
        run_results = []
        for run_no in range(1, n_runs + 1):
            # perf_counter is monotonic, so system clock adjustments
            # cannot skew the measured duration.
            start_time = time.perf_counter()
            result = crew.kickoff(inputs=test_input)
            elapsed = time.perf_counter() - start_time

            # Quality check: does output contain expected content?
            output = result.raw.lower()
            quality_score = sum(1 for keyword in keywords if keyword in output)
            # With no keywords there is nothing to measure; report 0.0
            # rather than dividing by zero.
            quality_pct = quality_score / len(keywords) * 100 if keywords else 0.0

            run_results.append({
                "run": run_no,
                "time_seconds": round(elapsed, 2),
                "quality_pct": quality_pct,
                "output_length": len(result.raw),
                "token_usage": getattr(result, 'token_usage', None),
            })

        results.append({
            "test_case": case_no,
            "input": test_input,
            "runs": run_results,
            "avg_quality": sum(r["quality_pct"] for r in run_results) / n_runs,
            "avg_time": sum(r["time_seconds"] for r in run_results) / n_runs,
        })
    return results
# Example evaluation setup: one writer agent, one templated task.
writer = Agent(
    role="Content Writer",
    goal="Write high-quality blog posts",
    backstory="Expert content writer",
    verbose=False,
)

# {topic} is filled in from each test input passed to kickoff(inputs=...).
task = Task(
    description="Write a short blog intro about {topic}",
    expected_output="A 2-3 sentence engaging blog introduction",
    agent=writer,
)

crew = Crew(agents=[writer], tasks=[task])

# Test cases: each input is paired by position with its keyword list below.
test_inputs = [
    {"topic": "machine learning"},
    {"topic": "remote work"},
    {"topic": "sustainable energy"},
]

expected_keywords = [
    ["machine learning", "ai", "data"],
    ["remote", "work", "team"],
    ["sustainable", "energy", "green"],
]

# Run the evaluation twice per test case.
eval_results = evaluate_crew(crew, test_inputs, expected_keywords, n_runs=2)

# Report the per-case averages.
for result in eval_results:
    print(f"\nTest Case {result['test_case']}: {result['input']}")
    print(f" Avg Quality: {result['avg_quality']:.0f}%")
    print(f" Avg Time: {result['avg_time']:.1f}s")
Next in the CrewAI SDK Track
In Part 11: Human-in-the-Loop & Multimodal, we’ll implement human feedback integration for critical decisions, multi-modal agent capabilities with vision and audio, and production deployment patterns.