1. Speech Generation (Preview)
Gemini’s speech generation capabilities allow you to synthesize natural-sounding speech from text, with support for multiple voices, emotional expression, and fine-grained pacing control through expressive audio tags.
1.1 TTS Capabilities
Generate speech audio directly from text prompts using generate_content with audio output configuration:
import wave

from google import genai
from google.genai import types

client = genai.Client()

# Generate speech from text: request audio-only output spoken with the
# prebuilt "Kore" voice.
response = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name="Kore"
                )
            )
        ),
    ),
    contents="Welcome to the Gemini SDK tutorial series. Today we'll explore speech generation and audio processing capabilities.",
)

# The audio bytes are carried as inline data on the first part of the first
# candidate.
audio_data = response.candidates[0].content.parts[0].inline_data.data

# Save the audio output. The payload is headerless PCM, so it is written
# through the wave module, which adds the RIFF/WAV header; dumping the bytes
# directly into a .wav file produces a file most players reject.
# NOTE(review): 1 channel / 16-bit / 24 kHz follows the Gemini TTS
# documentation — confirm for the model in use.
with wave.open("speech_output.wav", "wb") as wav_file:
    wav_file.setnchannels(1)
    wav_file.setsampwidth(2)
    wav_file.setframerate(24000)
    wav_file.writeframes(audio_data)
print("Speech saved to speech_output.wav")
1.2 Voice Selection
Multiple pre-built voices are available, each with distinct characteristics:
| Voice | Character | Best For |
|---|---|---|
| Zephyr | Bright, clear | Tutorials, explanations |
| Puck | Upbeat, energetic | Marketing, announcements |
| Charon | Deep, authoritative | Narration, formal content |
| Kore | Warm, conversational | Dialogue, customer service |
| Fenrir | Bold, commanding | Presentations, alerts |
| Leda | Gentle, soothing | Meditation, bedtime stories |
import wave

from google import genai
from google.genai import types

client = genai.Client()

voices = ["Zephyr", "Puck", "Charon", "Kore"]
text = "The quick brown fox jumps over the lazy dog."


def _write_wav(path, pcm_bytes):
    """Wrap raw PCM bytes in a WAV container at ``path``.

    NOTE(review): 1 channel / 16-bit / 24 kHz follows the Gemini TTS
    documentation — confirm for the model in use.
    """
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(24000)
        wav_file.writeframes(pcm_bytes)


# Render the same sentence once per voice so the samples can be compared.
for voice_name in voices:
    response = client.models.generate_content(
        model="gemini-3.5-flash",
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name
                    )
                )
            ),
        ),
        contents=text,
    )
    audio_data = response.candidates[0].content.parts[0].inline_data.data

    # Write through the wave module so the file gets a valid header instead
    # of being a bare PCM dump with a .wav extension.
    output_path = f"voice_sample_{voice_name.lower()}.wav"
    _write_wav(output_path, audio_data)
    print(f"Generated: {output_path}")
1.3 Expressive Audio Tags
Control tone, emotion, pacing, and emphasis using inline tags within your text prompt:
import wave

from google import genai
from google.genai import types

client = genai.Client()

# Use expressive directions for nuanced speech: each line tells the model
# how the quoted sentence should be delivered.
expressive_text = """
Say excitedly: "Wow, we just hit one million users!"
Then say calmly and slowly: "Let me walk you through what this means for our roadmap."
Then say with emphasis: "Security remains our number one priority."
End with a warm tone: "Thank you all for being part of this journey."
"""

response = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name="Kore"
                )
            )
        ),
    ),
    contents=expressive_text,
)

audio_data = response.candidates[0].content.parts[0].inline_data.data

# The payload is headerless PCM; the wave module supplies the WAV header that
# a plain binary write would omit.
# NOTE(review): 1 channel / 16-bit / 24 kHz follows the Gemini TTS
# documentation — confirm for the model in use.
with wave.open("expressive_speech.wav", "wb") as wav_file:
    wav_file.setnchannels(1)
    wav_file.setsampwidth(2)
    wav_file.setframerate(24000)
    wav_file.writeframes(audio_data)
print("Expressive speech saved.")
2. Audio Understanding
Gemini can process audio files for transcription, summarization, speaker identification, sentiment analysis, and Q&A. Audio is tokenized at 32 tokens/second, making even long recordings affordable to process.
2.1 Processing Audio Files
import time

from google import genai
from google.genai import types

client = genai.Client()

# Upload an audio file
audio_file = client.files.upload(file="meeting_recording.mp3")

# Wait for processing: re-fetch the file record every two seconds until it
# leaves the PROCESSING state.
while audio_file.state == "PROCESSING":
    time.sleep(2)
    audio_file = client.files.get(name=audio_file.name)

# Leaving PROCESSING does not mean success; stop here rather than sending an
# unusable file reference to the model.
if audio_file.state == "FAILED":
    raise RuntimeError(f"Audio processing failed: {audio_file.name}")

print(f"Audio ready: {audio_file.name}")
# Rough estimate only: dividing by 16000 assumes about 16,000 bytes per second
# of audio (a 128 kbps encoding).
print(f"Duration: ~{audio_file.size_bytes // 16000}s (estimated)")

# Transcribe and summarize: one user turn holding the uploaded file reference
# followed by the text instruction.
response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=[
        types.Content(role="user", parts=[
            types.Part(file_data=types.FileData(
                file_uri=audio_file.uri,
                mime_type="audio/mp3"
            )),
            types.Part(text="Transcribe this audio and provide a bullet-point summary of key discussion points."),
        ])
    ],
)
print(response.text)
2.2 Transcription & Analysis Tasks
Combine audio input with structured outputs for reliable extraction:
import json

from google import genai
from google.genai import types

client = genai.Client()

# Assume audio_file is already uploaded and ACTIVE
audio_file = client.files.get(name="files/audio123")

# Shape of a single action item; only the task text is mandatory.
action_item_schema = {
    "type": "object",
    "properties": {
        "task": {"type": "string"},
        "assigned_to": {"type": "string"},
        "deadline_mentioned": {"type": "boolean"}
    },
    "required": ["task"]
}

# Top-level schema for the structured meeting notes.
meeting_notes_schema = {
    "type": "object",
    "properties": {
        "speakers_detected": {"type": "integer"},
        "duration_minutes": {"type": "number"},
        "language": {"type": "string"},
        "topics": {
            "type": "array",
            "items": {"type": "string"}
        },
        "action_items": {
            "type": "array",
            "items": action_item_schema
        },
        "sentiment": {"type": "string", "enum": ["positive", "neutral", "negative", "mixed"]}
    },
    "required": ["speakers_detected", "topics", "sentiment"]
}

# Ask for JSON output constrained by the schema above.
json_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=meeting_notes_schema,
)

# One user turn: the uploaded audio reference plus the extraction instruction.
user_turn = types.Content(role="user", parts=[
    types.Part(file_data=types.FileData(
        file_uri=audio_file.uri,
        mime_type="audio/mp3"
    )),
    types.Part(text="Analyze this meeting recording. Extract speakers, topics, action items, and overall sentiment."),
])

# Extract structured meeting notes
response = client.models.generate_content(
    model="gemini-3.5-flash",
    config=json_config,
    contents=[user_turn],
)

# Parse the JSON text and report the headline fields.
analysis = json.loads(response.text)
print(f"Speakers: {analysis['speakers_detected']}")
print(f"Sentiment: {analysis['sentiment']}")
print(f"Topics: {', '.join(analysis['topics'])}")
# action_items is not in the schema's required list, so default to empty.
for item in analysis.get('action_items', []):
    print(f" → {item['task']}")
Automated Financial Reporting
A fintech company built a Gemini-powered analyst that queries their data warehouse via function calls: get_revenue(period), get_expenses(category, period), compare_periods(metric, period1, period2). Analysts ask questions in natural language and receive formatted reports with charts — reducing report generation from 4 hours to 5 minutes.
3. Veo: Video Synthesis
Veo is Google’s video generation model, capable of creating high-definition video clips from text descriptions. The current preview model (veo-3.1-generate-preview) supports camera movements, scene transitions, and temporal coherence.
3.1 Text-to-Video
import time

from google import genai
from google.genai import types

client = genai.Client()

# Generate a video from text description
operation = client.models.generate_videos(
    model="veo-3.1-generate-preview",
    prompt="A drone shot slowly rising over a misty mountain lake at sunrise, revealing snow-capped peaks in the background. Cinematic, 4K quality.",
    config=types.GenerateVideosConfig(
        number_of_videos=1,
        duration_seconds=8,
        aspect_ratio="16:9"
    ),
)

# Video generation is asynchronous — poll for completion
print("Generating video...")
while not operation.done:
    time.sleep(10)
    # Refresh the operation handle to pick up the latest state.
    operation = client.operations.get(operation)
    print(f" Status: {operation.metadata}")

# Download the generated video
if operation.result:
    video = operation.result.generated_videos[0]
    # The Veo documentation fetches the file through the Files API and then
    # saves it; writing video.video.video_bytes directly skips that download
    # step.
    client.files.download(file=video.video)
    video.video.save("mountain_lake.mp4")
    print("Video saved: mountain_lake.mp4")
else:
    print(f"Generation failed: {operation.error}")
3.2 Generation Parameters
Control the video output characteristics:
| Parameter | Options | Description |
|---|---|---|
| duration_seconds | 5–8 | Length of generated clip |
| aspect_ratio | 16:9, 9:16, 1:1 | Output video dimensions |
| number_of_videos | 1–4 | Variants to generate |
| negative_prompt | Free text | What to avoid in output |
from google import genai
from google.genai import types

client = genai.Client()

# Output settings: two vertical (9:16) five-second variants, with a negative
# prompt listing what the output should avoid.
video_config = types.GenerateVideosConfig(
    number_of_videos=2,
    duration_seconds=5,
    aspect_ratio="9:16",
    negative_prompt="blurry, low quality, distorted hands",
)

# Generate with specific parameters. This snippet only starts the job and
# prints the operation name; it does not wait for the result.
operation = client.models.generate_videos(
    model="veo-3.1-generate-preview",
    prompt="A barista carefully pouring latte art in a cozy coffee shop, close-up shot, warm lighting, shallow depth of field",
    config=video_config,
)

print(f"Generation started. Operation: {operation.name}")
print("Poll operation.done to check completion...")
4. Imagen: Image Synthesis
Imagen 4.0 is Google’s flagship image generation model, producing hyper-realistic images with precise spatial understanding, accurate text rendering, and photorealistic lighting.
4.1 Hyper-Realistic Generation
from google import genai
from google.genai import types

client = genai.Client()

# Two widescreen candidates with the medium-and-above safety filter.
image_config = types.GenerateImagesConfig(
    number_of_images=2,
    aspect_ratio="16:9",
    safety_filter_level="BLOCK_MEDIUM_AND_ABOVE",
)

# Generate a photorealistic image
response = client.models.generate_images(
    model="imagen-4.0-generate-preview",
    prompt="A modern minimalist home office with floor-to-ceiling windows overlooking a forest, natural wood desk, single monitor, indoor plant, morning light casting long shadows",
    config=image_config,
)

# Save generated images, numbering the files from 1.
generated = response.generated_images
for index, item in enumerate(generated, start=1):
    filename = f"home_office_{index}.png"
    with open(filename, "wb") as out_file:
        out_file.write(item.image.image_bytes)
    print(f"Saved: {filename}")

print(f"Generated {len(generated)} images")
4.2 Mask-Based Editing
Edit specific regions of an existing image using masks:
from google import genai
from google.genai import types

client = genai.Client()

# Load base image and mask. Context managers close both files promptly
# instead of leaving the handles to the garbage collector.
with open("product_photo.png", "rb") as base_file:
    base_image_bytes = base_file.read()
with open("background_mask.png", "rb") as mask_file:
    mask_bytes = mask_file.read()  # White = editable area

# Replace background while keeping the product
# NOTE(review): confirm that generate_images accepts edit_config / image /
# mask and that types.EditConfig exists in the installed SDK; the google-genai
# SDK documents mask-based editing through client.models.edit_image with
# reference images — verify before relying on this call shape.
response = client.models.generate_images(
    model="imagen-4.0-generate-preview",
    prompt="Professional product photography background: clean white marble surface with soft studio lighting and subtle shadows",
    config=types.GenerateImagesConfig(
        number_of_images=1,
        edit_config=types.EditConfig(
            edit_mode="EDIT_MODE_INPAINT_INSERTION"
        )
    ),
    image=types.Image(image_bytes=base_image_bytes),
    mask=types.Image(image_bytes=mask_bytes),
)

# Every result is written to the same path; with number_of_images=1 there is
# exactly one file to write.
for image in response.generated_images:
    with open("product_new_background.png", "wb") as f:
        f.write(image.image.image_bytes)
print("Edited image saved.")
Tip: Use negative_prompt to avoid unwanted artifacts.
5. Lyria: Music & Audio Generation
Lyria 3 is Google’s music generation model, capable of creating original compositions, ambient soundscapes, and musical arrangements from text descriptions. Lyria RealTime enables low-latency streaming for interactive audio experiences.
5.1 Musical Generation
from google import genai
from google.genai import types

client = genai.Client()

# Text description of the track to compose.
music_prompt = "Create a 30-second lo-fi hip hop beat with soft piano chords, vinyl crackle, and a relaxed drum pattern. Perfect for studying."

# Generate a music track, requesting audio-only output.
# NOTE(review): this snippet does not show whether the model is reachable
# through generate_content or what audio encoding it returns — confirm
# against the Lyria documentation.
audio_only_config = types.GenerateContentConfig(
    response_modalities=["AUDIO"]
)
response = client.models.generate_content(
    model="lyria-realtime-preview",
    config=audio_only_config,
    contents=music_prompt,
)

# Save the generated audio: the bytes come from the inline data of the first
# part of the first candidate and are written unchanged.
first_part = response.candidates[0].content.parts[0]
audio_data = first_part.inline_data.data
with open("lofi_study_beat.wav", "wb") as out_file:
    out_file.write(audio_data)
print("Music saved: lofi_study_beat.wav")
5.2 RealTime Audio Streaming
Lyria RealTime enables low-latency audio generation for interactive applications — music that adapts in real-time to user input or game state:
from google import genai
from google.genai import types

client = genai.Client()

# Generate ambient audio layers
ambient_prompt = """Create a layered ambient soundscape:
- Base layer: Gentle rain on a window
- Mid layer: Distant thunder every 10 seconds
- Top layer: Soft wind chimes occasionally
The mood should be peaceful and meditative. Duration: 60 seconds."""

# Audio-only output.
# NOTE(review): this is a single blocking generate_content call, not a
# streaming session — confirm the intended API against the Lyria RealTime
# documentation.
audio_only_config = types.GenerateContentConfig(
    response_modalities=["AUDIO"]
)
response = client.models.generate_content(
    model="lyria-realtime-preview",
    config=audio_only_config,
    contents=ambient_prompt,
)

# Write the returned bytes unchanged and report their size in kilobytes.
first_part = response.candidates[0].content.parts[0]
audio_data = first_part.inline_data.data
with open("rain_ambience.wav", "wb") as out_file:
    out_file.write(audio_data)
print("Ambient audio saved: rain_ambience.wav")
print(f"Audio size: {len(audio_data) / 1024:.1f} KB")
Next in the Gemini SDK Track
In Part 5: Thinking, Reasoning & Thought Signatures, we’ll explore how to control the model’s reasoning depth with thinking budgets, inspect thought processes for debugging, and leverage thought signatures for complex multi-step problem solving.