1. Image Understanding
Gemini natively processes images as part of its multimodal input. You can pass images inline (base64-encoded) or via file URIs from the Files API. The model can describe scenes, read text (OCR), analyze charts, identify objects, and reason about spatial relationships.
1.1 Inline Base64 Images
For small images (under 20MB), embed the image bytes directly in the request:
from google import genai
from google.genai import types
import base64
import pathlib
client = genai.Client()

# Load the PNG from disk as raw bytes; they are handed to the request as-is,
# with no manual base64 step in this script.
image_bytes = pathlib.Path("architecture_diagram.png").read_bytes()

# Build the single user turn: the picture first, then the question about it.
image_part = types.Part(
    inline_data=types.Blob(mime_type="image/png", data=image_bytes)
)
question_part = types.Part(
    text="Describe this architecture diagram. List all components and their connections."
)
user_turn = types.Content(role="user", parts=[image_part, question_part])

response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=[user_turn],
)
print(response.text)
1.2 File URI References
For larger files or reusable assets, upload first via the Files API then reference by URI:
from google import genai
from google.genai import types
client = genai.Client()

# One upload; the returned handle supplies the name, state, URI and MIME type
# used below.
uploaded_file = client.files.upload(file="dashboard_screenshot.png")
print(f"Uploaded: {uploaded_file.name} (state: {uploaded_file.state})")

# Refer to the stored file by URI rather than sending its bytes in the request.
file_reference = types.FileData(
    file_uri=uploaded_file.uri,
    mime_type=uploaded_file.mime_type,
)
user_turn = types.Content(
    role="user",
    parts=[
        types.Part(file_data=file_reference),
        types.Part(text="What metrics are shown on this dashboard? Are any in a critical state?"),
    ],
)

response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=[user_turn],
)
print(response.text)
1.3 Spatial Layout Analysis
Gemini can analyze spatial relationships, identify UI components, and describe layouts:
from google import genai
from google.genai import types
import json

client = genai.Client()

# Read the mockup inside a context manager so the file handle is closed as
# soon as the bytes are in memory (the bare open(...).read() left it open).
with open("mobile_app_mockup.png", "rb") as image_file:
    image_bytes = image_file.read()

# Shape of the JSON reply requested from the model: a screen type, a list of
# components (element, position limited to top/center/bottom, purpose), and
# an optional list of accessibility issues.
layout_schema = {
    "type": "object",
    "properties": {
        "screen_type": {"type": "string"},
        "components": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "element": {"type": "string"},
                    "position": {"type": "string", "enum": ["top", "center", "bottom"]},
                    "purpose": {"type": "string"},
                },
                "required": ["element", "position", "purpose"],
            },
        },
        "accessibility_issues": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["screen_type", "components"],
}

response = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=layout_schema,
    ),
    contents=[
        types.Content(role="user", parts=[
            types.Part(inline_data=types.Blob(mime_type="image/png", data=image_bytes)),
            types.Part(text="Analyze this mobile app screen. Identify all UI components and any accessibility issues."),
        ])
    ],
)

# The reply was requested as JSON, so parse it before reading fields.
analysis = json.loads(response.text)
print(f"Screen type: {analysis['screen_type']}")
for comp in analysis['components']:
    print(f" [{comp['position']}] {comp['element']} — {comp['purpose']}")
Tip: Use the media_resolution setting in GenerateContentConfig to balance cost vs. detail.
2. Image Generation
While Gemini excels at image understanding, Google provides dedicated models for image generation. The primary model is Imagen 4.0 for hyper-realistic image synthesis.
2.1 Imagen Model
from google import genai
from google.genai import types
client = genai.Client()

# Keep the prompt and the output settings as named values, separate from the call.
garden_prompt = "A serene Japanese garden in autumn with a red maple tree reflecting in a koi pond, photorealistic, golden hour lighting"
image_settings = types.GenerateImagesConfig(
    number_of_images=1,
    aspect_ratio="16:9",
)

response = client.models.generate_images(
    model="imagen-4.0-generate-preview",
    prompt=garden_prompt,
    config=image_settings,
)

# Write every returned image to its own numbered PNG and report each file.
for i, image in enumerate(response.generated_images):
    with open(f"generated_garden_{i}.png", "wb") as f:
        f.write(image.image.image_bytes)
    print(f"Saved: generated_garden_{i}.png")
2.2 Editing Operations
Imagen supports mask-based editing for inpainting (filling masked regions) and outpainting (extending beyond image boundaries):
from google import genai
from google.genai import types
client = genai.Client()

# Image editing with a mask (inpainting).
# Both inputs are read through context managers so the file handles are
# closed promptly instead of being left open by a bare open(...).read().
with open("room_photo.png", "rb") as photo_file:
    base_image = photo_file.read()
with open("room_mask.png", "rb") as mask_file:
    mask_image = mask_file.read()  # White = area to edit

# NOTE(review): confirm this call shape (generate_images with edit_config,
# image and mask) against the installed google-genai version; the SDK appears
# to expose mask-based editing through a separate edit_image method — TODO verify.
response = client.models.generate_images(
    model="imagen-4.0-generate-preview",
    prompt="Replace the old sofa with a modern minimalist grey sectional couch",
    config=types.GenerateImagesConfig(
        number_of_images=1,
        edit_config=types.EditConfig(
            edit_mode="EDIT_MODE_INPAINT_INSERTION"
        ),
    ),
    image=types.Image(image_bytes=base_image),
    mask=types.Image(image_bytes=mask_image),
)

# Only one image is requested above, so a single fixed output name is used.
for image in response.generated_images:
    with open("room_edited.png", "wb") as f:
        f.write(image.image.image_bytes)
    print("Edited image saved.")
Note: Check the blocked_reason field when generation is refused.
Real Estate Listing Automation
A property platform uses Gemini’s multimodal capabilities to auto-generate listings: agents upload photos of properties, Gemini describes rooms, identifies features (hardwood floors, natural light, updated kitchen), estimates square footage from images, and writes compelling listing descriptions — reducing listing time from 2 hours to 10 minutes.
3. Video Processing
Gemini can process video files up to 1 hour in length, understanding visual events, reading on-screen text, describing actions, and answering temporal questions (“What happens at 2:30?”). Videos are tokenized at 263 tokens/second.
3.1 Upload & Processing
Videos must be uploaded via the Files API and reach ACTIVE state before use:
from google import genai
import time
client = genai.Client()

# Send the video to the Files API and report the name it was stored under.
print("Uploading video...")
video_file = client.files.upload(file="product_demo.mp4")
print(f"Upload complete: {video_file.name}")

# Re-fetch the file record, pausing 5 seconds each round, for as long as it
# still reports the PROCESSING state.
while video_file.state == "PROCESSING":
    print(" Processing...")
    time.sleep(5)
    video_file = client.files.get(name=video_file.name)

# Any final state other than ACTIVE is reported as an error.
if video_file.state != "ACTIVE":
    print(f"Error: Video state is {video_file.state}")
else:
    print(f"Video ready! URI: {video_file.uri}")
3.2 Temporal Analysis
Once active, query the video with timestamp-aware questions:
from google import genai
from google.genai import types
client = genai.Client()

# Assume video_file is already uploaded and ACTIVE
video_file = client.files.get(name="files/abc123")

# The question sent alongside the video.
analysis_prompt = """Analyze this product demo video:
1. List each feature demonstrated with its timestamp
2. Identify any UI bugs or glitches visible
3. Rate the overall demo quality (1-10) with reasoning"""

response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=[
        types.Content(role="user", parts=[
            types.Part(file_data=types.FileData(
                file_uri=video_file.uri,
                mime_type="video/mp4",
            )),
            types.Part(text=analysis_prompt),
        ])
    ],
)
print(response.text)

# Report the prompt token count returned with the response instead of
# deriving it from file metadata. The figure also covers the short text
# prompt, which is why it stays marked as approximate.
# NOTE(review): the previous estimate read
# video_file.video_metadata.video_duration.seconds; video_metadata may be a
# plain dict on the File model, in which case that attribute access would
# raise — confirm against the SDK.
usage = response.usage_metadata
if usage is not None:
    print(f"\nVideo tokens used: ~{usage.prompt_token_count}")
4. Document & PDF Analysis
Gemini treats PDFs as multimodal input — processing both the text content and the visual layout (charts, diagrams, tables) on each page. This makes it superior to text-only extraction tools.
4.1 PDF as Multimodal Input
from google import genai
from google.genai import types
import time

client = genai.Client()

# Upload a PDF document
pdf_file = client.files.upload(file="quarterly_report.pdf")

# Wait for processing: re-fetch the file record every 2 seconds while it
# still reports PROCESSING.
while pdf_file.state == "PROCESSING":
    time.sleep(2)
    pdf_file = client.files.get(name=pdf_file.name)

# The loop above also ends when the file leaves PROCESSING for a state other
# than ACTIVE; stop here rather than send a request that references it.
if pdf_file.state != "ACTIVE":
    raise RuntimeError(f"PDF is not ready for use: state is {pdf_file.state}")

# Query the document
response = client.models.generate_content(
    model="gemini-3.5-flash",
    contents=[
        types.Content(role="user", parts=[
            types.Part(file_data=types.FileData(
                file_uri=pdf_file.uri,
                mime_type="application/pdf",
            )),
            types.Part(text="Summarize the key financial metrics from this quarterly report. Include revenue, profit margin, and YoY growth."),
        ])
    ],
)
print(response.text)
4.2 Structured Extraction from Documents
Combine PDF input with structured outputs for reliable data extraction:
from google import genai
from google.genai import types
import json

client = genai.Client()

# Assume pdf_file is already uploaded
pdf_file = client.files.get(name="files/def456")

# Schema for one action item: the item text and its priority are required,
# the page number is optional.
action_item_schema = {
    "type": "object",
    "properties": {
        "item": {"type": "string"},
        "priority": {"type": "string", "enum": ["high", "medium", "low"]},
        "page": {"type": "integer"},
    },
    "required": ["item", "priority"],
}

# Schema for the whole reply; only the title and the key findings are required.
extraction_schema = {
    "type": "object",
    "properties": {
        "document_title": {"type": "string"},
        "total_pages": {"type": "integer"},
        "key_findings": {
            "type": "array",
            "items": {"type": "string"},
            "maxItems": 5,
        },
        "tables_found": {"type": "integer"},
        "charts_found": {"type": "integer"},
        "action_items": {
            "type": "array",
            "items": action_item_schema,
        },
    },
    "required": ["document_title", "key_findings"],
}

# The user turn pairs the stored PDF with the extraction instruction.
document_part = types.Part(
    file_data=types.FileData(file_uri=pdf_file.uri, mime_type="application/pdf")
)
instruction_part = types.Part(
    text="Extract structured information from this document including key findings, tables/charts count, and action items."
)

response = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=extraction_schema,
    ),
    contents=[types.Content(role="user", parts=[document_part, instruction_part])],
)

# Parse the JSON reply, then print the title and each key finding.
extraction = json.loads(response.text)
findings = extraction['key_findings']
print(f"Document: {extraction['document_title']}")
print(f"Key findings ({len(findings)}):")
for finding in findings:
    print(f" • {finding}")
5. The Files API
The Files API is your central hub for managing media assets. Files are stored server-side and can be referenced in multiple generation requests without re-uploading. Files automatically expire after 48 hours.
5.1 Upload, List, Get, Delete
from google import genai
client = genai.Client()

# UPLOAD: store a file server-side and report its name and size.
uploaded = client.files.upload(file="presentation.pdf")
print(f"Uploaded: {uploaded.name} ({uploaded.size_bytes} bytes)")

# LIST: walk every file the client returns and print one summary line each.
print("\nAll uploaded files:")
for entry in client.files.list():
    print(f" {entry.name} | {entry.mime_type} | {entry.state}")

# GET: fetch the metadata record for the file uploaded above.
details = client.files.get(name=uploaded.name)
print(f"\nFile details: {details.display_name}")
print(f" State: {details.state}")
print(f" URI: {details.uri}")
print(f" Expiry: {details.expiration_time}")

# DELETE: remove the file now instead of waiting for it to expire.
client.files.delete(name=uploaded.name)
print(f"\nDeleted: {uploaded.name}")
5.2 File State Lifecycle
Files transition through states after upload:
| State | Meaning | Can Use in Generation? |
|---|---|---|
| PROCESSING | Server is analyzing/transcoding the file | No — must wait |
| ACTIVE | Ready for use in generation requests | Yes |
| FAILED | Processing error (unsupported format, corruption) | No — re-upload |
Supported MIME types include: image/png, image/jpeg, image/webp, image/gif, video/mp4, video/webm, audio/mp3, audio/wav, audio/ogg, application/pdf, text/plain, and more.
5.3 Media Resolution Settings
Control the resolution at which images are processed to balance cost and detail:
from google import genai
from google.genai import types
client = genai.Client()

# Read the photo inside a context manager so the file handle is closed as
# soon as the bytes are in memory (the bare open(...).read() left it open).
with open("high_res_photo.jpg", "rb") as photo_file:
    image_bytes = photo_file.read()

# Low resolution — fewer tokens, faster, cheaper
response_low = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        media_resolution="MEDIA_RESOLUTION_LOW"
    ),
    contents=[
        types.Content(role="user", parts=[
            types.Part(inline_data=types.Blob(mime_type="image/jpeg", data=image_bytes)),
            types.Part(text="What is the main subject of this photo?"),
        ])
    ],
)

# High resolution — more tokens, better detail recognition
response_high = client.models.generate_content(
    model="gemini-3.5-flash",
    config=types.GenerateContentConfig(
        media_resolution="MEDIA_RESOLUTION_HIGH"
    ),
    contents=[
        types.Content(role="user", parts=[
            types.Part(inline_data=types.Blob(mime_type="image/jpeg", data=image_bytes)),
            types.Part(text="Read all text visible in this image, including small labels."),
        ])
    ],
)

# Show only the first 100 characters of each reply for a side-by-side look.
print(f"Low-res response: {response_low.text[:100]}...")
print(f"High-res response: {response_high.text[:100]}...")
Use MEDIA_RESOLUTION_LOW for classification, general description, and scene understanding. Use MEDIA_RESOLUTION_HIGH for OCR, fine-grained object detection, and reading small text. The token cost difference can be 4× or more.
Next in the Gemini SDK Track
In Part 4: Speech, Audio & Generative Media, we’ll explore speech generation with expressive TTS tags, audio understanding, Veo video synthesis, Imagen image generation, and Lyria music creation.