1. Embeddings API
Embeddings turn text into vectors so your application can compare meaning numerically instead of relying on exact word overlap. Start by understanding the shape and cost of a single embedding, then move to batch generation and similarity search.
from openai import OpenAI
client = OpenAI()
# Request an embedding for one input string.
response = client.embeddings.create(
    input="OpenAI embeddings capture semantic meaning of text.",
    model="text-embedding-3-small",  # 1536 dims, cheaper
)

# Only one input was sent, so the vector is read from the first `data` entry.
embedding = response.data[0].embedding

# Report the vector size, a short preview, and the tokens billed for the call.
print(f"Dimensions: {len(embedding)}")
print(f"First 5 values: {embedding[:5]}")
print(f"Usage: {response.usage.total_tokens} tokens")
The next snippet makes the key retrieval idea concrete: embed the documents, embed the query with the same model so both live in the same vector space, compare the query vector against each document vector with cosine similarity, and rank the nearest matches. That same pattern is the conceptual core of most semantic search systems.
from openai import OpenAI
import numpy as np
client = OpenAI()
# Batch embeddings with cosine similarity search.
# The query and the documents must be embedded with the same model so that
# their vectors are comparable.
EMBEDDING_MODEL = "text-embedding-3-small"

# Small in-memory corpus; each string is embedded as one document.
documents = [
    "Python is a programming language used for web development and data science.",
    "JavaScript runs in browsers and on servers with Node.js.",
    "Machine learning models learn patterns from training data.",
    "Docker containers package applications with their dependencies.",
    "SQL databases store data in structured tables with relationships.",
]

# A single request embeds the whole list of documents.
response = client.embeddings.create(input=documents, model=EMBEDDING_MODEL)
doc_embeddings = [entry.embedding for entry in response.data]

# A second request embeds the query on its own.
query = "How do neural networks learn?"
query_response = client.embeddings.create(input=query, model=EMBEDDING_MODEL)
query_embedding = query_response.data[0].embedding
# Cosine similarity search
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector (a sequence of numbers or a NumPy array).
        b: Second vector.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, as a float. Returns 0.0 when either vector has zero
        norm: the similarity is undefined there, and dividing would produce
        NaN, which poisons any later sort over the scores.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
# Pair each document position with its similarity to the query, best first.
scores = sorted(
    (
        (position, cosine_similarity(query_embedding, vector))
        for position, vector in enumerate(doc_embeddings)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the query followed by the three highest-scoring documents.
print(f"Query: '{query}'\n")
for idx, score in scores[:3]:
    print(f" [{score:.4f}] {documents[idx]}")
2. Vector Stores
Managed vector stores reduce the amount of infrastructure you have to own yourself. Instead of building chunking, indexing, and retrieval orchestration from scratch, you can upload files and let the platform manage the retrieval substrate.
from openai import OpenAI
client = OpenAI()
# Create a named vector store and report its ID.
vector_store = client.vector_stores.create(name="Product Documentation")
print(f"Vector store created: {vector_store.id}")

# Upload the PDF through the Files API; the `with` block closes the local
# file handle once the upload call has returned.
with open("docs/api-reference.pdf", "rb") as pdf_handle:
    file = client.files.create(purpose="assistants", file=pdf_handle)

# Attach the uploaded file to the vector store.
client.vector_stores.files.create(
    file_id=file.id,
    vector_store_id=vector_store.id,
)
print(f"File attached: {file.id}")

# Look up the attached file's indexing status (a single check, no polling).
vs_file = client.vector_stores.files.retrieve(
    file_id=file.id,
    vector_store_id=vector_store.id,
)
print(f"Status: {vs_file.status}")  # in_progress → completed
Semantic Customer Support
An e-commerce company replaced keyword search in their help center with embeddings. Customer queries like “my package never arrived” now match articles about “delivery tracking” and “lost shipments” even without exact keyword overlap. Result: 45% reduction in support tickets, 3x improvement in first-contact resolution.
3. File Search Tool
File Search connects those managed stores directly to model calls. The model can retrieve relevant chunks as a built-in tool, which means your application code can stay focused on user workflow while still exposing citations and grounded answers.
from openai import OpenAI
client = OpenAI()
# Use File Search as a tool with the Responses API.
file_search_tool = {
    "type": "file_search",
    "vector_store_ids": ["vs_abc123"],  # Your vector store ID
}
response = client.responses.create(
    model="gpt-4.1",
    input="What are the rate limits for the embeddings API?",
    tools=[file_search_tool],
)
print(response.output_text)

# Access citations from file search results: walk output items, then their
# content blocks, then each block's annotations. Items without `content` and
# blocks without `annotations` are skipped.
for item in response.output:
    if not hasattr(item, "content"):
        continue
    for block in item.content:
        if not hasattr(block, "annotations"):
            continue
        for ann in block.annotations:
            print(f" Citation: {ann.filename} (chunk {ann.index})")
4. RAG Patterns
Retrieval-augmented generation is really a composition pattern: retrieve context first, then generate with explicit grounding instructions. The simplified class below shows that composition in its smallest understandable form.
from openai import OpenAI
import numpy as np
client = OpenAI()
class SimpleRAG:
    """RAG pipeline using OpenAI embeddings + in-memory search.

    Documents and their embedding vectors are held in two parallel lists on
    the instance (same index = same document); nothing is persisted.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        """Create an empty index.

        Args:
            model: Embedding model name sent with every embeddings request.
        """
        self.model = model
        self.documents: list[str] = []
        self.embeddings: list[list[float]] = []

    def add_documents(self, docs: list[str]):
        """Embed and store documents.

        Args:
            docs: Texts to add to the index. An empty list is a no-op, so no
                embeddings request is sent for it.
        """
        if not docs:
            return
        response = client.embeddings.create(model=self.model, input=docs)
        # Extend both lists only after the request succeeded so they cannot
        # get out of step with each other.
        self.documents.extend(docs)
        self.embeddings.extend(item.embedding for item in response.data)

    def search(self, query: str, top_k: int = 3) -> list[str]:
        """Find most relevant documents for a query.

        Args:
            query: Text to embed and compare against the stored documents.
            top_k: Maximum number of documents to return.

        Returns:
            Up to ``top_k`` stored documents ordered by descending cosine
            similarity to the query. Returns an empty list, without calling
            the embeddings API, when nothing is indexed or ``top_k`` is not
            positive.
        """
        if not self.documents or top_k <= 0:
            return []

        q_resp = client.embeddings.create(model=self.model, input=query)
        q_emb = q_resp.data[0].embedding
        # The query norm is the same for every document; compute it once.
        q_norm = np.linalg.norm(q_emb)

        scores = [
            (i, np.dot(q_emb, doc_emb) / (q_norm * np.linalg.norm(doc_emb)))
            for i, doc_emb in enumerate(self.embeddings)
        ]
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return [self.documents[i] for i, _ in scores[:top_k]]

    def query(self, question: str) -> str:
        """Full RAG: retrieve context → generate answer.

        Args:
            question: The user question.

        Returns:
            The model's text output, generated from the three best-matching
            documents joined by blank lines plus the question.
        """
        context_docs = self.search(question, top_k=3)
        context = "\n\n".join(context_docs)
        response = client.responses.create(
            model="gpt-4.1-mini",
            instructions="Answer based only on the provided context. If unsure, say so.",
            input=f"Context:\n{context}\n\nQuestion: {question}",
        )
        return response.output_text
# Usage: index a few facts, then ask a question against them.
knowledge_base = [
    "The OpenAI Embeddings API supports text-embedding-3-small (1536 dims) and text-embedding-3-large (3072 dims).",
    "Rate limits for embeddings are 3000 RPM and 1,000,000 TPM on Tier 1.",
    "Vector stores can hold up to 10,000 files with automatic chunking and indexing.",
]

rag = SimpleRAG()
rag.add_documents(knowledge_base)

# `query` retrieves context first and then generates the answer from it.
answer = rag.query("What are the embedding model dimensions?")
print(f"Answer: {answer}")
5. Hybrid Search
Hybrid search is often the practical production answer because neither semantic search nor lexical search is perfect by itself. The combined scorer below keeps the intuition simple: weight dense similarity and keyword relevance, then rank by the merged score.
from openai import OpenAI
import numpy as np
import re
from collections import Counter
client = OpenAI()
def bm25_score(query: str, document: str, avg_dl: float = 50.0, k1: float = 1.5, b: float = 0.75) -> float:
    """Simple BM25 scoring for keyword relevance.

    The query and the document are lowercased and tokenized with the same
    ``\\w+`` pattern, so punctuation attached to a query word (``"cost?"``)
    does not prevent it from matching the document.

    Args:
        query: Search text; every token contributes to the score.
        document: Text being scored.
        avg_dl: Average document length, in tokens, used for length
            normalization.
        k1: Term-frequency saturation parameter.
        b: Strength of the length normalization (0 disables it).

    Returns:
        The sum over query tokens of the BM25 term-frequency component, with
        IDF fixed at 1.0. Tokens absent from the document contribute 0.
    """
    query_terms = re.findall(r"\w+", query.lower())
    if not query_terms:
        return 0.0

    doc_terms = re.findall(r"\w+", document.lower())
    term_freq = Counter(doc_terms)
    # The length-normalization term depends only on the document: hoist it.
    length_norm = k1 * (1 - b + b * len(doc_terms) / avg_dl)
    idf = 1.0  # Simplified — use full IDF in production

    score = 0.0
    for term in query_terms:
        tf = term_freq[term]
        if tf == 0:
            # Contributes nothing; skipping also avoids 0/0 when the
            # normalization term is zero (empty document with b == 1).
            continue
        score += idf * tf * (k1 + 1) / (tf + length_norm)
    return score
def hybrid_search(query: str, documents: list[str], alpha: float = 0.7) -> list[tuple[int, float]]:
    """Hybrid search: alpha * semantic + (1-alpha) * keyword.

    Args:
        query: Search text.
        documents: Candidate documents to rank.
        alpha: Weight of the semantic (cosine) score; the keyword score gets
            ``1 - alpha``.

    Returns:
        ``(document_index, combined_score)`` pairs sorted by descending
        score. Returns an empty list, without calling the embeddings API,
        when ``documents`` is empty.
    """
    if not documents:
        return []

    # Semantic scores: one request embeds the query and all documents. The
    # query is sent first, so the first returned vector is read as the query.
    response = client.embeddings.create(model="text-embedding-3-small", input=[query] + documents)
    vectors = [item.embedding for item in response.data]
    q_emb, doc_embs = vectors[0], vectors[1:len(documents) + 1]
    q_norm = np.linalg.norm(q_emb)
    sem_scores = [np.dot(q_emb, d) / (q_norm * np.linalg.norm(d)) for d in doc_embs]

    # Keyword scores, scaled so the best match is 1.0. When nothing matches
    # the scores are left as they are, which avoids dividing by zero.
    kw_scores = [bm25_score(query, doc) for doc in documents]
    max_kw = max(kw_scores)
    if max_kw > 0:
        kw_scores = [s / max_kw for s in kw_scores]

    # Combine the two signals and rank best-first.
    combined = [
        (i, alpha * sem + (1 - alpha) * kw)
        for i, (sem, kw) in enumerate(zip(sem_scores, kw_scores))
    ]
    combined.sort(key=lambda pair: pair[1], reverse=True)
    return combined
# Demo corpus for the hybrid ranker.
documents = [
    "The text-embedding-3-small model has 1536 dimensions and costs $0.02 per 1M tokens.",
    "Use cosine similarity to compare embedding vectors for semantic search.",
    "BM25 is a keyword-based ranking function used in traditional search engines.",
]

# Rank every document with the default semantic/keyword weighting.
search_query = "embedding dimensions and cost"
results = hybrid_search(search_query, documents)

# Print each combined score next to the first 60 characters of its document.
for idx, score in results:
    print(f" [{score:.4f}] {documents[idx][:60]}...")
Next in the SDK Track
In OA Part 8: Agents SDK, we’ll build intelligent agents with the OpenAI Agents SDK — definitions, handoffs, guardrails, and observability.