How to build a RAG endpoint with FastAPI
Direct answer
Use FastAPI to create an HTTP endpoint that accepts user queries, retrieves relevant documents via a vector store, and calls an LLM like gpt-4o to generate answers augmented by retrieved context.
Setup
Install
pip install fastapi uvicorn openai langchain langchain-openai langchain-community faiss-cpu
Env vars
OPENAI_API_KEY
Imports
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
Examples
Input: {"query": "What is the capital of France?"}
Output: The capital of France is Paris.
Input: {"query": "Explain the theory of relativity."}
Output: The theory of relativity, developed by Albert Einstein, includes special and general relativity, describing the laws of physics in different frames of reference and gravity as curvature of spacetime.
Input: {"query": "Who wrote 'Pride and Prejudice'?"}
Output: 'Pride and Prejudice' was written by Jane Austen.
Integration steps
- Initialize FastAPI app and define request/response models.
- Load or build a vector store (e.g., FAISS) with document embeddings using OpenAIEmbeddings.
- Receive user query via POST endpoint and embed it.
- Query the vector store for top relevant documents.
- Construct a prompt combining retrieved documents and user query.
- Call OpenAI's gpt-4o chat completion endpoint with the prompt.
- Return the generated answer as the API response.
Full code
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
# Initialize FastAPI app
app = FastAPI()
# Initialize OpenAI client; reading os.environ["OPENAI_API_KEY"] raises
# KeyError at import time if the variable is unset (fail-fast on config).
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Define request model
class QueryRequest(BaseModel):
    """Request body for the /rag endpoint."""
    query: str  # the user's natural-language question
# Build a small in-memory FAISS index from demo documents.
# In production, load a persisted index instead of re-embedding at startup.
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
# Example corpus; each entry's "text" is what gets embedded and retrieved.
documents = [
    {"id": "1", "text": "Paris is the capital of France."},
    {"id": "2", "text": "Albert Einstein developed the theory of relativity."},
    {"id": "3", "text": "Jane Austen wrote Pride and Prejudice."}
]
# FAISS.from_texts embeds every text with OpenAIEmbeddings and indexes it;
# the "id" fields are illustrative only and are not stored in the index.
texts = [doc["text"] for doc in documents]
vector_store = FAISS.from_texts(texts, embeddings)
# Prompt template combining retrieved docs and the user query.
# {context} and {question} are filled in per request by the /rag endpoint.
prompt_template = ChatPromptTemplate.from_template(
    """
You are a helpful assistant. Use the following context to answer the question.
Context:
{context}
Question:
{question}
Answer:"""
)
@app.post("/rag")
async def rag_endpoint(request: QueryRequest):
    """Answer a query with retrieval-augmented generation.

    Retrieves the 3 most similar documents from the FAISS store, fills the
    prompt template with them plus the query, and asks gpt-4o for an answer.

    Returns:
        {"answer": <model-generated text>}
    Raises:
        HTTPException(500) if the OpenAI call fails.
    """
    query = request.query
    # Embed the query and retrieve the top 3 most similar documents.
    docs = vector_store.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    # format_prompt(...).to_messages() yields LangChain message objects; the
    # OpenAI v1 SDK expects plain {"role": ..., "content": ...} dicts, so
    # convert them (LangChain's "human"/"ai" types map to "user"/"assistant").
    role_map = {"human": "user", "ai": "assistant", "system": "system"}
    lc_messages = prompt_template.format_prompt(context=context, question=query).to_messages()
    messages = [
        {"role": role_map.get(m.type, "user"), "content": m.content}
        for m in lc_messages
    ]
    # Call OpenAI chat completion.
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
        )
        answer = response.choices[0].message.content
    except Exception as e:
        # Surface the provider error to the caller as a 500, keeping the cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"answer": answer}
# To run: uvicorn this_file_name:app --reload
Output
{"answer": "Paris is the capital of France."}
API trace
Request
{"model": "gpt-4o", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Context: Paris is the capital of France.\nQuestion: What is the capital of France?\nAnswer:"}]}
Response
{"choices": [{"message": {"content": "Paris is the capital of France."}}], "usage": {"total_tokens": 50}}
Extract
response.choices[0].message.content
Variants
Streaming response ›
Use streaming to provide partial answers in real-time for better user experience on long responses.
import os
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from openai import OpenAI
import asyncio
app = FastAPI()
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
@app.post("/rag-stream")
async def rag_stream(request: Request):
data = await request.json()
query = data.get("query", "")
messages = [
{"role": "user", "content": query}
]
def event_stream():
response = client.chat.completions.create(
model="gpt-4o",
messages=messages,
stream=True
)
for chunk in response:
yield chunk.choices[0].delta.get("content", "")
return StreamingResponse(event_stream(), media_type="text/event-stream") Async FastAPI endpoint ›
Use async endpoints to handle multiple concurrent requests efficiently.
import os
from fastapi import FastAPI
from pydantic import BaseModel
from openai import OpenAI
app = FastAPI()
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
class QueryRequest(BaseModel):
query: str
@app.post("/rag-async")
async def rag_async(request: QueryRequest):
response = await client.chat.completions.acreate(
model="gpt-4o",
messages=[{"role": "user", "content": request.query}]
)
return {"answer": response.choices[0].message.content} Use Anthropic Claude model ›
Use Anthropic Claude models for better coding and reasoning tasks or if you prefer Claude's style.
import os
import anthropic
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

class QueryRequest(BaseModel):
    """Request body for the /rag-claude endpoint."""
    query: str

@app.post("/rag-claude")
async def rag_claude(request: QueryRequest):
    """Answer a query via Anthropic's Messages API."""
    system_prompt = "You are a helpful assistant."
    messages = [{"role": "user", "content": request.query}]
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        system=system_prompt,
        messages=messages
    )
    # response.content is a LIST of content blocks, not a string; returning
    # it raw serializes the whole block objects. Extract the text block.
    return {"answer": response.content[0].text}
Performance
Latency: ~800ms for gpt-4o non-streaming calls
Cost: ~$0.002 per 500 tokens exchanged with gpt-4o
Rate limits: Tier 1: 500 RPM / 30K TPM for OpenAI API
- Limit retrieved documents to top 3-5 to reduce prompt size.
- Use concise prompt templates to save tokens.
- Cache embeddings and reuse vector store to avoid recomputing.
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard RAG with FastAPI + gpt-4o | ~800ms | ~$0.002 | General purpose Q&A with context |
| Streaming response | Starts within 300ms, streams over time | ~$0.002 | Long answers with better UX |
| Async FastAPI endpoint | ~800ms | ~$0.002 | High concurrency environments |
| Anthropic Claude model | ~900ms | ~$0.0018 | Better coding and reasoning tasks |
Quick tip
Pre-embed your document corpus and use a vector store like FAISS to efficiently retrieve relevant context for RAG queries.
Common mistake
Not combining retrieved documents properly in the prompt, leading to irrelevant or hallucinated answers.