How to build a RAG system with LangChain
Direct answer
Use LangChain to combine vector-based document retrieval with an LLM like
gpt-4o by embedding documents, storing them in a vector store, and querying with context-augmented prompts for RAG.

Setup
Install
pip install langchain-openai langchain-community langchain-core faiss-cpu

Env vars
OPENAI_API_KEY Imports
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate

Examples
Input: 'What is LangChain used for?'
Output: 'LangChain is a framework to build applications with LLMs, including RAG systems that combine retrieval and generation.'
Input: 'Explain RAG with LangChain'
Output: 'RAG uses a vector store to find relevant documents, then an LLM generates answers based on those documents.'
Input: 'How to embed documents?'
Output: 'Use OpenAIEmbeddings to convert text into vectors and store them in FAISS for fast similarity search.'
Integration steps
- Install LangChain, FAISS, and set your OPENAI_API_KEY in environment variables.
- Load and split your documents into chunks using a document loader.
- Embed document chunks with OpenAIEmbeddings and store them in a FAISS vector store.
- Initialize a ChatOpenAI model for generation.
- Create a retrieval-augmented prompt template that injects retrieved documents.
- Query the vector store with user input, retrieve relevant docs, and generate answers with context.
Full code
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate
# Load documents from disk (a single text file in this example).
loader = TextLoader("./docs/sample.txt")
docs = loader.load()
# NOTE(review): the loaded documents are embedded whole — there is no
# chunking/splitting step here; for larger files, split into chunks
# first so retrieval returns focused passages (see "Common mistake" below).
# Initialize embeddings and vector store (FAISS holds the embedded docs in memory)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)
# Initialize LLM; temperature=0 keeps answers deterministic
llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Define prompt template with two placeholders: {context} gets the
# retrieved documents, {question} gets the user's query
prompt_template = ChatPromptTemplate.from_template(
"""
Use the following context to answer the question.
Context: {context}
Question: {question}
Answer:
"""
)
# Function to perform RAG query
def rag_query(query: str) -> str:
    """Answer *query* with retrieval-augmented generation.

    Retrieves the most similar document chunks from the FAISS vector
    store, injects them into the prompt as context, and asks the LLM
    to answer grounded in that context.

    Args:
        query: The user's natural-language question.

    Returns:
        The model's answer as a plain string.
    """
    # Retrieve the 3 most relevant document chunks.
    docs = vectorstore.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    # Fill the template with the retrieved context and the question.
    prompt = prompt_template.format_prompt(context=context, question=query)
    # `invoke` is the current entry point (calling the model directly as
    # `llm(messages)` is deprecated) and returns an AIMessage.
    response = llm.invoke(prompt.to_messages())
    # ChatOpenAI returns a LangChain AIMessage, not a raw OpenAI SDK
    # response: the text lives on `.content` (there is no `.choices`).
    return response.content
# Example usage
if __name__ == "__main__":
question = "What is LangChain?"
answer = rag_query(question)
print(f"Q: {question}\nA: {answer}")

Output
Q: What is LangChain?
A: LangChain is a framework that helps developers build applications with large language models by combining document retrieval and generation to provide accurate, context-aware answers.
API trace
Request
{"model": "gpt-4o", "messages": [{"role": "user", "content": "Use the following context to answer the question. Context: ... Question: What is LangChain? Answer:"}]}

Response
{"choices": [{"message": {"content": "LangChain is a framework that helps developers build applications with large language models..."}}], "usage": {"total_tokens": 150}}

Extract
response.choices[0].message.content (raw OpenAI API shape; LangChain's ChatOpenAI exposes the same text as response.content)

Variants
Streaming RAG with LangChain ›
Use streaming when you want to display partial answers in real-time for better user experience.
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate
# Load and embed the documents, same as the non-streaming example.
loader = TextLoader("./docs/sample.txt")
docs = loader.load()
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)
# streaming=True enables token-by-token output via llm.stream(...)
llm = ChatOpenAI(model="gpt-4o", temperature=0, streaming=True)
# Same context-injection template as the standard RAG example.
prompt_template = ChatPromptTemplate.from_template(
"""
Use the following context to answer the question.
Context: {context}
Question: {question}
Answer:
"""
)
def rag_query_stream(query: str) -> None:
    """Stream a RAG answer for *query*, printing tokens as they arrive.

    Same retrieval flow as the non-streaming version, but iterates the
    model's token stream so partial output is displayed in real time.

    Args:
        query: The user's natural-language question.
    """
    docs = vectorstore.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    prompt = prompt_template.format_prompt(context=context, question=query)
    # llm.stream yields AIMessageChunk objects; the token text lives on
    # `.content`. (`.choices[0].delta` is the raw OpenAI SDK shape and
    # does not exist on LangChain chunks.) flush=True makes each token
    # appear immediately instead of waiting for a newline.
    for chunk in llm.stream(prompt.to_messages()):
        print(chunk.content, end="", flush=True)
if __name__ == "__main__":
rag_query_stream("Explain RAG with LangChain.")

Async RAG Query ›
Use async when integrating RAG queries into asynchronous web servers or concurrent workflows.
import os
import asyncio
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate
# Load and embed the documents, same as the synchronous example.
loader = TextLoader("./docs/sample.txt")
docs = loader.load()
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)
# temperature=0 keeps answers deterministic.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Same context-injection template as the standard RAG example.
prompt_template = ChatPromptTemplate.from_template(
"""
Use the following context to answer the question.
Context: {context}
Question: {question}
Answer:
"""
)
async def rag_query_async(query: str) -> str:
    """Asynchronously answer *query* with retrieval-augmented generation.

    Args:
        query: The user's natural-language question.

    Returns:
        The model's answer as a plain string.
    """
    # NOTE(review): similarity_search is a blocking call; inside a busy
    # event loop prefer `await vectorstore.asimilarity_search(query, k=3)`.
    docs = vectorstore.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    prompt = prompt_template.format_prompt(context=context, question=query)
    # `ainvoke` is the async entry point; `acall` does not exist on
    # ChatOpenAI. It returns an AIMessage whose text is `.content`
    # (not the raw OpenAI `.choices[0].message.content` shape).
    response = await llm.ainvoke(prompt.to_messages())
    return response.content
async def main() -> None:
    """Run a single example RAG query and print the result."""
    result = await rag_query_async("What is LangChain?")
    print(f"Answer: {result}")
if __name__ == "__main__":
asyncio.run(main())

Use Claude 3.5 Sonnet for RAG ›
Use Claude 3.5 Sonnet for higher coding accuracy or when preferring Anthropic's API.
import os
import anthropic
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate
# Anthropic client; reads the API key from the environment explicitly.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
loader = TextLoader("./docs/sample.txt")
docs = loader.load()
# Embeddings and vector store setup omitted for brevity
# NOTE(review): `vectorstore` is never built in this snippet, so
# rag_query_claude below will fail with a NameError unless the FAISS
# setup from the earlier examples is included — confirm before running.
prompt_template = ChatPromptTemplate.from_template(
"""
Use the following context to answer the question.
Context: {context}
Question: {question}
Answer:
"""
)
def rag_query_claude(query: str) -> str:
    """Answer *query* via RAG, generating with Claude 3.5 Sonnet.

    Retrieval still goes through the FAISS vector store (assumed to be
    defined at module level); only the generation step uses Anthropic.

    Args:
        query: The user's natural-language question.

    Returns:
        Claude's answer as a plain string.
    """
    # Gather the top-3 most similar chunks and join them into one context blob.
    hits = vectorstore.similarity_search(query, k=3)
    context = "\n".join(hit.page_content for hit in hits)
    rendered = prompt_template.format_prompt(context=context, question=query)
    # Anthropic's Messages API takes the system prompt as a separate
    # parameter rather than as a message in the list.
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        system="You are a helpful assistant.",
        messages=[{"role": "user", "content": rendered.to_string()}],
    )
    # Claude responses carry a list of content blocks; take the first text block.
    return response.content[0].text
if __name__ == "__main__":
print(rag_query_claude("Explain RAG with LangChain."))

Performance
Latency: ~800ms for gpt-4o non-streaming RAG query
Cost: ~$0.002 per 500 tokens exchanged with gpt-4o
Rate limits: Tier 1: 500 RPM / 30K TPM for OpenAI API
- Chunk documents into 500-token pieces to balance retrieval relevance and token cost.
- Limit retrieved documents to top 3-5 to reduce prompt size.
- Use temperature=0 for deterministic answers and fewer tokens.
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard RAG with gpt-4o | ~800ms | ~$0.002 | Balanced accuracy and cost |
| Streaming RAG | ~900ms | ~$0.002 | Real-time user interaction |
| Async RAG | ~800ms | ~$0.002 | Concurrent workflows |
| Claude 3.5 Sonnet RAG | ~700ms | ~$0.0025 | Best coding and reasoning accuracy |
Quick tip
Always embed and index your documents before querying to ensure fast and relevant retrieval in RAG.
Common mistake
Beginners often forget to chunk large documents before embedding, leading to poor retrieval quality.