LlamaIndex Cheat Sheet — RAG, Indexing & Retrieval
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
Turn documents into queryable knowledge via embeddings and semantic search.
Like building a searchable library: documents are books, nodes are chapters, embeddings are catalog cards organized by meaning, and queries are librarians finding the most relevant chapters for your question.
Core Patterns
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
import os
# Load and parse
documents = SimpleDirectoryReader("./data").load_data()
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)
# Create index
index = VectorStoreIndex(nodes)
# Query
query_engine = index.as_query_engine()
response = query_engine.query("What is the main topic?")
print(response)
response.response # LLM-generated answer
response.source_nodes # Retrieved source chunks
from llama_index.core import VectorStoreIndex, Settings
from llama_index.llms.anthropic import Anthropic
from llama_index.embeddings.openai import OpenAIEmbedding
import os
# Set global LLM
llm = Anthropic(
model="claude-3-5-sonnet-20241022",
api_key=os.environ["ANTHROPIC_API_KEY"]
)
Settings.llm = llm
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Now all index operations use Anthropic
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine()
response = query_engine.query("Question here")
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext
from pinecone import Pinecone
import os
# Connect to Pinecone
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("my-index")
vector_store = PineconeVectorStore(pinecone_index=index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Create index
documents = SimpleDirectoryReader("./data").load_data()
vdb_index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context
)
# Query
query_engine = vdb_index.as_query_engine()
response = query_engine.query("Query text")
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
index = VectorStoreIndex(nodes)
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
results = retriever.retrieve("search query")
for node in results:
    print(node.get_content())
    print(f"Score: {node.score}")
Returns a list of NodeWithScore objects, each exposing .get_content() and .score.
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever  # requires: pip install llama-index-retrievers-bm25
vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
bm25_retriever = BM25Retriever.from_nodes(nodes, similarity_top_k=3)
hybrid_retriever = QueryFusionRetriever(
retrievers=[vector_retriever, bm25_retriever],
similarity_top_k=5
)
results = hybrid_retriever.retrieve("query")
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import load_index_from_storage
# Load from default storage
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()
response = query_engine.query("Question")
Key API Methods
| Method / Property | Description | Returns |
|---|---|---|
VectorStoreIndex(nodes, ...) | Create index from Node list. Auto-embeds if Settings.embed_model set. | VectorStoreIndex instance |
VectorStoreIndex.from_documents(documents, ...) | Create index directly from Document list (auto-parses into nodes). | VectorStoreIndex instance |
index.as_query_engine(similarity_top_k=2, ...) | Create QueryEngine for generation. Retrieves top_k nodes, passes to LLM. | QueryEngine instance |
query_engine.query(str_or_QueryBundle) | Execute query, retrieve + generate. Returns Response with .response and .source_nodes. | Response object |
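SentenceSplitter(chunk_size=512, chunk_overlap=20) | Build a splitter that parses documents into overlapping text chunks (nodes) via .get_nodes_from_documents(documents). | SentenceSplitter instance (the method returns a list of nodes) |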
SimpleDirectoryReader(input_dir).load_data() | Batch load text files from directory. Supports .txt, .md, .pdf (with extra loaders). | List[Document] |
Settings.llm = ... | Set global LLM for all indexes. Affects all new indexes in current process. | None (side effect) |
index.storage_context.persist(persist_dir='./storage') | Save index metadata and embeddings to disk for reload. | None |
Common Parameters
VectorStoreIndex, QueryEngine, SentenceSplitter
| Parameter | Default | Effect |
|---|---|---|
similarity_top_k | 2 | Number of context nodes passed to LLM during query |
chunk_size | 1024 | SentenceSplitter max text per node (tokens approx) |
chunk_overlap | 200 | SentenceSplitter overlap between consecutive chunks (tokens approx) |
response_mode | 'compact' | QueryEngine synthesis: 'compact', 'refine', 'tree_summarize', 'simple_summarize' |
streaming | False | QueryEngine returns token-by-token stream instead of full response |
Common Errors & Fixes
ImportError: cannot import name 'VectorStoreIndex' from 'llama_index'
Cause: Using a pre-0.10 llama-index package. The correct import path is llama_index.core (v0.10+).
Uninstall old package: pip uninstall llama-index
Install new core package: pip install llama-index-core
Update imports: from llama_index.core import VectorStoreIndex
ValueError: Embedding dimension mismatch: 1536 vs 384
Cause: The vector store was created with one embedding model (e.g., text-embedding-3-small = 1536 dims), but you're querying or inserting with a different model (e.g., sentence-transformers = 384 dims).
Ensure consistent embedding model:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Now create index: all nodes will use same 1536-dim embeddings
RuntimeError: No LLM provided. Set Settings.llm
Cause: Tried to create a QueryEngine without configuring a default LLM.
Set global LLM before querying:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
Settings.llm = OpenAI(model="gpt-4o")
query_engine = index.as_query_engine()
response = query_engine.query("question")
TypeError: list indices must be integers or slices, not str (in node_parser)
Cause: SentenceSplitter received raw text instead of Document objects.
Wrap text in Document:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
text = "Your long text here"
doc = Document(text=text)
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents([doc])
AttributeError: 'list' object has no attribute 'embedding' (when loading from Pinecone)
Cause: Pinecone metadata contains non-text fields that confuse node reconstruction.
Ensure Pinecone index metadata only stores text fields:
vector_store = PineconeVectorStore(
pinecone_index=index,
namespace="my-namespace"
)
# Recreate index with clean metadata
index = VectorStoreIndex.from_documents(
documents,
storage_context=StorageContext.from_defaults(vector_store=vector_store)
)
Production Gotchas
Changing Settings.llm or Settings.embed_model affects ALL subsequent index operations in that Python process. If you have multiple pipelines, isolate them or reset Settings explicitly between them. This is a source of silent bugs when switching between models.
By default, node IDs are random UUIDs generated each time documents are parsed; they are not derived from content. Re-ingesting the same documents therefore adds duplicate nodes to the index instead of overwriting the existing ones. Set an explicit, stable ID on each document or node (e.g. SimpleDirectoryReader(..., filename_as_id=True)) and use metadata to disambiguate if merging multiple sources.
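SimpleDirectoryReader reads plain-text files directly and parses richer formats such as .pdf and .docx through the readers in llama-index-readers-file. That package ships with the llama-index bundle but must be installed separately (pip install llama-index-readers-file) when only llama-index-core is installed; no custom loader is needed for PDFs. Files that fail to parse are skipped with only a printed message by default instead of failing loudly; pass raise_on_error=True to surface the error.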
QueryEngine retrieves top-k nodes, but the LLM can ignore them and generate false information. Always inspect response.source_nodes to validate the LLM actually used your retrieved context. Use response_mode='tree_summarize' for more grounded responses when context is large.
Building a VectorStoreIndex does NOT automatically save to disk. Call index.storage_context.persist(persist_dir='./storage') explicitly, or you lose all embeddings when the process exits. Pinecone and other remote stores auto-persist, but local storage doesn't.
Every chunk (node) is embedded separately. A 1000-page document with chunk_size=512 generates ~2000 embeddings (roughly 1M tokens). Using OpenAI text-embedding-3-small at $0.02 per million tokens, that is ~$0.02 per document. Batch embedding calls and monitor token usage in production.
End-to-end example: load docs, create index, query, inspect sources, and persist the index.
# End-to-end RAG pipeline: load files, chunk them, embed into an in-memory
# vector index, run one query, show the supporting chunks, and persist.
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Configure global settings.
# These are process-wide: every index/query engine built afterwards uses them.
Settings.llm = OpenAI(model="gpt-4o", api_key=os.environ["OPENAI_API_KEY"])
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Step 1: Load documents
documents = SimpleDirectoryReader("./data").load_data()
print(f"Loaded {len(documents)} documents")

# Step 2: Parse into nodes
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes")

# Step 3: Create index (each node is embedded with Settings.embed_model)
index = VectorStoreIndex(nodes)
print("Index created")

# Step 4: Query with details
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query("What is the main finding?")

# Step 5: Inspect results
print(f"\nResponse: {response.response}")
print("\nSources:")
for node in response.source_nodes:
    # Show only the first 100 characters of each retrieved chunk.
    print(f" - {node.get_content()[:100]}... (score: {node.score:.2f})")

# Step 6: Save for later (the in-memory index is lost on exit otherwise)
index.storage_context.persist(persist_dir="./storage")
print("\nIndex persisted")