How to use GPTCache in Python
Direct answer
Use the gptcache Python library to wrap your LLM calls with caching, which stores and retrieves responses to reduce redundant API usage and lower costs.
Setup
Install
pip install gptcache openai
Env vars
OPENAI_API_KEY
Imports
import os
from openai import OpenAI
from gptcache import cache
from gptcache.adapter import OpenAIAdapter
from gptcache.manager.factory import get_data_manager
from gptcache.processor.pre import get_prompt
from gptcache.processor.post import get_response
Examples
In: What is GPTCache?
Out: GPTCache is a caching framework for LLMs that stores previous responses to avoid repeated API calls.
In: Explain how to integrate GPTCache with OpenAI in Python.
Out: Initialize GPTCache with an OpenAI adapter, then call your LLM through GPTCache to cache and reuse responses automatically.
In: What happens if the same prompt is sent twice?
Out: GPTCache returns the cached response on the second call, avoiding a new API request.
Integration steps
- Install the gptcache and openai Python packages.
- Set your OpenAI API key in the environment variable OPENAI_API_KEY.
- Import GPTCache modules and initialize the cache with an OpenAI adapter.
- Wrap your LLM calls with GPTCache to automatically cache and retrieve responses.
- Send prompts through GPTCache instead of directly calling the OpenAI client.
- Retrieve cached responses instantly on repeated prompts, reducing API usage.
Full code
import os
from openai import OpenAI
from gptcache import cache
from gptcache.adapter import OpenAIAdapter
from gptcache.manager.factory import get_data_manager
from gptcache.processor.pre import get_prompt
from gptcache.processor.post import get_response
# Read the OpenAI API key from the environment (raises KeyError if unset).
api_key = os.environ["OPENAI_API_KEY"]
# Initialize the OpenAI v1 client with the key.
client = OpenAI(api_key=api_key)
# Wrap the client in a GPTCache adapter.
# NOTE(review): `OpenAIAdapter` and `cache.init(adapter=...)` do not match the
# published gptcache API surface — verify against the installed gptcache version.
adapter = OpenAIAdapter(client=client)
# Use an in-memory data manager: cache entries live only for this process.
data_manager = get_data_manager("memory")
# Initialize the global cache: extract the prompt before lookup, and the
# response text after, using gptcache's pre/post processors.
cache.init(
pre_embedding_func=get_prompt,
post_embedding_func=get_response,
data_manager=data_manager,
adapter=adapter
)
# Function to get response via GPTCache
def ask_gptcache(prompt: str, model: str = "gpt-4o") -> str:
    """Return the LLM reply for *prompt*, serving repeated prompts from cache.

    Looks the prompt up in GPTCache first; on a miss it calls the OpenAI
    chat-completions API, stores the answer, and returns it.

    Args:
        prompt: The user message to send to the model.
        model: Chat model name (defaults to "gpt-4o"; new optional parameter).

    Returns:
        The assistant's reply text, cached or freshly generated.
    """
    # NOTE(review): cache.get/cache.set are assumed helpers on the global
    # gptcache `cache` object — confirm against the installed gptcache API.
    response = cache.get(prompt)
    if response is None:
        # Cache miss: call OpenAI directly.
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        response = completion.choices[0].message.content
        # Store so the next identical prompt is served without an API call.
        cache.set(prompt, response)
    return response
# Example usage: the second call with the same prompt should be a cache hit.
if __name__ == "__main__":
    prompt = "Explain GPTCache in Python."
    answer = ask_gptcache(prompt)
    print("Response:", answer)
    # Repeat the identical prompt to demonstrate caching
    answer_cached = ask_gptcache(prompt)
    print("Cached Response:", answer_cached)
Output
Response: GPTCache is a Python caching framework that stores LLM responses to reduce redundant API calls and optimize costs.
Cached Response: GPTCache is a Python caching framework that stores LLM responses to reduce redundant API calls and optimize costs.
API trace
Request
{"model": "gpt-4o", "messages": [{"role": "user", "content": "Explain GPTCache in Python."}]}
Response
{"choices": [{"message": {"content": "GPTCache is a Python caching framework that stores LLM responses..."}}], "usage": {"total_tokens": 50}}
Extract
response.choices[0].message.content
Variants
Streaming GPTCache Integration ›
Use streaming when you want to display partial LLM responses in real-time for better user experience.
import os
from openai import OpenAI
from gptcache import cache
from gptcache.adapter import OpenAIAdapter
from gptcache.manager.factory import get_data_manager
from gptcache.processor.pre import get_prompt
from gptcache.processor.post import get_response
# Streaming example: same GPTCache initialization as the basic example.
api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=api_key)
# NOTE(review): OpenAIAdapter / cache.init(adapter=...) are assumed from the
# surrounding tutorial — verify against the installed gptcache version.
adapter = OpenAIAdapter(client=client)
data_manager = get_data_manager("memory")
cache.init(
    pre_embedding_func=get_prompt,
    post_embedding_func=get_response,
    data_manager=data_manager,
    adapter=adapter,
)

# Print tokens as they arrive from the streaming API.
prompt = "Explain GPTCache streaming in Python."
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
    stream=True,
)
print("Streaming response:")
parts = []
for chunk in stream:
    # Each chunk carries an incremental delta; content can be None on the
    # final chunk, hence the `or ""`.
    delta = chunk.choices[0].delta.content or ""
    parts.append(delta)
    print(delta, end="", flush=True)
# Assemble the complete text so it can be stored in the cache afterwards.
full_response = "".join(parts)
# Cache the full response manually if needed
# (Streaming caching requires custom handling)
Async GPTCache Usage ›
Use async integration when your application requires concurrent LLM calls or is built on async frameworks.
import os
import asyncio
from openai import OpenAI
from gptcache import cache
from gptcache.adapter import OpenAIAdapter
from gptcache.manager.factory import get_data_manager
from gptcache.processor.pre import get_prompt
from gptcache.processor.post import get_response
async def main():
    """Run one chat completion asynchronously and print the reply.

    Initializes GPTCache, then awaits a single OpenAI chat completion.
    """
    # The async API in openai>=1.0 lives on AsyncOpenAI; the synchronous
    # client has no `acreate` method, so the original call would fail.
    from openai import AsyncOpenAI

    api_key = os.environ["OPENAI_API_KEY"]
    client = AsyncOpenAI(api_key=api_key)
    # NOTE(review): OpenAIAdapter / cache.init(adapter=...) are assumed from
    # the surrounding tutorial — verify against the installed gptcache
    # version, and that the adapter accepts an async client.
    adapter = OpenAIAdapter(client=client)
    data_manager = get_data_manager("memory")
    cache.init(
        pre_embedding_func=get_prompt,
        post_embedding_func=get_response,
        data_manager=data_manager,
        adapter=adapter,
    )
    prompt = "Async GPTCache example in Python."
    # Await the async call to OpenAI.
    completion = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    response = completion.choices[0].message.content
    print("Async response:", response)
asyncio.run(main())
Alternative Model with GPTCache ›
Use alternative models to optimize cost or latency depending on your use case and model availability.
import os
from openai import OpenAI
from gptcache import cache
from gptcache.adapter import OpenAIAdapter
from gptcache.manager.factory import get_data_manager
from gptcache.processor.pre import get_prompt
from gptcache.processor.post import get_response
# Alternative-model example: identical GPTCache setup, cheaper chat model.
api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=api_key)
adapter = OpenAIAdapter(client=client)
data_manager = get_data_manager("memory")
cache.init(
    pre_embedding_func=get_prompt,
    post_embedding_func=get_response,
    data_manager=data_manager,
    adapter=adapter,
)
prompt = "Explain GPTCache usage with Claude."
# Change model to Claude via Anthropic client or use OpenAI-compatible endpoint if available
chat_result = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}],
)
response = chat_result.choices[0].message.content
print("Response with alternative model:", response)
Performance
Latency: ~800ms for a typical GPT-4o call without cache; near-instant on cache hit
Cost: ~$0.002 per 500 tokens for GPT-4o; caching reduces repeated costs to near zero
Rate limits: Depends on underlying LLM provider; GPTCache itself has no rate limits
- Use concise prompts to reduce token usage.
- Cache common queries to avoid repeated token consumption.
- Batch multiple queries when possible to optimize throughput.
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Direct LLM call | ~800ms | ~$0.002 | Fresh responses, no caching |
| GPTCache with memory cache | ~10ms on hit | ~$0.002 only on miss | Cost optimization with repeated queries |
| Streaming GPTCache | Partial ~100ms chunks | ~$0.002 on miss | Real-time UI with caching |
| Async GPTCache | ~800ms async | ~$0.002 | Concurrent calls in async apps |
Quick tip
Always initialize GPTCache with the same adapter and data manager instance to ensure consistent caching behavior.
Common mistake
Beginners often forget to call cache.set() after a cache miss, causing repeated API calls for the same prompt.