Semantic Caching#
This guide covers how to use semantic caching to reduce LLM costs and latency.
Overview#
Semantic caching stores LLM responses and returns them for semantically similar queries, reducing:
API costs by avoiding redundant LLM calls
Latency by returning cached responses instantly
Redis OpenAI Agents provides a two-level cache:
Level 1 (Exact): Hash-based exact string matching
Level 2 (Semantic): Vector-based similarity matching
Prerequisites#
docker run -d --name redis -p 6379:6379 redis:8
# Configure credentials and the Redis connection string for the examples below.
import os

# Provide a placeholder API key only when one is not already set.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key-here")
# Fall back to a local Redis instance unless REDIS_URL overrides it.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
Basic Usage#
from redis_openai_agents import SemanticCache

# Build the two-level (exact + semantic) cache backed by Redis.
cache = SemanticCache(
    redis_url=REDIS_URL,       # where cache entries are stored
    similarity_threshold=0.9,  # higher values demand closer matches
    ttl=3600,                  # entries expire after one hour
)
print("Cache initialized!")
Store and Retrieve#
# Write one question/answer pair into the cache.
question = "What is the capital of France?"
answer = "The capital of France is Paris."
cache.set(query=question, response=answer)
print(f"Stored: {question}")

# The identical query string is a level-1 (exact, hash-based) hit.
if result := cache.get(query="What is the capital of France?"):
    print(f"Cache HIT (exact): {result.response}")
else:
    print("Cache MISS")

# A paraphrased query relies on level-2 (semantic, vector) matching.
if result := cache.get(query="What's France's capital city?"):
    print(f"Cache HIT (semantic): {result.response}")
    print(f"Distance: {result.distance:.4f}")
else:
    print("Cache MISS")
Integration with Agent Runs#
from agents import Agent, Runner
# A plain, uncached agent; caching is handled manually by the caller.
agent = Agent(name="assistant", instructions="You are a helpful assistant.")
async def run_with_cache(query: str) -> str:
    """Answer *query* via the agent, serving cached responses when available.

    Looks the query up in the semantic cache first; on a miss, runs the
    agent, stores its output under the query, and returns it.
    """
    hit = cache.get(query=query)
    if hit:
        print(f" Cache HIT! Distance: {hit.distance:.4f}")
        return hit.response

    # Nothing cached for this (or a similar) query: ask the LLM.
    print(" Cache MISS - calling LLM...")
    run_result = await Runner.run(agent, input=query)
    llm_answer = run_result.final_output
    # Remember the answer so similar future queries skip the LLM.
    cache.set(query=query, response=llm_answer)
    return llm_answer
# First call - cache miss
print("Query 1: What is machine learning?")
response1 = await run_with_cache("What is machine learning?")
print(f"Response: {response1[:100]}...\n")
# Similar query - cache hit
print("Query 2: Can you explain machine learning?")
response2 = await run_with_cache("Can you explain machine learning?")
print(f"Response: {response2[:100]}...\n")
# Different query - cache miss
print("Query 3: What is quantum computing?")
response3 = await run_with_cache("What is quantum computing?")
print(f"Response: {response3[:100]}...")
RedisCachingModel#
For automatic caching, use RedisCachingModel as a model wrapper.
from redis_openai_agents import RedisCachingModel

# Wrap the model so every call goes through the cache transparently.
caching_model = RedisCachingModel(
    model="gpt-4o-mini",
    redis_url=REDIS_URL,
    enable_semantic_cache=True,  # turn on level-2 (vector) matching
    semantic_threshold=0.85,     # similarity required for a semantic hit
    cache_ttl=7200,              # cached entries live for two hours
)
# Use with an agent
cached_agent = Agent(
name="cached-assistant",
instructions="You are a helpful assistant.",
model=caching_model # Automatic caching!
)
# Caching happens automatically
result1 = await Runner.run(cached_agent, input="What is Python?")
print(f"Response 1: {result1.final_output[:100]}...")
# Similar query will use cache
result2 = await Runner.run(cached_agent, input="Tell me about Python programming")
print(f"Response 2: {result2.final_output[:100]}...")
# Two caches showing how the threshold controls match breadth.
strict_cache = SemanticCache(
    name="strict-cache",  # only near-identical queries will hit
    redis_url=REDIS_URL,
    similarity_threshold=0.95,
)
lenient_cache = SemanticCache(
    name="lenient-cache",  # loosely related queries will also hit
    redis_url=REDIS_URL,
    similarity_threshold=0.75,
)
TTL (Time-to-Live)#
Set TTL based on how often your data changes:
# Frequently changing data: expire quickly (five minutes).
cache = SemanticCache(redis_url=REDIS_URL, ttl=5 * 60)
# Stable data: keep entries for a full day.
cache = SemanticCache(redis_url=REDIS_URL, ttl=24 * 60 * 60)
# ttl=None disables expiry; entries persist until cleared explicitly.
cache = SemanticCache(redis_url=REDIS_URL, ttl=None)
Cache Statistics#
# Report basic cache health numbers, tolerating missing keys.
stats = cache.get_stats()
entry_count = stats.get('entries', 'N/A')
hit_rate = stats.get('hit_rate', 'N/A')
print(f"Cache entries: {entry_count}")
print(f"Hit rate: {hit_rate}")
Cleanup#
# Remove every entry written by this guide (same order as before).
for demo_cache in (cache, strict_cache, lenient_cache):
    demo_cache.clear()
print("Caches cleared!")