Semantic Caching#

This guide covers how to use semantic caching to reduce LLM costs and latency.

Overview#

Semantic caching stores LLM responses and returns them for similar queries, reducing:

  • API costs by avoiding redundant LLM calls

  • Latency by returning cached responses instantly

Redis OpenAI Agents provides a two-level cache:

  1. Level 1 (Exact): Hash-based exact string matching

  2. Level 2 (Semantic): Vector-based similarity matching

Prerequisites#

Start a Redis instance with Docker, then set your OpenAI API key so the examples can call the API:

docker run -d --name redis -p 6379:6379 redis:8
import os

# Provide a placeholder key so the examples import without error; replace it
# with a real key before making actual OpenAI calls.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = "your-api-key-here"

# Connection string for the Redis instance; override via the REDIS_URL
# environment variable to point at a non-local server.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")

Basic Usage#

from redis_openai_agents import SemanticCache

# Create the shared cache used throughout this guide.
cache = SemanticCache(
    redis_url=REDIS_URL,
    similarity_threshold=0.9,  # minimum similarity for a semantic hit (higher = stricter)
    ttl=3600                   # entries expire after 1 hour (seconds)
)

print("Cache initialized!")

Store and Retrieve#

# Seed the cache with one question/answer pair.
query = "What is the capital of France?"
response = "The capital of France is Paris."

cache.set(query=query, response=response)
print(f"Stored: {query}")

# Level 1 lookup: the byte-identical query string.
exact_hit = cache.get(query="What is the capital of France?")
if exact_hit:
    print(f"Cache HIT (exact): {exact_hit.response}")
else:
    print("Cache MISS")

# Level 2 lookup: a paraphrase that should match by vector similarity.
semantic_hit = cache.get(query="What's France's capital city?")
if semantic_hit:
    print(f"Cache HIT (semantic): {semantic_hit.response}")
    print(f"Distance: {semantic_hit.distance:.4f}")
else:
    print("Cache MISS")

Integration with Agent Runs#

from agents import Agent, Runner

# A plain (uncached) agent; caching is layered on explicitly by
# run_with_cache below.
agent = Agent(
    name="assistant",
    instructions="You are a helpful assistant."
)

async def run_with_cache(query: str) -> str:
    """Return the agent's answer for *query*, serving it from the cache when
    an identical or sufficiently similar query was answered before."""
    cached = cache.get(query=query)
    if cached:
        print(f"  Cache HIT! Distance: {cached.distance:.4f}")
        return cached.response

    # Nothing similar enough in the cache: pay for a real LLM call.
    print("  Cache MISS - calling LLM...")
    run_result = await Runner.run(agent, input=query)
    answer = run_result.final_output

    # Remember the answer so future exact or semantic matches hit.
    cache.set(query=query, response=answer)
    return answer
# NOTE: top-level ``await`` works in notebooks / async REPLs; in a plain
# script, wrap these calls with ``asyncio.run(...)`` instead.

# First call - cache miss (nothing stored yet, so the LLM is invoked)
print("Query 1: What is machine learning?")
response1 = await run_with_cache("What is machine learning?")
print(f"Response: {response1[:100]}...\n")

# Similar query - cache hit (semantically close to query 1)
print("Query 2: Can you explain machine learning?")
response2 = await run_with_cache("Can you explain machine learning?")
print(f"Response: {response2[:100]}...\n")

# Different query - cache miss (unrelated topic, so no similar entry)
print("Query 3: What is quantum computing?")
response3 = await run_with_cache("What is quantum computing?")
print(f"Response: {response3[:100]}...")

RedisCachingModel#

For fully automatic caching, wrap the underlying model in RedisCachingModel: every agent run through it is cached without any explicit cache.get/cache.set calls in your code.

from redis_openai_agents import RedisCachingModel

# Wrap the model so caching happens inside the model call itself,
# with no explicit cache.get / cache.set in application code.
caching_model = RedisCachingModel(
    model="gpt-4o-mini",
    redis_url=REDIS_URL,
    enable_semantic_cache=True,   # also match similar (not just identical) prompts
    semantic_threshold=0.85,
    cache_ttl=7200                # presumably seconds (2 hours), as with SemanticCache's ttl — confirm
)

# Pass the wrapper anywhere a model is accepted
cached_agent = Agent(
    name="cached-assistant",
    instructions="You are a helpful assistant.",
    model=caching_model  # Automatic caching!
)
# Caching happens automatically on every run
result1 = await Runner.run(cached_agent, input="What is Python?")
print(f"Response 1: {result1.final_output[:100]}...")

# A semantically similar query can be served from the cache
result2 = await Runner.run(cached_agent, input="Tell me about Python programming")
print(f"Response 2: {result2.final_output[:100]}...")
# Two thresholds illustrate the precision/recall tradeoff: strict avoids
# wrong-answer hits, lenient maximizes cost savings.
strict_cache = SemanticCache(
    name="strict-cache",
    redis_url=REDIS_URL,
    similarity_threshold=0.95,  # near-duplicate queries only
)

lenient_cache = SemanticCache(
    name="lenient-cache",
    redis_url=REDIS_URL,
    similarity_threshold=0.75,  # looser paraphrases also match
)

TTL (Time-to-Live)#

Set TTL based on how often your data changes:

# Short TTL for frequently changing data (ttl is in seconds)
cache = SemanticCache(redis_url=REDIS_URL, ttl=300)  # 5 minutes

# Long TTL for stable data
cache = SemanticCache(redis_url=REDIS_URL, ttl=86400)  # 24 hours

# No TTL - entries persist until explicitly cleared (manual invalidation only)
# NOTE: each line above rebinds the same ``cache`` name; run only the variant
# you need, otherwise the last assignment wins.
cache = SemanticCache(redis_url=REDIS_URL, ttl=None)

Cache Statistics#

# Report basic cache health; ``.get`` with a default keeps this working
# even when a statistics key is absent from the returned mapping.
stats = cache.get_stats()
for label, key in (("Cache entries", "entries"), ("Hit rate", "hit_rate")):
    print(f"{label}: {stats.get(key, 'N/A')}")

Cleanup#

# Drop everything the walkthrough wrote so reruns start from a clean slate.
for demo_cache in (cache, strict_cache, lenient_cache):
    demo_cache.clear()

print("Caches cleared!")