llm-caching
SKILL.md
LLM Caching
Cut LLM costs and latency with exact match, semantic, and provider-side caching layers.
When to Use This Skill
Use this skill when:
- The same or similar queries are asked repeatedly (FAQ bots, support tools)
- LLM API costs are growing and you need immediate savings
- Serving high request volumes where repeated queries cause bottlenecks
- Implementing prompt caching for long system prompts (Anthropic/OpenAI)
- Building offline-capable AI features that need response persistence
Caching Layers
Request → Exact Cache → Semantic Cache → Provider Cache → LLM API
             ↓ hit          ↓ hit           ↓ hit
           instant          ~5ms        50-80% cheaper
Layer 1: Exact Match Cache (Redis)
import hashlib
import json
import redis
from openai import OpenAI
r = redis.Redis(host="localhost", port=6379, decode_responses=True)
client = OpenAI()
def build_cache_key(model: str, messages: list, temperature: float) -> str:
    """Derive a stable Redis key for an LLM request.

    The request parameters are serialized with sorted keys so that logically
    identical requests always produce the same SHA-256 digest, regardless of
    dict insertion order.
    """
    canonical = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    digest = hashlib.sha256(
        json.dumps(canonical, sort_keys=True).encode()
    ).hexdigest()
    return f"llm:exact:{digest}"
def cached_completion(model: str, messages: list, temperature: float = 0.0,
                      ttl: int = 3600) -> dict:
    """Chat completion with an exact-match Redis cache in front of the API.

    Looks up a previously stored identical request before calling the API.
    Only deterministic responses (temperature == 0.0) are written back, so
    sampled outputs are never replayed from cache.
    """
    key = build_cache_key(model, messages, temperature)

    hit = r.get(key)
    if hit:
        return json.loads(hit)

    api_response = client.chat.completions.create(
        model=model, messages=messages, temperature=temperature
    )
    result = api_response.model_dump()

    if temperature == 0.0:
        # Expire after `ttl` seconds so stale answers age out on their own.
        r.setex(key, ttl, json.dumps(result))
    return result
Layer 2: Semantic Cache (GPTCache)
from gptcache import cache, Config
from gptcache.adapter import openai
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
# Configure GPTCache with Qdrant backend
def init_gptcache(cache_obj, llm: str):
    """Wire GPTCache to a Redis metadata store and a Qdrant vector store.

    Embeddings are computed locally with an ONNX model (no API calls); a
    search-distance evaluation with an 80% similarity threshold decides
    whether a stored answer counts as a hit. One Qdrant collection per model.
    """
    embedder = Onnx()  # local embedding model
    vector_store = VectorBase(
        "qdrant",
        host="localhost",
        port=6333,
        collection_name=f"llm-cache-{llm}",
        dimension=embedder.dimension,
    )
    manager = get_data_manager(
        CacheBase("redis"),  # metadata store
        vector_store,
    )
    cache_obj.init(
        embedding_func=embedder.to_embeddings,
        data_manager=manager,
        similarity_evaluation=SearchDistanceEvaluation(),
        config=Config(similarity_threshold=0.80),  # 80% similarity = cache hit
    )
# Point GPTCache at the OpenAI API key from the environment, then initialise
# the cache for one specific model before any adapter calls are made.
cache.set_openai_key()
init_gptcache(cache, "gpt-4o-mini")
# Now openai calls are automatically cached
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "What is machine learning?"}],
)
# Second call with similar question ("Explain machine learning") → cache hit
Custom Semantic Cache (Production-Grade)
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, Range
import numpy as np
import uuid
import time
# Local embedding model + Qdrant instance backing the semantic cache.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")  # fast, 33M params
qdrant = QdrantClient("http://localhost:6333")

CACHE_COLLECTION = "semantic-cache"
SIMILARITY_THRESHOLD = 0.88   # minimum cosine similarity for a cache hit
CACHE_TTL_SECONDS = 86400     # 24h

# Create collection once. Guarded: an unconditional create_collection raises
# if the collection already exists, which would crash every re-run/restart.
if not qdrant.collection_exists(CACHE_COLLECTION):
    qdrant.create_collection(
        collection_name=CACHE_COLLECTION,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),  # bge-small emits 384-d vectors
        on_disk_payload=True,  # keep payloads on disk to save RAM
    )
def semantic_cache_lookup(query: str, model: str) -> str | None:
    """Return a cached response for a semantically similar query, or None.

    Only entries written by the same model and not yet expired are eligible,
    and the nearest neighbour must clear SIMILARITY_THRESHOLD.
    """
    vector = embed_model.encode(query).tolist()
    same_model = FieldCondition(key="model", match={"value": model})
    not_expired = FieldCondition(key="expires_at", range=Range(gte=time.time()))

    hits = qdrant.query_points(
        collection_name=CACHE_COLLECTION,
        query=vector,
        query_filter=Filter(must=[same_model, not_expired]),
        limit=1,
        score_threshold=SIMILARITY_THRESHOLD,
    )
    if not hits.points:
        return None
    return hits.points[0].payload["response"]
def semantic_cache_store(query: str, response: str, model: str):
    """Insert a query/response pair into the semantic cache.

    The query embedding is the search vector; the response text, source model
    and expiry timestamp travel in the payload so lookups can filter on them.
    """
    now = time.time()
    entry = PointStruct(
        id=str(uuid.uuid4()),  # random id; expired duplicates age out via TTL
        vector=embed_model.encode(query).tolist(),
        payload={
            "query": query,
            "response": response,
            "model": model,
            "created_at": now,
            "expires_at": now + CACHE_TTL_SECONDS,
        },
    )
    qdrant.upsert(collection_name=CACHE_COLLECTION, points=[entry])
def smart_llm_call(query: str, model: str = "gpt-4o-mini") -> dict:
    """Answer `query`, preferring the semantic cache over a live LLM call.

    Returns a dict with the response text, the serving layer
    ("semantic_cache" or "llm_api"), and the incurred cost in USD.

    NOTE(review): relies on module-level `client` (OpenAI) and `litellm`,
    which are not imported in this snippet — confirm they exist at the
    call site.
    """
    # 1. Semantic lookup — free if it hits.
    cached = semantic_cache_lookup(query, model)
    if cached:
        return {"response": cached, "source": "semantic_cache", "cost": 0}

    # 2. Cache miss: pay for a real completion.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
    )
    answer = response.choices[0].message.content
    cost = litellm.completion_cost(response)

    # 3. Store so future similar queries hit the cache.
    semantic_cache_store(query, answer, model)
    return {"response": answer, "source": "llm_api", "cost": cost}
Layer 3: Provider-Side Prompt Caching
# Anthropic — cache long system prompts (saves 90% on cached input tokens)
import anthropic

client = anthropic.Anthropic()

# Long system prompt — e.g., 50k tokens. Read via a context manager so the
# file handle is closed; the original open(...).read() leaked it.
with open("knowledge-base.txt") as f:
    SYSTEM_PROMPT = f.read()
def call_with_prompt_cache(user_question: str) -> str:
    """Ask Claude a question against a large, provider-cached system prompt.

    The knowledge-base block carries `cache_control: ephemeral`, so the
    provider caches it server-side and discounts those tokens on reuse.
    Prints a rough savings estimate, then returns the answer text.
    """
    system_blocks = [
        {"type": "text", "text": "You are a helpful assistant."},
        {
            "type": "text",
            "text": SYSTEM_PROMPT,
            "cache_control": {"type": "ephemeral"},  # cache this block
        },
    ]
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=system_blocks,
        messages=[{"role": "user", "content": user_question}],
    )

    # Log cache efficiency. Savings estimate assumes a 90% discount on cached
    # input tokens at $3.00 per million input tokens.
    usage = response.usage
    cache_savings = usage.cache_read_input_tokens * 0.9  # 90% discount on cached
    print(f"Cache hits: {usage.cache_read_input_tokens} tokens "
          f"(saved ~${cache_savings * 3.0 / 1_000_000:.4f})")
    return response.content[0].text
# OpenAI — automatic for repeated prefixes (≥1,024 tokens)
# No code change needed; cached tokens appear in usage.prompt_tokens_details
# NOTE(review): `client`, `LONG_SYSTEM_PROMPT` and `user_question` are assumed
# to be defined earlier — this is an example fragment.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": LONG_SYSTEM_PROMPT},  # auto-cached
        {"role": "user", "content": user_question},
    ]
)
# The API reports how many prefix tokens were served from its cache.
cached = response.usage.prompt_tokens_details.cached_tokens
print(f"OpenAI cached {cached} tokens")
Cache Warming
async def warm_cache(common_queries: list[str], model: str):
    """Pre-populate cache with known frequent queries.

    All completions are issued concurrently; queries that already have a
    cached answer are skipped, so warming is safe to repeat.
    """
    import asyncio
    from openai import AsyncOpenAI

    aclient = AsyncOpenAI()

    async def _warm_one(question: str):
        # Skip anything the semantic cache can already answer.
        if semantic_cache_lookup(question, model):
            return
        completion = await aclient.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": question}],
        )
        answer = completion.choices[0].message.content
        semantic_cache_store(question, answer, model)
        print(f"Warmed: {question[:50]}...")

    await asyncio.gather(*(_warm_one(q) for q in common_queries))
# Warm on startup
# NOTE(review): FREQUENT_QUERIES is assumed to be defined elsewhere — this is
# an example fragment.
import asyncio
asyncio.run(warm_cache(FREQUENT_QUERIES, "gpt-4o-mini"))
Cache Metrics
from prometheus_client import Counter, Histogram

# Per-layer hit counter, per-model miss counter, and cumulative USD saved.
cache_hits = Counter("llm_cache_hits_total", "Cache hits", ["cache_layer", "model"])
cache_misses = Counter("llm_cache_misses_total", "Cache misses", ["model"])
cache_savings_usd = Counter("llm_cache_savings_usd_total", "USD saved by cache", ["model"])

# Use in your smart_llm_call function
# NOTE(review): `source`, `model` and `estimated_cost` come from the
# surrounding call site — this is an example fragment.
if source == "semantic_cache":
    cache_hits.labels(cache_layer="semantic", model=model).inc()
    cache_savings_usd.labels(model=model).inc(estimated_cost)
else:
    cache_misses.labels(model=model).inc()
Redis Configuration for LLM Caching
# redis.conf tuning for LLM cache workload
maxmemory 8gb
maxmemory-policy allkeys-lru # evict least-recently-used when full
save "" # disable persistence (cache is ephemeral)
appendonly no
tcp-keepalive 60
Common Issues
| Issue | Cause | Fix |
|---|---|---|
| Low cache hit rate | Threshold too strict | Lower SIMILARITY_THRESHOLD to 0.82–0.85 |
| Stale cached responses | Long TTL | Use topic-specific TTLs; invalidate on data updates |
| Cache serving wrong answers | Threshold too loose | Raise threshold or add model-name filtering |
| Redis OOM | No eviction policy | Set maxmemory + allkeys-lru |
| Slow semantic lookup | Large cache collection | Add payload index on model + expires_at |
Best Practices
- Start with exact cache — zero cost, instant wins for identical queries.
- Semantic threshold of 0.88–0.92 balances hit rate vs. accuracy; tune with your data.
- Set per-model TTLs: longer for stable knowledge (1 week), shorter for news/events (1 hour).
- Always filter by model name in semantic cache — different models give different answers.
- Log cache hit rate as a KPI; target 30%+ for FAQ-style applications.
Related Skills
- llm-cost-optimization - Full cost strategy
- llm-gateway - Gateway-level caching
- vector-database-ops - Qdrant setup
- agent-observability - Cache metrics dashboards
Weekly Installs
2
Repository
bagelhole/devop…t-skills
GitHub Stars
13
First Seen
6 days ago
Security Audits
Installed on
opencode2
antigravity2
claude-code2
github-copilot2
codex2
zencoder2