llm-caching

SKILL.md

LLM Caching

Cut LLM costs and latency with exact match, semantic, and provider-side caching layers.

When to Use This Skill

Use this skill when:

  • The same or similar queries are asked repeatedly (FAQ bots, support tools)
  • LLM API costs are growing and you need immediate savings
  • Serving high request volumes where repeated queries cause bottlenecks
  • Implementing prompt caching for long system prompts (Anthropic/OpenAI)
  • Building offline-capable AI features that need response persistence

Caching Layers

Request → Exact Cache → Semantic Cache → Provider Cache → LLM API
             ↓ hit            ↓ hit             ↓ hit
           instant          ~5ms           50-80% cheaper

Layer 1: Exact Match Cache (Redis)

import hashlib
import json
import redis
from openai import OpenAI

# Shared module-level clients: a local Redis instance backing the
# exact-match cache, and the OpenAI client used by cached_completion below.
r = redis.Redis(host="localhost", port=6379, decode_responses=True)
client = OpenAI()

def build_cache_key(model: str, messages: list, temperature: float) -> str:
    """Derive a deterministic Redis key for one completion request.

    Identical (model, messages, temperature) triples always map to the
    same key; ``sort_keys=True`` makes the JSON serialization stable
    regardless of dict insertion order.
    """
    canonical = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    digest = hashlib.sha256(
        json.dumps(canonical, sort_keys=True).encode()
    ).hexdigest()
    return "llm:exact:" + digest

def cached_completion(model: str, messages: list, temperature: float = 0.0,
                      ttl: int = 3600) -> dict:
    """Chat completion with an exact-match Redis cache in front.

    Args:
        model: OpenAI model name.
        messages: Chat messages in OpenAI format.
        temperature: Sampling temperature; only 0.0 responses are cached.
        ttl: Cache entry lifetime in seconds.

    Returns:
        The completion response as a plain dict (``model_dump()``).
    """
    # Only deterministic requests are ever written to the cache, so skip
    # the Redis round trip entirely for temperature != 0.0 — nothing can
    # be there (the original code paid a guaranteed-miss GET for them).
    cacheable = temperature == 0.0
    key = build_cache_key(model, messages, temperature)

    if cacheable and (cached := r.get(key)):
        return json.loads(cached)

    # Cache miss (or uncacheable request): call the API.
    response = client.chat.completions.create(
        model=model, messages=messages, temperature=temperature
    )
    result = response.model_dump()

    # Cache result (only cache deterministic responses)
    if cacheable:
        r.setex(key, ttl, json.dumps(result))

    return result

Layer 2: Semantic Cache (GPTCache)

from gptcache import cache, Config
from gptcache.adapter import openai
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

# Configure GPTCache with Qdrant backend
def init_gptcache(cache_obj, llm: str):
    """Wire GPTCache up with a Redis metadata store and a Qdrant vector store.

    One Qdrant collection per model keeps answers from different models
    from being served interchangeably.
    """
    embedder = Onnx()  # small local embedding model — no API calls needed
    manager = get_data_manager(
        CacheBase("redis"),  # metadata store
        VectorBase(
            "qdrant",
            host="localhost",
            port=6333,
            collection_name=f"llm-cache-{llm}",
            dimension=embedder.dimension,
        ),
    )
    cache_obj.init(
        embedding_func=embedder.to_embeddings,
        data_manager=manager,
        similarity_evaluation=SearchDistanceEvaluation(),
        # Queries scoring >= 0.80 similarity count as cache hits.
        config=Config(similarity_threshold=0.80),
    )

# Bind the OpenAI API key and initialise the global cache for this model.
cache.set_openai_key()
init_gptcache(cache, "gpt-4o-mini")

# Now openai calls are automatically cached
response = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "What is machine learning?"}],
)
# Second call with similar question ("Explain machine learning") → cache hit

Custom Semantic Cache (Production-Grade)

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, Range
import numpy as np
import uuid
import time

embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")  # fast, 33M params
qdrant = QdrantClient("http://localhost:6333")

CACHE_COLLECTION = "semantic-cache"
SIMILARITY_THRESHOLD = 0.88  # min cosine similarity for a cache hit
CACHE_TTL_SECONDS = 86400  # 24h

# Create collection once — guard so re-running this setup doesn't raise
# "collection already exists" (the unguarded call fails on second run).
if not qdrant.collection_exists(CACHE_COLLECTION):
    qdrant.create_collection(
        collection_name=CACHE_COLLECTION,
        # bge-small-en-v1.5 emits 384-dim vectors, hence size=384.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
        on_disk_payload=True,
    )

def semantic_cache_lookup(query: str, model: str) -> str | None:
    """Return a cached response for a semantically similar query, or None.

    Only unexpired entries stored for the same *model* are eligible, and
    the best match must clear SIMILARITY_THRESHOLD.
    """
    vector = embed_model.encode(query).tolist()
    hits = qdrant.query_points(
        collection_name=CACHE_COLLECTION,
        query=vector,
        query_filter=Filter(
            must=[
                # Never serve one model's answer for another model.
                FieldCondition(key="model", match={"value": model}),
                # Ignore entries whose TTL has elapsed.
                FieldCondition(key="expires_at", range=Range(gte=time.time())),
            ]
        ),
        limit=1,
        score_threshold=SIMILARITY_THRESHOLD,
    )
    if not hits.points:
        return None
    return hits.points[0].payload["response"]

def semantic_cache_store(query: str, response: str, model: str):
    """Embed *query* and persist the (query, response) pair with a TTL."""
    now = time.time()
    entry = PointStruct(
        id=str(uuid.uuid4()),  # random id; duplicates of similar queries are fine
        vector=embed_model.encode(query).tolist(),
        payload={
            "query": query,
            "response": response,
            "model": model,
            "created_at": now,
            # Lookup filters on this field instead of deleting expired rows.
            "expires_at": now + CACHE_TTL_SECONDS,
        },
    )
    qdrant.upsert(collection_name=CACHE_COLLECTION, points=[entry])

def smart_llm_call(query: str, model: str = "gpt-4o-mini") -> dict:
    """Answer *query* from the semantic cache, falling back to the LLM API.

    Returns a dict with the response text, its source ("semantic_cache" or
    "llm_api"), and the API cost in USD (0 on a cache hit).
    """
    # 1. Semantic lookup
    if cached_response := semantic_cache_lookup(query, model):
        return {"response": cached_response, "source": "semantic_cache", "cost": 0}

    # 2. LLM call
    # NOTE(review): relies on the OpenAI `client` global defined in the
    # exact-match cache section above.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
    )
    text = response.choices[0].message.content
    # NOTE(review): `litellm` is never imported anywhere in this file —
    # add `import litellm` (or compute cost another way) before running.
    cost = litellm.completion_cost(response)

    # 3. Store in cache
    semantic_cache_store(query, text, model)

    return {"response": text, "source": "llm_api", "cost": cost}

Layer 3: Provider-Side Prompt Caching

# Anthropic — cache long system prompts (saves 90% on cached input tokens)
import anthropic

client = anthropic.Anthropic()

# Long system prompt — mark for caching.
# Read via a context manager so the file handle is closed promptly
# (the bare open(...).read() leaked the handle until GC).
with open("knowledge-base.txt") as f:
    SYSTEM_PROMPT = f.read()  # e.g., 50k tokens

def call_with_prompt_cache(user_question: str) -> str:
    """Ask Claude a question with the long system prompt cached server-side.

    The `cache_control` marker on the big system block lets the provider
    reuse it across calls; cached input tokens are billed at a discount.
    """
    message = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=[
            {"type": "text", "text": "You are a helpful assistant."},
            {
                "type": "text",
                "text": SYSTEM_PROMPT,
                "cache_control": {"type": "ephemeral"},  # cache this block
            },
        ],
        messages=[{"role": "user", "content": user_question}],
    )

    # Log cache efficiency so savings are visible in the application logs.
    usage = message.usage
    # 90% discount on cached input tokens; $3.0/M is the input price used here.
    cache_savings = usage.cache_read_input_tokens * 0.9
    print(f"Cache hits: {usage.cache_read_input_tokens} tokens "
          f"(saved ~${cache_savings * 3.0 / 1_000_000:.4f})")
    return message.content[0].text

# OpenAI — automatic for repeated prefixes (≥1,024 tokens)
# No code change needed; cached tokens appear in usage.prompt_tokens_details
# NOTE(review): LONG_SYSTEM_PROMPT and user_question are illustrative
# placeholders — define them before running this snippet.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": LONG_SYSTEM_PROMPT},  # auto-cached
        {"role": "user", "content": user_question},
    ]
)
cached = response.usage.prompt_tokens_details.cached_tokens
print(f"OpenAI cached {cached} tokens")

Cache Warming

async def warm_cache(common_queries: list[str], model: str,
                     concurrency: int = 8):
    """Pre-populate cache with known frequent queries.

    Args:
        common_queries: Queries expected to recur in production.
        model: Model name used for both the API call and the cache entry.
        concurrency: Max in-flight API requests — the original unbounded
            gather fired every request at once, inviting rate-limit errors.
    """
    import asyncio
    from openai import AsyncOpenAI

    aclient = AsyncOpenAI()
    gate = asyncio.Semaphore(concurrency)

    async def warm_single(query: str):
        # NOTE(review): semantic_cache_lookup/store are synchronous and
        # briefly block the event loop while embedding the query.
        if not semantic_cache_lookup(query, model):
            async with gate:  # cap simultaneous API calls
                response = await aclient.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": query}],
                )
            text = response.choices[0].message.content
            semantic_cache_store(query, text, model)
            print(f"Warmed: {query[:50]}...")

    await asyncio.gather(*[warm_single(q) for q in common_queries])

# Warm on startup
# NOTE(review): FREQUENT_QUERIES is a placeholder — supply your own list
# of common queries before running this.
import asyncio
asyncio.run(warm_cache(FREQUENT_QUERIES, "gpt-4o-mini"))

Cache Metrics

from prometheus_client import Counter, Histogram

# Per-layer hit/miss counters plus a running USD-savings total, labelled
# by model so dashboards can break savings down per model.
cache_hits = Counter("llm_cache_hits_total", "Cache hits", ["cache_layer", "model"])
cache_misses = Counter("llm_cache_misses_total", "Cache misses", ["model"])
cache_savings_usd = Counter("llm_cache_savings_usd_total", "USD saved by cache", ["model"])

# Use in your smart_llm_call function
# NOTE(review): `source`, `model`, and `estimated_cost` are illustrative —
# they come from smart_llm_call's return value and your pricing estimate.
if source == "semantic_cache":
    cache_hits.labels(cache_layer="semantic", model=model).inc()
    cache_savings_usd.labels(model=model).inc(estimated_cost)
else:
    cache_misses.labels(model=model).inc()

Redis Configuration for LLM Caching

# redis.conf tuning for LLM cache workload
maxmemory 8gb
maxmemory-policy allkeys-lru    # evict least-recently-used when full
save ""                          # disable persistence (cache is ephemeral)
appendonly no
tcp-keepalive 60

Common Issues

| Issue | Cause | Fix |
| --- | --- | --- |
| Low cache hit rate | Threshold too strict | Lower `SIMILARITY_THRESHOLD` to 0.82–0.85 |
| Stale cached responses | Long TTL | Use topic-specific TTLs; invalidate on data updates |
| Cache serving wrong answers | Threshold too loose | Raise threshold or add model-name filtering |
| Redis OOM | No eviction policy | Set `maxmemory` + `allkeys-lru` |
| Slow semantic lookup | Large cache collection | Add payload index on `model` + `expires_at` |

Best Practices

  • Start with exact cache — zero cost, instant wins for identical queries.
  • Semantic threshold of 0.88–0.92 balances hit rate vs. accuracy; tune with your data.
  • Set per-model TTLs: longer for stable knowledge (1 week), shorter for news/events (1 hour).
  • Always filter by model name in semantic cache — different models give different answers.
  • Log cache hit rate as a KPI; target 30%+ for FAQ-style applications.

Related Skills

Weekly Installs
2
GitHub Stars
13
First Seen
6 days ago
Installed on
opencode2
antigravity2
claude-code2
github-copilot2
codex2
zencoder2