llm-cost-optimization
SKILL.md
LLM Cost Optimization
Cut LLM costs by 50–90% with the right combination of caching, model selection, prompt optimization, and self-hosting.
When to Use This Skill
Use this skill when:
- LLM API spend is growing faster than revenue
- You need to attribute AI costs to teams, products, or customers
- Implementing caching to avoid redundant LLM calls
- Deciding when to switch from API providers to self-hosted models
- Optimizing prompt length without sacrificing quality
Cost Levers by Impact
| Strategy | Typical Savings | Effort |
|---|---|---|
| Semantic caching | 20–50% | Low |
| Model right-sizing | 30–70% | Low |
| Prompt compression | 10–30% | Medium |
| Provider caching (prompt cache) | 10–25% | Low |
| Batching offline workloads | 50% (Batch API) | Medium |
| Self-hosting 7–8B models | 80–95% at scale | High |
| Quantization | 30–50% VRAM cost | Medium |
Track Costs First
# Use LiteLLM's cost tracking (automatic per-model pricing)
import litellm

# One ordinary completion call; LiteLLM knows per-model pricing tables,
# so the cost can be derived from the response object afterwards.
response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Hello"}],
)
cost = litellm.completion_cost(response)
print(f"Cost: ${cost:.6f}")

# Add custom cost callbacks
def log_cost(kwargs, completion_response, start_time, end_time):
    """Success callback invoked by LiteLLM after each completed call.

    Signature (kwargs, response, start_time, end_time) is dictated by
    LiteLLM's success_callback protocol — do not reorder the parameters.
    """
    # NOTE(review): assumes LiteLLM populates "response_cost", "model" and
    # "user" in kwargs — confirm against the litellm callback docs.
    cost = kwargs.get("response_cost", 0)
    model = kwargs.get("model")
    user = kwargs.get("user")
    # Send to your analytics DB
    # (presumes a `db` handle exists in the surrounding application)
    db.record_cost(user=user, model=model, cost=cost)

litellm.success_callback = [log_cost]
Model Right-Sizing
# Route by task complexity — don't use GPT-4o for everything
def get_model_for_task(task_type: str) -> str:
    """Pick the cheapest model that is adequate for *task_type*.

    Unknown task types fall back to the cheapest model rather than the
    most capable one — cost-safe by default.
    """
    # The bulk of traffic (classify / summarize / extract / simple Q&A)
    # runs fine on the mini model, ~30× cheaper than gpt-4o.
    if task_type in {"classification", "summarization", "extraction", "simple_qa"}:
        return "gpt-4o-mini"
    if task_type == "complex_reasoning":
        return "gpt-4o"
    if task_type == "code_generation":
        return "claude-sonnet-4-6"
    if task_type == "creative_writing":
        return "claude-opus-4-6"
    return "gpt-4o-mini"  # default: cheapest

# Cost comparison (per 1M tokens, 2025 approx.)
# gpt-4o-mini: input $0.15 / output $0.60
# gpt-4o: input $2.50 / output $10.00
# claude-sonnet-4-6: input $3.00 / output $15.00
# llama-3.1-8b (self): ~$0.05–0.10 all-in (GPU amortized)
Prompt Caching (Provider-Side)
# Anthropic — cache long system prompts (saves 90% on cached tokens)
from pathlib import Path

import anthropic

client = anthropic.Anthropic()

# Read the large document up front. The original inline
# open("large-context.txt").read() never closed the file handle;
# Path.read_text() opens and closes it in one call.
large_context = Path("large-context.txt").read_text()

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    system=[
        {
            "type": "text",
            "text": "You are a helpful assistant.",
        },
        {
            "type": "text",
            "text": large_context,  # large doc
            "cache_control": {"type": "ephemeral"},  # cache this!
        },
    ],
    messages=[{"role": "user", "content": "Summarize the key points."}],
)
# First call: full price. Subsequent calls: 90% discount on cached part.
print(f"Cache read tokens: {response.usage.cache_read_input_tokens}")
# OpenAI — prompt caching is automatic for repeated prefixes >1024 tokens
# No code change needed; check usage.prompt_tokens_details.cached_tokens
Batching with OpenAI Batch API (50% Discount)
import json
from openai import OpenAI

client = OpenAI()

# Prepare batch requests.
# NOTE(review): assumes `texts` (an iterable of strings to classify) is
# defined earlier — this snippet does not define it.
requests = [
    {
        "custom_id": f"task-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": f"Classify: {text}"}],
            "max_tokens": 50,
        },
    }
    for i, text in enumerate(texts)
]

# Write JSONL file — one request object per line, as the Batch API expects
with open("batch.jsonl", "w") as f:
    for req in requests:
        f.write(json.dumps(req) + "\n")

# Upload and create batch. Use `with` so the upload handle is closed
# (the original passed a bare open(...) that was never closed).
with open("batch.jsonl", "rb") as batch_fh:
    batch_file = client.files.create(file=batch_fh, purpose="batch")
batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",  # results arrive within 24h at 50% of live price
)
print(f"Batch ID: {batch.id}")  # poll status with client.batches.retrieve(batch.id)
Semantic Caching
import hashlib
import json
import redis
import numpy as np
from sentence_transformers import SentenceTransformer

r = redis.Redis(host="localhost", port=6379)
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
SIMILARITY_THRESHOLD = 0.92  # cosine similarity needed for a semantic hit
CACHE_TTL = 3600 * 24  # 24 hours

def cached_llm_call(prompt: str, llm_fn) -> str:
    """Answer *prompt* from cache when possible, else call *llm_fn*.

    Lookup order:
      1. exact match on the SHA-256 of the prompt (free);
      2. cosine-similarity scan over cached prompt embeddings;
      3. real LLM call — result stored under both key types.
    """
    # 1. Exact match (free)
    exact_key = f"exact:{hashlib.sha256(prompt.encode()).hexdigest()}"
    if cached := r.get(exact_key):
        return cached.decode()
    # 2. Semantic match — linear scan; fine for small caches, move to a
    # vector index (e.g. RediSearch) once the cache grows large.
    query_vec = embed_model.encode(prompt)
    query_norm = np.linalg.norm(query_vec)  # invariant — hoisted out of loop
    # SCAN instead of KEYS: KEYS blocks the Redis server on large keyspaces.
    for key in r.scan_iter("sem:*"):
        raw = r.get(key)
        if raw is None:
            # Entry expired between SCAN and GET — the original crashed
            # here with json.loads(None).
            continue
        data = json.loads(raw)
        cached_vec = np.asarray(data["embedding"])
        similarity = np.dot(query_vec, cached_vec) / (
            query_norm * np.linalg.norm(cached_vec)
        )
        if similarity >= SIMILARITY_THRESHOLD:
            return data["response"]
    # 3. Cache miss — call LLM
    response = llm_fn(prompt)
    # Store exact match
    r.setex(exact_key, CACHE_TTL, response)
    # Store semantic embedding alongside the response for future scans
    sem_key = f"sem:{hashlib.sha256(prompt.encode()).hexdigest()}"
    r.setex(sem_key, CACHE_TTL, json.dumps({
        "embedding": query_vec.tolist(),
        "response": response,
        "prompt": prompt,
    }))
    return response
Prompt Compression
# LLMLingua — compress long prompts by 3–20× with minimal quality loss
from llmlingua import PromptCompressor

# NOTE(review): assumes `long_context` (the long prompt string to shrink)
# is defined earlier — this snippet does not define it.
compressor = PromptCompressor(
    model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
    device_map="cpu",  # the small BERT compressor runs fine on CPU
)
compressed = compressor.compress_prompt(
    long_context,
    ratio=0.5, # keep 50% of tokens
    rank_method="longllmlingua",
)
# Word counts are a rough proxy; actual billing savings are in tokens.
print(f"Original: {len(long_context.split())} words")
print(f"Compressed: {len(compressed['compressed_prompt'].split())} words")
print(f"Savings: {compressed['saving']}")
Self-Hosting Break-Even Calculator
def break_even_analysis(
    monthly_api_spend_usd: float,
    gpu_cost_per_hour_usd: float = 2.50,  # e.g., A10G on AWS
    utilization: float = 0.70,  # 70% GPU utilization
) -> dict:
    """Compare self-hosted GPU cost against current LLM API spend.

    Args:
        monthly_api_spend_usd: current monthly API bill in USD (must be > 0).
        gpu_cost_per_hour_usd: hourly on-demand price of the target GPU.
        utilization: fraction of the month the GPU is actually billed.

    Returns:
        Dict with formatted monthly costs, the GPU cost as a percentage of
        API spend, and a coarse self-hosting recommendation.

    Raises:
        ValueError: if monthly_api_spend_usd is not positive (the original
            version raised an opaque ZeroDivisionError on 0).
    """
    if monthly_api_spend_usd <= 0:
        raise ValueError("monthly_api_spend_usd must be > 0")
    # GPU bills for utilization × 24 h × 30 d each month
    monthly_gpu_cost = gpu_cost_per_hour_usd * 24 * 30 * utilization
    break_even = monthly_gpu_cost / monthly_api_spend_usd
    recommendation = (
        "Self-host now — strong ROI" if break_even < 0.5 else
        "Self-host if traffic grows 2×" if break_even < 0.8 else
        "Stick with API — not enough scale yet"
    )
    return {
        "monthly_gpu_cost": f"${monthly_gpu_cost:.0f}",
        "monthly_api_spend": f"${monthly_api_spend_usd:.0f}",
        "gpu_as_pct_of_api": f"{break_even*100:.0f}%",
        "recommendation": recommendation,
    }

# Example: $5k/month on OpenAI, $2.50/hr A10G
print(break_even_analysis(5000))
# → gpu_cost ~$1,260/mo = 25% of API spend → self-host now
Cost Dashboard (Grafana)
# Emit cost metrics to Prometheus
from prometheus_client import Counter, Histogram

# Dollar spend, attributable per model / team / task type.
llm_cost_total = Counter(
    "llm_cost_usd_total",
    "Total LLM spend in USD",
    ["model", "team", "task_type"],
)
# Token volume, split by kind of token.
llm_tokens_total = Counter(
    "llm_tokens_total",
    "Total tokens used",
    ["model", "token_type"],  # token_type: prompt, completion, cached
)

def track_call(model, team, task_type, response):
    """Record cost and token usage of one LLM response into Prometheus."""
    usage = response.usage
    llm_cost_total.labels(model=model, team=team, task_type=task_type).inc(
        calculate_cost(model, usage)
    )
    for token_type, count in (
        ("prompt", usage.prompt_tokens),
        ("completion", usage.completion_tokens),
    ):
        llm_tokens_total.labels(model=model, token_type=token_type).inc(count)
Best Practices
- Use `gpt-4o-mini` or `claude-haiku` for 80% of tasks — they're 10–30× cheaper.
- Enable prompt caching for system prompts >1,024 tokens (explicit `cache_control` on Anthropic; automatic on OpenAI).
- Audit your top 5 prompts by token count — compress or cache them.
- Set hard budget limits with LiteLLM virtual keys before costs spiral.
- Self-host 7B–8B models when monthly API spend exceeds $2k/month.
Related Skills
- llm-gateway - Centralized cost control
- llm-caching - Semantic caching patterns
- vllm-server - Self-hosted inference
- agent-observability - Token and cost telemetry
Weekly Installs
3
Repository
bagelhole/devop…t-skills
GitHub Stars
13
First Seen
6 days ago
Security Audits
Installed on
opencode — 3
antigravity — 3
claude-code — 3
github-copilot — 3
codex — 3
zencoder — 3