LLM Application Patterns
Production-ready patterns for building LLM applications, inspired by Dify and industry best practices.
When to Use This Skill
Use this skill when:
- Designing LLM-powered applications
- Implementing RAG (Retrieval-Augmented Generation)
- Building AI agents with tools
- Setting up LLMOps monitoring
- Choosing between agent architectures
1. RAG Pipeline Architecture
Overview
RAG (Retrieval-Augmented Generation) grounds LLM responses in your own data: relevant documents are retrieved at query time and injected into the prompt as context.
┌──────────────┐      ┌──────────────┐      ┌──────────────┐
│    Ingest    │─────▶│   Retrieve   │─────▶│   Generate   │
│  Documents   │      │   Context    │      │   Response   │
└──────────────┘      └──────────────┘      └──────────────┘
       │                     │                     │
       ▼                     ▼                     ▼
 ┌───────────┐        ┌────────────┐        ┌────────────┐
 │ Chunking  │        │   Vector   │        │    LLM     │
 │ Embedding │        │   Search   │        │ + Context  │
 └───────────┘        └────────────┘        └────────────┘
1.1 Document Ingestion
# Chunking strategies
class ChunkingStrategy:
# Fixed-size chunks (simple but may break context)
FIXED_SIZE = "fixed_size" # e.g., 512 tokens
# Semantic chunking (preserves meaning)
SEMANTIC = "semantic" # Split on paragraphs/sections
# Recursive splitting (tries multiple separators)
RECURSIVE = "recursive" # ["\n\n", "\n", " ", ""]
# Document-aware (respects structure)
DOCUMENT_AWARE = "document_aware" # Headers, lists, etc.
# Recommended settings
CHUNK_CONFIG = {
"chunk_size": 512, # tokens
"chunk_overlap": 50, # token overlap between chunks
"separators": ["\n\n", "\n", ". ", " "],
}
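The RECURSIVE strategy can be illustrated with a short, self-contained splitter. This is a minimal sketch (sizes are measured in characters rather than tokens, and overlap is only applied in the hard-cut fallback); production code would typically use a tokenizer-aware splitter.

def recursive_split(text, chunk_size=512, overlap=50,
                    separators=("\n\n", "\n", ". ", " ")):
    """Illustrative recursive splitter; not a drop-in production chunker."""
    if len(text) <= chunk_size:
        return [text]
    # Try the coarsest separator that actually appears in the text
    for sep in separators:
        if sep in text:
            parts = text.split(sep)
            break
    else:
        # No separator present: fall back to hard cuts with overlap
        step = chunk_size - overlap
        return [text[i:i + chunk_size] for i in range(0, len(text), step)]

    chunks, current = [], ""
    for part in parts:
        candidate = f"{current}{sep}{part}" if current else part
        if len(candidate) <= chunk_size:
            current = candidate
            continue
        if current:
            chunks.append(current)
        if len(part) > chunk_size:
            # Recurse into oversized pieces using the finer separators
            chunks.extend(recursive_split(part, chunk_size, overlap, separators))
            current = ""
        else:
            current = part
    if current:
        chunks.append(current)
    return chunks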
1.2 Embedding & Storage
# Vector database selection
VECTOR_DB_OPTIONS = {
"pinecone": {
"use_case": "Production, managed service",
"scale": "Billions of vectors",
"features": ["Hybrid search", "Metadata filtering"]
},
"weaviate": {
"use_case": "Self-hosted, multi-modal",
"scale": "Millions of vectors",
"features": ["GraphQL API", "Modules"]
},
"chromadb": {
"use_case": "Development, prototyping",
"scale": "Thousands of vectors",
"features": ["Simple API", "In-memory option"]
},
"pgvector": {
"use_case": "Existing Postgres infrastructure",
"scale": "Millions of vectors",
"features": ["SQL integration", "ACID compliance"]
}
}
# Embedding model selection
EMBEDDING_MODELS = {
"openai/text-embedding-3-small": {
"dimensions": 1536,
"cost": "$0.02/1M tokens",
"quality": "Good for most use cases"
},
"openai/text-embedding-3-large": {
"dimensions": 3072,
"cost": "$0.13/1M tokens",
"quality": "Best for complex queries"
},
"local/bge-large": {
"dimensions": 1024,
"cost": "Free (compute only)",
"quality": "Comparable to OpenAI small"
}
}
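To make the pipeline concrete, here is a minimal ingestion-and-query sketch using ChromaDB (the prototyping option above). The function names and the in-memory client are illustrative, and API details can vary between chromadb versions.

import chromadb

client = chromadb.Client()  # in-memory; chromadb.PersistentClient(path=...) persists to disk
collection = client.get_or_create_collection("docs")

def ingest(chunks: list[str], source: str) -> None:
    """Store chunks with metadata; Chroma embeds them with its default model."""
    collection.add(
        ids=[f"{source}-{i}" for i in range(len(chunks))],
        documents=chunks,
        metadatas=[{"source": source, "chunk": i} for i in range(len(chunks))],
    )

def query(text: str, top_k: int = 5) -> list[str]:
    result = collection.query(query_texts=[text], n_results=top_k)
    return result["documents"][0]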
1.3 Retrieval Strategies
# Basic semantic search
def semantic_search(query: str, top_k: int = 5):
query_embedding = embed(query)
results = vector_db.similarity_search(
query_embedding,
top_k=top_k
)
return results
# Hybrid search (semantic + keyword)
def hybrid_search(query: str, top_k: int = 5, alpha: float = 0.5):
"""
alpha=1.0: Pure semantic
alpha=0.0: Pure keyword (BM25)
alpha=0.5: Balanced
"""
    semantic_results = semantic_search(query, top_k)
keyword_results = bm25_search(query)
# Reciprocal Rank Fusion
return rrf_merge(semantic_results, keyword_results, alpha)
# Multi-query retrieval
def multi_query_retrieval(query: str):
"""Generate multiple query variations for better recall"""
queries = llm.generate_query_variations(query, n=3)
all_results = []
for q in queries:
all_results.extend(semantic_search(q))
return deduplicate(all_results)
# Contextual compression
def compressed_retrieval(query: str):
"""Retrieve then compress to relevant parts only"""
docs = semantic_search(query, top_k=10)
compressed = llm.extract_relevant_parts(docs, query)
return compressed
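The rrf_merge helper used by hybrid_search above is left undefined; a minimal sketch of weighted Reciprocal Rank Fusion could look like this (it assumes each result exposes a stable doc.id and uses the conventional k = 60 constant):

def rrf_merge(semantic_results, keyword_results, alpha=0.5, k=60):
    """Blend two ranked lists; alpha weights semantic vs. keyword ranks."""
    scores, docs = {}, {}
    for rank, doc in enumerate(semantic_results):
        scores[doc.id] = scores.get(doc.id, 0.0) + alpha / (k + rank + 1)
        docs[doc.id] = doc
    for rank, doc in enumerate(keyword_results):
        scores[doc.id] = scores.get(doc.id, 0.0) + (1 - alpha) / (k + rank + 1)
        docs[doc.id] = doc
    ranked_ids = sorted(scores, key=scores.get, reverse=True)
    return [docs[doc_id] for doc_id in ranked_ids]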
1.4 Generation with Context
RAG_PROMPT_TEMPLATE = """
Answer the user's question based ONLY on the following context.
If the context doesn't contain enough information, say "I don't have enough information to answer that."
Context:
{context}
Question: {question}
Answer:"""
def generate_with_rag(question: str):
# Retrieve
context_docs = hybrid_search(question, top_k=5)
context = "\n\n".join([doc.content for doc in context_docs])
# Generate
prompt = RAG_PROMPT_TEMPLATE.format(
context=context,
question=question
)
response = llm.generate(prompt)
# Return with citations
return {
"answer": response,
"sources": [doc.metadata for doc in context_docs]
}
2. Agent Architectures
2.1 ReAct Pattern (Reasoning + Acting)
Thought: I need to search for information about X
Action: search("X")
Observation: [search results]
Thought: Based on the results, I should...
Action: calculate(...)
Observation: [calculation result]
Thought: I now have enough information
Action: final_answer("The answer is...")
REACT_PROMPT = """
You are an AI assistant that can use tools to answer questions.
Available tools:
{tools_description}
Use this format:
Thought: [your reasoning about what to do next]
Action: [tool_name(arguments)]
Observation: [tool result - this will be filled in]
... (repeat Thought/Action/Observation as needed)
Thought: I have enough information to answer
Final Answer: [your final response]
Question: {question}
"""
class ReActAgent:
def __init__(self, tools: list, llm):
self.tools = {t.name: t for t in tools}
self.llm = llm
self.max_iterations = 10
def run(self, question: str) -> str:
prompt = REACT_PROMPT.format(
tools_description=self._format_tools(),
question=question
)
for _ in range(self.max_iterations):
response = self.llm.generate(prompt)
if "Final Answer:" in response:
return self._extract_final_answer(response)
action = self._parse_action(response)
observation = self._execute_tool(action)
            # Keep the model's Thought/Action text in the transcript before the observation
            prompt += f"\n{response}\nObservation: {observation}\n"
return "Max iterations reached"
2.2 Function Calling Pattern
# Define tools as functions with schemas
TOOLS = [
{
"name": "search_web",
"description": "Search the web for current information",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search query"
}
},
"required": ["query"]
}
},
{
"name": "calculate",
"description": "Perform mathematical calculations",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Math expression to evaluate"
}
},
"required": ["expression"]
}
}
]
class FunctionCallingAgent:
def run(self, question: str) -> str:
messages = [{"role": "user", "content": question}]
while True:
response = self.llm.chat(
messages=messages,
tools=TOOLS,
tool_choice="auto"
)
            if response.tool_calls:
                # Keep the assistant turn that requested the tools in the
                # history before appending the tool results
                messages.append({
                    "role": "assistant",
                    "content": response.content,
                    "tool_calls": response.tool_calls
                })
                for tool_call in response.tool_calls:
                    result = self._execute_tool(
                        tool_call.name,
                        tool_call.arguments
                    )
                    messages.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "content": str(result)
                    })
else:
return response.content
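The _execute_tool helper is assumed above. With OpenAI-style function calling, arguments usually arrive as a JSON string, so a minimal dispatcher might look like this (search_web and calculate are hypothetical implementations of the schemas above):

import json

TOOL_IMPLEMENTATIONS = {
    "search_web": search_web,   # hypothetical functions matching the tool schemas
    "calculate": calculate,
}

def execute_tool(name: str, arguments: str):
    """Decode the JSON arguments and dispatch to the registered function."""
    kwargs = json.loads(arguments)
    return TOOL_IMPLEMENTATIONS[name](**kwargs)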
2.3 Plan-and-Execute Pattern
class PlanAndExecuteAgent:
"""
1. Create a plan (list of steps)
2. Execute each step
3. Replan if needed
"""
def run(self, task: str) -> str:
# Planning phase
plan = self.planner.create_plan(task)
# Returns: ["Step 1: ...", "Step 2: ...", ...]
results = []
for step in plan:
# Execute each step
result = self.executor.execute(step, context=results)
results.append(result)
# Check if replan needed
if self._needs_replan(task, results):
new_plan = self.planner.replan(
task,
completed=results,
remaining=plan[len(results):]
)
plan = new_plan
# Synthesize final answer
return self.synthesizer.summarize(task, results)
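The planner itself is abstract above. One simple way to implement create_plan is to prompt for a numbered list and parse it; the prompt wording and parsing here are illustrative only.

PLANNER_PROMPT = """
Break the task below into a short sequence of concrete steps.
Return one step per line, numbered "1.", "2.", and so on.

Task: {task}
"""

def create_plan(task: str) -> list[str]:
    raw = llm.generate(PLANNER_PROMPT.format(task=task))
    steps = []
    for line in raw.splitlines():
        line = line.strip()
        if line and line[0].isdigit() and "." in line:
            steps.append(line.split(".", 1)[1].strip())
    return steps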
2.4 Multi-Agent Collaboration
class AgentTeam:
"""
Specialized agents collaborating on complex tasks
"""
def __init__(self):
self.agents = {
"researcher": ResearchAgent(),
"analyst": AnalystAgent(),
"writer": WriterAgent(),
"critic": CriticAgent()
}
self.coordinator = CoordinatorAgent()
def solve(self, task: str) -> str:
# Coordinator assigns subtasks
assignments = self.coordinator.decompose(task)
results = {}
for assignment in assignments:
agent = self.agents[assignment.agent]
result = agent.execute(
assignment.subtask,
context=results
)
results[assignment.id] = result
# Critic reviews
critique = self.agents["critic"].review(results)
if critique.needs_revision:
# Iterate with feedback
return self.solve_with_feedback(task, results, critique)
return self.coordinator.synthesize(results)
3. Prompt IDE Patterns
3.1 Prompt Templates with Variables
class PromptTemplate:
def __init__(self, template: str, variables: list[str]):
self.template = template
self.variables = variables
def format(self, **kwargs) -> str:
# Validate all variables provided
missing = set(self.variables) - set(kwargs.keys())
if missing:
raise ValueError(f"Missing variables: {missing}")
return self.template.format(**kwargs)
def with_examples(self, examples: list[dict]) -> str:
"""Add few-shot examples"""
example_text = "\n\n".join([
f"Input: {ex['input']}\nOutput: {ex['output']}"
for ex in examples
])
return f"{example_text}\n\n{self.template}"
# Usage
summarizer = PromptTemplate(
template="Summarize the following text in {style} style:\n\n{text}",
variables=["style", "text"]
)
prompt = summarizer.format(
style="professional",
text="Long article content..."
)
3.2 Prompt Versioning & A/B Testing
import hashlib
from datetime import datetime

class PromptRegistry:
def __init__(self, db):
self.db = db
def register(self, name: str, template: str, version: str):
"""Store prompt with version"""
self.db.save({
"name": name,
"template": template,
"version": version,
"created_at": datetime.now(),
"metrics": {}
})
def get(self, name: str, version: str = "latest") -> str:
"""Retrieve specific version"""
return self.db.get(name, version)
def ab_test(self, name: str, user_id: str) -> str:
"""Return variant based on user bucket"""
variants = self.db.get_all_versions(name)
        # hash() is salted per process; use a stable hash so the same user always gets the same variant
        bucket = int(hashlib.sha256(user_id.encode()).hexdigest(), 16) % len(variants)
return variants[bucket]
def record_outcome(self, prompt_id: str, outcome: dict):
"""Track prompt performance"""
self.db.update_metrics(prompt_id, outcome)
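Illustrative usage, assuming db is any store that implements save / get / get_all_versions / update_metrics and that prompt IDs follow a name:version convention (both are assumptions, not part of the class above):

registry = PromptRegistry(db=prompt_store)   # prompt_store is a hypothetical backend
registry.register("summarizer", "Summarize:\n\n{text}", version="v1")
registry.register("summarizer", "Summarize in 3 bullet points:\n\n{text}", version="v2")

template = registry.ab_test("summarizer", user_id="user-123")
answer = llm.generate(template.format(text=article))
registry.record_outcome("summarizer:v2", {"thumbs_up": True})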
3.3 Prompt Chaining
class PromptChain:
"""
Chain prompts together, passing output as input to next
"""
def __init__(self, steps: list[dict]):
self.steps = steps
def run(self, initial_input: str) -> dict:
context = {"input": initial_input}
results = []
for step in self.steps:
prompt = step["prompt"].format(**context)
output = llm.generate(prompt)
# Parse output if needed
if step.get("parser"):
output = step["parser"](output)
context[step["output_key"]] = output
results.append({
"step": step["name"],
"output": output
})
return {
"final_output": context[self.steps[-1]["output_key"]],
"intermediate_results": results
}
# Example: Research β Analyze β Summarize
chain = PromptChain([
{
"name": "research",
"prompt": "Research the topic: {input}",
"output_key": "research"
},
{
"name": "analyze",
"prompt": "Analyze these findings:\n{research}",
"output_key": "analysis"
},
{
"name": "summarize",
"prompt": "Summarize this analysis in 3 bullet points:\n{analysis}",
"output_key": "summary"
}
])
4. LLMOps & Observability
4.1 Metrics to Track
LLM_METRICS = {
# Performance
"latency_p50": "50th percentile response time",
"latency_p99": "99th percentile response time",
"tokens_per_second": "Generation speed",
# Quality
"user_satisfaction": "Thumbs up/down ratio",
"task_completion": "% tasks completed successfully",
"hallucination_rate": "% responses with factual errors",
# Cost
"cost_per_request": "Average $ per API call",
"tokens_per_request": "Average tokens used",
"cache_hit_rate": "% requests served from cache",
# Reliability
"error_rate": "% failed requests",
"timeout_rate": "% requests that timed out",
"retry_rate": "% requests needing retry"
}
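A minimal sketch of computing some of these metrics from a request log; records is assumed to be a list of dicts carrying the fields emitted by the logger in 4.2.

import statistics

def summarize_metrics(records: list[dict]) -> dict:
    latencies = sorted(r["latency_ms"] for r in records)
    percentiles = statistics.quantiles(latencies, n=100)  # 99 cut points
    return {
        "latency_p50": percentiles[49],
        "latency_p99": percentiles[98],
        "cost_per_request": sum(r["cost_usd"] for r in records) / len(records),
        "tokens_per_request": sum(r["total_tokens"] for r in records) / len(records),
        "error_rate": sum(1 for r in records if r.get("error")) / len(records),
    }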
4.2 Logging & Tracing
import json
import logging
from datetime import datetime
from opentelemetry import trace
tracer = trace.get_tracer(__name__)
class LLMLogger:
def log_request(self, request_id: str, data: dict):
"""Log LLM request for debugging and analysis"""
log_entry = {
"request_id": request_id,
"timestamp": datetime.now().isoformat(),
"model": data["model"],
"prompt": data["prompt"][:500], # Truncate for storage
"prompt_tokens": data["prompt_tokens"],
"temperature": data.get("temperature", 1.0),
"user_id": data.get("user_id"),
}
logging.info(f"LLM_REQUEST: {json.dumps(log_entry)}")
def log_response(self, request_id: str, data: dict):
"""Log LLM response"""
log_entry = {
"request_id": request_id,
"completion_tokens": data["completion_tokens"],
"total_tokens": data["total_tokens"],
"latency_ms": data["latency_ms"],
"finish_reason": data["finish_reason"],
"cost_usd": self._calculate_cost(data),
}
logging.info(f"LLM_RESPONSE: {json.dumps(log_entry)}")
# Distributed tracing
@tracer.start_as_current_span("llm_call")
def call_llm(prompt: str) -> str:
span = trace.get_current_span()
span.set_attribute("prompt.length", len(prompt))
response = llm.generate(prompt)
span.set_attribute("response.length", len(response))
span.set_attribute("tokens.total", response.usage.total_tokens)
return response.content
4.3 Evaluation Framework
class LLMEvaluator:
"""
Evaluate LLM outputs for quality
"""
def evaluate_response(self,
question: str,
response: str,
ground_truth: str = None) -> dict:
scores = {}
# Relevance: Does it answer the question?
scores["relevance"] = self._score_relevance(question, response)
# Coherence: Is it well-structured?
scores["coherence"] = self._score_coherence(response)
# Groundedness: Is it based on provided context?
scores["groundedness"] = self._score_groundedness(response)
# Accuracy: Does it match ground truth?
if ground_truth:
scores["accuracy"] = self._score_accuracy(response, ground_truth)
# Harmfulness: Is it safe?
scores["safety"] = self._score_safety(response)
return scores
def run_benchmark(self, test_cases: list[dict]) -> dict:
"""Run evaluation on test set"""
results = []
for case in test_cases:
response = llm.generate(case["prompt"])
scores = self.evaluate_response(
question=case["prompt"],
response=response,
ground_truth=case.get("expected")
)
results.append(scores)
return self._aggregate_scores(results)
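The _score_* helpers are left undefined above; one common approach is an LLM-as-judge prompt. A sketch of how _score_relevance might work (the 1-5 scale and prompt wording are illustrative, not a standard):

RELEVANCE_JUDGE_PROMPT = """
Rate how well the answer addresses the question on a scale from 1 to 5.
Reply with the number only.

Question: {question}
Answer: {response}
"""

def score_relevance(question: str, response: str) -> float:
    raw = llm.generate(RELEVANCE_JUDGE_PROMPT.format(question=question, response=response))
    try:
        return int(raw.strip()) / 5.0   # normalize to 0-1
    except ValueError:
        return 0.0                      # unparsable judge output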
5. Production Patterns
5.1 Caching Strategy
import hashlib
import json
class LLMCache:
def __init__(self, redis_client, ttl_seconds=3600):
self.redis = redis_client
self.ttl = ttl_seconds
def _cache_key(self, prompt: str, model: str, **kwargs) -> str:
"""Generate deterministic cache key"""
content = f"{model}:{prompt}:{json.dumps(kwargs, sort_keys=True)}"
return hashlib.sha256(content.encode()).hexdigest()
def get_or_generate(self, prompt: str, model: str, **kwargs) -> str:
key = self._cache_key(prompt, model, **kwargs)
# Check cache
cached = self.redis.get(key)
if cached:
return cached.decode()
# Generate
response = llm.generate(prompt, model=model, **kwargs)
# Cache (only cache deterministic outputs)
if kwargs.get("temperature", 1.0) == 0:
self.redis.setex(key, self.ttl, response)
return response
5.2 Rate Limiting & Retry
import time
from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
class RateLimiter:
def __init__(self, requests_per_minute: int):
self.rpm = requests_per_minute
self.timestamps = []
def acquire(self):
"""Wait if rate limit would be exceeded"""
now = time.time()
# Remove old timestamps
self.timestamps = [t for t in self.timestamps if now - t < 60]
if len(self.timestamps) >= self.rpm:
sleep_time = 60 - (now - self.timestamps[0])
time.sleep(sleep_time)
self.timestamps.append(time.time())
# Retry with exponential backoff; only rate limits and 5xx server errors
# are retried, client errors fail immediately
class RetryableLLMError(Exception):
    pass

@retry(
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type(RetryableLLMError)
)
def call_llm_with_retry(prompt: str) -> str:
    try:
        return llm.generate(prompt)
    except RateLimitError as e:
        raise RetryableLLMError(str(e)) from e      # retry rate limits
    except APIError as e:
        if e.status_code >= 500:
            raise RetryableLLMError(str(e)) from e  # retry server errors
        raise  # don't retry client errors
5.3 Fallback Strategy
class LLMWithFallback:
def __init__(self, primary: str, fallbacks: list[str]):
self.primary = primary
self.fallbacks = fallbacks
def generate(self, prompt: str, **kwargs) -> str:
models = [self.primary] + self.fallbacks
for model in models:
try:
return llm.generate(prompt, model=model, **kwargs)
except (RateLimitError, APIError) as e:
logging.warning(f"Model {model} failed: {e}")
continue
raise AllModelsFailedError("All models exhausted")
# Usage
llm_client = LLMWithFallback(
primary="gpt-4-turbo",
fallbacks=["gpt-3.5-turbo", "claude-3-sonnet"]
)
Architecture Decision Matrix
| Pattern | Use When | Complexity | Cost |
|---|---|---|---|
| Simple RAG | FAQ, docs search | Low | Low |
| Hybrid RAG | Mixed queries | Medium | Medium |
| ReAct Agent | Multi-step tasks | Medium | Medium |
| Function Calling | Structured tools | Low | Low |
| Plan-Execute | Complex tasks | High | High |
| Multi-Agent | Research tasks | Very High | Very High |
Install

$ npx skills add sickn33/antigravity-awesome-skills --skill "llm-app-patterns"