model-routing-strategy
SKILL.md
Model Routing Strategy
Dynamically select the optimal model for each task to balance quality, cost, and latency.
When to Use
- Building applications using multiple LLM providers
- Optimizing costs while maintaining quality
- Needing different model capabilities for different tasks
- Implementing fallback strategies
- A/B testing model performance
Model Comparison Matrix
Capability vs Cost
| Capability | Best Models | Cost Tier |
|---|---|---|
| Complex reasoning | Claude Opus, o1 | $$$ |
| General tasks | Claude Sonnet, GPT-4o | $$ |
| Simple tasks | Claude Haiku, GPT-4o-mini | $ |
| Code generation | Claude Sonnet, GPT-4o | $$ |
| Creative writing | Claude Opus, GPT-4 | $$$ |
| Extraction/Classification | Claude Haiku, GPT-4o-mini | $ |
Latency Comparison
| Model | Typical Latency (time to first byte, TTFB) |
|---|---|
| Claude Haiku | 200-400ms |
| Claude Sonnet | 400-800ms |
| Claude Opus | 800-1500ms |
| GPT-4o-mini | 200-400ms |
| GPT-4o | 400-700ms |
| GPT-4 Turbo | 500-1000ms |
Routing Strategies
Strategy 1: Complexity-Based Routing
// Coarse difficulty tier for a task; selectModel maps each tier to a model name.
type Complexity = 'simple' | 'medium' | 'complex';
// A unit of work to be routed: the prompt text plus caller-declared requirements.
interface Task {
// Raw prompt text; assessComplexity lower-cases it before pattern matching.
prompt: string;
requirements: {
// When true, assessComplexity returns 'complex' regardless of the prompt wording.
needsReasoning: boolean;
// Same effect as needsReasoning: forces the 'complex' tier.
needsCreativity: boolean;
// NOTE(review): not read by assessComplexity as shown — confirm where this is consumed.
needsAccuracy: boolean;
// NOTE(review): by its name a latency ceiling in milliseconds; not read by the code shown — confirm.
maxLatencyMs?: number;
// NOTE(review): by its name a cost ceiling in USD; not read by the code shown — confirm.
maxCostUSD?: number;
};
}
/**
 * Buckets a task into a complexity tier using keyword heuristics on the prompt
 * and the caller-supplied requirement flags.
 *
 * Precedence: any "complex" signal (a complex pattern hit, `needsReasoning`, or
 * `needsCreativity`) wins over a "simple" pattern hit; anything else is 'medium'.
 * Matching is case-insensitive because the prompt is lower-cased first. The
 * patterns use `.` without the dotAll flag, so a phrase split across a line
 * break does not match.
 *
 * @param task - Task whose prompt and requirement flags are inspected.
 * @returns 'complex', 'simple', or 'medium'.
 */
function assessComplexity(task: Task): Complexity {
  const text = task.prompt.toLowerCase();
  const matchesAny = (patterns: RegExp[]): boolean =>
    patterns.some(pattern => pattern.test(text));

  // Phrases that suggest multi-step, open-ended, or high-stakes work.
  const hardSignals = [
    /analyze.*and.*compare/,
    /explain.*step.*by.*step/,
    /write.*comprehensive/,
    /evaluate.*trade.*offs/,
    /design.*architecture/,
    /debug.*complex/,
    /review.*security/
  ];

  // Phrases that suggest mechanical, narrowly scoped work.
  const easySignals = [
    /summarize.*briefly/,
    /extract.*from/,
    /classify.*as/,
    /format.*as.*json/,
    /translate.*to/,
    /fix.*typo/
  ];

  // Pattern check runs first so the requirement flags are only read when needed.
  const flaggedHard = (): boolean =>
    task.requirements.needsReasoning || task.requirements.needsCreativity;
  if (matchesAny(hardSignals) || flaggedHard()) {
    return 'complex';
  }

  return matchesAny(easySignals) ? 'simple' : 'medium';
}
/**
 * Maps a complexity tier to the model name used for that tier.
 *
 * The price notes are copied from the original table: input/output cost per
 * 1M tokens.
 *
 * @param complexity - Tier produced by the complexity assessment.
 * @returns The model name configured for that tier.
 */
function selectModel(complexity: Complexity): string {
  const cheapest = 'claude-3-haiku'; // $0.25/$1.25 per 1M
  const balanced = 'claude-3.5-sonnet'; // $3/$15 per 1M
  const strongest = 'claude-3-opus'; // $15/$75 per 1M

  const tierToModel: Record<Complexity, string> = {
    simple: cheapest,
    medium: balanced,
    complex: strongest
  };
  return tierToModel[complexity];
}
Strategy 2: Cost-Constrained Routing
// Static description of one routable model: identity, pricing, and expected quality/latency.
interface ModelOption {
name: string;
provider: string;
// Price per 1,000 input tokens (presumably USD — confirm); selectWithinBudget multiplies it by tokens / 1000.
inputCostPer1K: number;
// Price per 1,000 output tokens (presumably USD — confirm).
outputCostPer1K: number;
qualityScore: number; // 0-1; the selectors sort on this in descending order, so higher is preferred
// Compared against the caller's latency ceiling in selectForLatency.
avgLatencyMs: number;
}
// Catalogue consulted by selectWithinBudget and selectForLatency.
// Costs are per 1K tokens, so 0.015 here corresponds to 15 per 1M tokens.
// NOTE(review): quality scores and latencies look like hand-picked estimates — confirm their source
// and refresh them together with pricing.
const models: ModelOption[] = [
{ name: 'claude-3-opus', provider: 'anthropic', inputCostPer1K: 0.015, outputCostPer1K: 0.075, qualityScore: 0.98, avgLatencyMs: 1200 },
{ name: 'claude-3.5-sonnet', provider: 'anthropic', inputCostPer1K: 0.003, outputCostPer1K: 0.015, qualityScore: 0.95, avgLatencyMs: 600 },
{ name: 'claude-3-haiku', provider: 'anthropic', inputCostPer1K: 0.00025, outputCostPer1K: 0.00125, qualityScore: 0.85, avgLatencyMs: 300 },
{ name: 'gpt-4o', provider: 'openai', inputCostPer1K: 0.005, outputCostPer1K: 0.015, qualityScore: 0.94, avgLatencyMs: 550 },
{ name: 'gpt-4o-mini', provider: 'openai', inputCostPer1K: 0.00015, outputCostPer1K: 0.0006, qualityScore: 0.82, avgLatencyMs: 300 },
];
/**
 * Picks the highest-quality model whose estimated request cost fits a budget.
 *
 * Cost is estimated as
 * `(inputTokens / 1000) * inputCostPer1K + (outputTokens / 1000) * outputCostPer1K`.
 * When several viable models share the same quality score, the cheaper one for
 * this request wins instead of whichever happens to be listed first.
 *
 * @param estimatedInputTokens - Expected prompt size in tokens.
 * @param estimatedOutputTokens - Expected completion size in tokens.
 * @param maxCost - Budget for the request, in the same unit as the cost fields.
 * @param minQuality - Minimum acceptable `qualityScore` (default 0.8).
 * @param candidates - Models to choose from; defaults to the module-level
 *   `models` table. The array is never mutated.
 * @returns The best viable model, or `null` when none fits the budget and
 *   quality floor.
 */
function selectWithinBudget(
  estimatedInputTokens: number,
  estimatedOutputTokens: number,
  maxCost: number,
  minQuality: number = 0.8,
  candidates: readonly ModelOption[] = models
): ModelOption | null {
  const estimateCost = (model: ModelOption): number =>
    (estimatedInputTokens / 1000) * model.inputCostPer1K +
    (estimatedOutputTokens / 1000) * model.outputCostPer1K;

  // map() yields a fresh array, so the sort below cannot reorder the caller's list.
  const viable = candidates
    .map(model => ({ model, cost: estimateCost(model) }))
    .filter(({ model, cost }) => cost <= maxCost && model.qualityScore >= minQuality)
    // Best quality first; cheaper request cost breaks quality ties.
    .sort((a, b) => b.model.qualityScore - a.model.qualityScore || a.cost - b.cost);

  return viable.length > 0 ? viable[0].model : null;
}
Strategy 3: Latency-Optimized Routing
/**
 * Picks the highest-quality model whose average latency fits a ceiling.
 *
 * When several eligible models share the same quality score, the faster one
 * wins instead of whichever happens to be listed first.
 *
 * @param maxLatencyMs - Largest acceptable `avgLatencyMs`.
 * @param minQuality - Minimum acceptable `qualityScore` (default 0.8).
 * @param candidates - Models to choose from; defaults to the module-level
 *   `models` table. The array is never mutated.
 * @returns The best eligible model, or `null` when none qualifies.
 */
function selectForLatency(
  maxLatencyMs: number,
  minQuality: number = 0.8,
  candidates: readonly ModelOption[] = models
): ModelOption | null {
  // filter() yields a fresh array, so sorting it leaves the caller's list untouched.
  const eligible = candidates.filter(
    model => model.avgLatencyMs <= maxLatencyMs && model.qualityScore >= minQuality
  );
  if (eligible.length === 0) {
    return null;
  }

  // Best quality first; lower latency breaks quality ties.
  eligible.sort(
    (a, b) => b.qualityScore - a.qualityScore || a.avgLatencyMs - b.avgLatencyMs
  );
  return eligible[0];
}
// For real-time applications
const realtimeModel = selectForLatency(500, 0.8); // With the models table above: claude-3-haiku (0.85 outranks gpt-4o-mini's 0.82)
// For batch processing
const batchModel = selectForLatency(2000, 0.95); // With the models table above: claude-3-opus (0.98 outranks claude-3.5-sonnet's 0.95)
Strategy 4: Task-Type Routing
// Kinds of work that selectForTaskType can route.
type TaskType = 'code' | 'creative' | 'analysis' | 'extraction' | 'chat' | 'classification';
// Candidate models per task type. selectForTaskType indexes into each list by budget:
// 'high' -> index 0, 'medium' -> index 1, 'low' -> index 2.
// NOTE(review): the lists are not consistently ordered by price. For example 'code' has
// claude-3-opus at index 2 (picked for a 'low' budget) and 'extraction' has claude-3-haiku at
// index 0 (picked for a 'high' budget), while selectModel's price notes put opus as the most
// expensive tier and haiku as the cheapest. Confirm the intended ordering.
const taskModelMap: Record<TaskType, string[]> = {
code: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-opus'],
creative: ['claude-3-opus', 'gpt-4', 'claude-3.5-sonnet'],
analysis: ['claude-3-opus', 'o1', 'claude-3.5-sonnet'],
extraction: ['claude-3-haiku', 'gpt-4o-mini', 'claude-3.5-sonnet'],
chat: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-haiku'],
classification: ['claude-3-haiku', 'gpt-4o-mini', 'claude-3.5-sonnet']
};
/**
 * Chooses a model name for a task type, using the budget to decide how far down
 * that task type's candidate list to go.
 *
 * 'high' takes the first entry, 'medium' the second, 'low' the third; the
 * position is clamped to the last entry when the list is shorter.
 *
 * NOTE(review): this only yields cheaper models for lower budgets if each
 * candidate list is ordered from most to least expensive — confirm against
 * `taskModelMap`.
 *
 * @param taskType - Key into `taskModelMap`.
 * @param budget - Budget level that selects the list position.
 * @returns The model name at the chosen position.
 */
function selectForTaskType(taskType: TaskType, budget: 'low' | 'medium' | 'high'): string {
  // List position requested by each budget level.
  const positionByBudget = { high: 0, medium: 1, low: 2 };

  const ranked = taskModelMap[taskType];
  const lastPosition = ranked.length - 1;
  const position = Math.min(positionByBudget[budget], lastPosition);
  return ranked[position];
}
Fallback Chains
// An ordered list of models to try, plus pacing settings used between attempts.
interface FallbackChain {
// Model tried first.
primary: string;
// Models tried in order after the primary fails.
fallbacks: string[];
retryConfig: {
// NOTE(review): executeWithFallback as shown never reads this field — confirm its intended meaning.
maxRetries: number;
// Base delay; executeWithFallback passes backoffMs * (attempt index + 1) to sleep between models.
backoffMs: number;
};
}
// Predefined fallback chains, keyed by routing goal.
const chains: Record<string, FallbackChain> = {
// Starts at claude-3-opus and ends at claude-3-haiku, crossing providers along the way.
highQuality: {
primary: 'claude-3-opus',
fallbacks: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-haiku'],
retryConfig: { maxRetries: 3, backoffMs: 1000 }
},
// Starts at claude-3-haiku and only reaches claude-3.5-sonnet if both cheaper models fail.
costEffective: {
primary: 'claude-3-haiku',
fallbacks: ['gpt-4o-mini', 'claude-3.5-sonnet'],
retryConfig: { maxRetries: 2, backoffMs: 500 }
}
};
/**
 * Tries the chain's primary model, then each fallback in order, and returns the
 * first successful response.
 *
 * Each model is attempted once. Between attempts the function waits
 * `backoffMs * (attempt index + 1)`, so the delay grows linearly. Every failure
 * is logged with its reason, and the last underlying error is attached to the
 * thrown error as `cause` so callers can see why the chain was exhausted.
 *
 * NOTE(review): `chain.retryConfig.maxRetries` is not consulted here — confirm
 * whether it should cap attempts or enable per-model retries.
 *
 * @param chain - Primary model, ordered fallbacks, and backoff settings.
 * @param prompt - Prompt forwarded unchanged to every model.
 * @returns The response from the first model that succeeds.
 * @throws Error with message 'All models in fallback chain failed' when every
 *   model fails; its `cause` property holds the last failure.
 */
async function executeWithFallback(
  chain: FallbackChain,
  prompt: string
): Promise<string> {
  const allModels = [chain.primary, ...chain.fallbacks];
  let lastError: unknown;

  for (let i = 0; i < allModels.length; i++) {
    const model = allModels[i];
    try {
      return await callModel(model, prompt);
    } catch (error) {
      lastError = error;
      const reason = error instanceof Error ? error.message : String(error);
      const hasNext = i < allModels.length - 1;

      if (hasNext) {
        console.warn(`Model ${model} failed (${reason}), trying fallback...`);
        // Linear backoff: wait longer before each successive fallback.
        await sleep(chain.retryConfig.backoffMs * (i + 1));
      } else {
        console.warn(`Model ${model} failed (${reason}); no fallbacks remain`);
      }
    }
  }

  // Attach the cause by assignment so this compiles against pre-ES2022 lib typings.
  const failure: Error & { cause?: unknown } = new Error('All models in fallback chain failed');
  failure.cause = lastError;
  throw failure;
}
Dynamic Routing with Learning
// Running statistics AdaptiveRouter keeps for one model.
interface ModelPerformance {
model: string;
// Exponential moving average of outcomes (success counted as 1, failure as 0).
successRate: number;
// Exponential moving average of the latencyMs values passed to recordOutcome.
avgLatency: number;
avgQuality: number; // From human feedback or automated eval
// Number of outcomes recorded for this model.
totalCalls: number;
}
/**
 * Tracks per-model outcome statistics and picks the candidate with the best
 * weighted score.
 */
class AdaptiveRouter {
  private performance = new Map<string, ModelPerformance>();

  /**
   * Folds one call outcome into the model's running statistics.
   *
   * All averages are exponential moving averages with a smoothing factor of
   * 0.1. A model seen for the first time starts from successRate 1,
   * avgQuality 0.9, and avgLatency equal to this call's latency.
   *
   * @param model - Model name the outcome belongs to.
   * @param success - Whether the call succeeded.
   * @param latencyMs - Observed latency for the call.
   * @param qualityScore - Optional quality sample; when omitted, avgQuality is
   *   left unchanged.
   */
  recordOutcome(
    model: string,
    success: boolean,
    latencyMs: number,
    qualityScore?: number
  ): void {
    const smoothing = 0.1;
    const blend = (sample: number, previous: number): number =>
      smoothing * sample + (1 - smoothing) * previous;

    let stats = this.performance.get(model);
    if (!stats) {
      // Optimistic starting point for a model with no history.
      stats = {
        model,
        successRate: 1,
        avgLatency: latencyMs,
        avgQuality: 0.9,
        totalCalls: 0
      };
    }

    stats.successRate = blend(success ? 1 : 0, stats.successRate);
    stats.avgLatency = blend(latencyMs, stats.avgLatency);
    if (qualityScore !== undefined) {
      stats.avgQuality = blend(qualityScore, stats.avgQuality);
    }
    stats.totalCalls += 1;

    this.performance.set(model, stats);
  }

  /**
   * Returns the candidate with the highest weighted score.
   *
   * Score = quality * avgQuality + reliability * successRate
   *       + latency * (1 - avgLatency / 5000).
   * The latency term is not clamped, so it turns negative once avgLatency
   * exceeds 5000. Candidates with no recorded outcomes are skipped; if none
   * has any, the first candidate is returned (undefined for an empty list).
   *
   * @param candidates - Model names to choose between.
   * @param weights - Relative importance of quality, latency, and reliability.
   * @returns The best-scoring measured candidate, else `candidates[0]`.
   */
  selectBest(
    candidates: string[],
    weights: { quality: number; latency: number; reliability: number }
  ): string {
    let winner = candidates[0];
    let winnerScore = -Infinity;

    for (const name of candidates) {
      const stats = this.performance.get(name);
      if (stats) {
        const latencyTerm = 1 - stats.avgLatency / 5000;
        const total =
          weights.quality * stats.avgQuality +
          weights.reliability * stats.successRate +
          weights.latency * latencyTerm;

        // Strict comparison keeps the earliest candidate on ties.
        if (total > winnerScore) {
          winnerScore = total;
          winner = name;
        }
      }
    }

    return winner;
  }
}
A/B Testing Models
// Definition of one A/B experiment over model variants.
interface ABTest {
// Key under which ModelABTester looks the test up.
name: string;
// selectVariant walks these in order, accumulating weight and comparing the running total
// against a per-user hash value.
variants: { model: string; weight: number }[];
// NOTE(review): not read by ModelABTester as shown — presumably the metric names the test tracks; confirm.
metrics: string[];
}
/**
 * Deterministic, weight-based assignment of users to model variants, plus
 * in-memory metric collection and aggregation.
 */
class ModelABTester {
  private tests = new Map<string, ABTest>();
  private results = new Map<string, { model: string; metric: string; value: number }[]>();

  /**
   * Registers (or replaces) a test so that selectVariant can find it by name.
   *
   * @param test - Test definition; weights are relative and need not sum to 1.
   * @throws Error when the test has no variants, a weight is negative or not
   *   finite, or all weights are zero.
   */
  registerTest(test: ABTest): void {
    const weights = test.variants.map(variant => variant.weight);
    const total = weights.reduce((sum, weight) => sum + weight, 0);
    const allValid = weights.every(weight => Number.isFinite(weight) && weight >= 0);
    if (weights.length === 0 || !allValid || total <= 0) {
      throw new Error(`Test ${test.name} needs at least one variant with a positive weight`);
    }
    this.tests.set(test.name, test);
  }

  /**
   * Picks the variant for a user. The same userId always maps to the same
   * variant for a given test definition.
   *
   * @param testName - Name of a registered test.
   * @param userId - Stable identifier used to bucket the user.
   * @returns The model name of the selected variant.
   * @throws Error when the test is unknown or has no positive total weight.
   */
  selectVariant(testName: string, userId: string): string {
    const test = this.tests.get(testName);
    if (!test) throw new Error(`Test ${testName} not found`);

    const totalWeight = test.variants.reduce((sum, variant) => sum + variant.weight, 0);
    if (!(totalWeight > 0)) {
      throw new Error(`Test ${testName} has no positive variant weights`);
    }

    // Deterministic selection based on userId. The [0, 1) hash is scaled onto
    // the total weight so weights work as fractions, percentages, or ratios.
    const point = this.hashUserId(userId) * totalWeight;
    let cumWeight = 0;
    for (const variant of test.variants) {
      cumWeight += variant.weight;
      if (point < cumWeight) {
        return variant.model;
      }
    }
    // Defensive: point is always below the final cumulative weight, so this is
    // not expected to run; it keeps the return type total.
    return test.variants[test.variants.length - 1].model;
  }

  /**
   * Stores one metric observation for a model within a test.
   *
   * @param testName - Test the observation belongs to.
   * @param model - Model that produced the observation.
   * @param metric - Metric name, used as the aggregation key.
   * @param value - Observed value.
   */
  recordMetric(testName: string, model: string, metric: string, value: number): void {
    const results = this.results.get(testName) || [];
    results.push({ model, metric, value });
    this.results.set(testName, results);
  }

  /**
   * Aggregates recorded observations by model and metric.
   *
   * @param testName - Test whose observations are summarized.
   * @returns `{ [model]: { [metric]: { avg, count } } }`; an empty object when
   *   nothing has been recorded for the test.
   */
  getResults(testName: string): Record<string, Record<string, { avg: number; count: number }>> {
    const results = this.results.get(testName) || [];

    // Accumulate sums in Maps first so arbitrary model/metric names are safe as keys.
    const totals = new Map<string, Map<string, { sum: number; count: number }>>();
    for (const { model, metric, value } of results) {
      let byMetric = totals.get(model);
      if (!byMetric) {
        byMetric = new Map<string, { sum: number; count: number }>();
        totals.set(model, byMetric);
      }
      const cell = byMetric.get(metric);
      if (cell) {
        cell.sum += value;
        cell.count += 1;
      } else {
        byMetric.set(metric, { sum: value, count: 1 });
      }
    }

    const summary: Record<string, Record<string, { avg: number; count: number }>> = {};
    for (const [model, byMetric] of totals) {
      const perMetric: Record<string, { avg: number; count: number }> = {};
      for (const [metric, { sum, count }] of byMetric) {
        perMetric[metric] = { avg: sum / count, count };
      }
      summary[model] = perMetric;
    }
    return summary;
  }

  /**
   * Maps a user id to a stable number in [0, 1) using the 32-bit FNV-1a hash.
   */
  private hashUserId(userId: string): number {
    let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
    for (let i = 0; i < userId.length; i++) {
      hash ^= userId.charCodeAt(i);
      hash = Math.imul(hash, 0x01000193); // FNV prime, with 32-bit wraparound
    }
    return (hash >>> 0) / 0x100000000;
  }
}
Best Practices
- Start simple - Complexity-based routing covers most cases
- Monitor quality - Cheaper isn't better if quality drops
- Use fallbacks - Always have a backup
- Test routing logic - Unit test your router
- Log decisions - Know why each model was chosen
- Review regularly - Model capabilities and pricing change
- Consider latency - User experience matters
Weekly Installs
3
Repository
latestaiagents/…t-skills
GitHub Stars
2
First Seen
Feb 4, 2026
Installed on
mcpjam: 3
claude-code: 3
replit: 3
junie: 3
windsurf: 3
zencoder: 3