agent-error-recovery
SKILL.md
Agent Error Recovery
Design fault-tolerant agent systems that recover gracefully from failures.
When to Use
- Building production-grade agent systems
- Agents need to handle API failures
- Implementing autonomous error recovery
- Designing resilient multi-agent workflows
- Setting up monitoring and alerting
Error Classification
/** Buckets an error into a recovery-strategy class. */
enum ErrorCategory {
  // Transient - retry likely to succeed
  RATE_LIMIT = 'rate_limit',
  TIMEOUT = 'timeout',
  NETWORK = 'network',
  SERVICE_UNAVAILABLE = 'service_unavailable',
  // Recoverable - different approach may work
  INVALID_INPUT = 'invalid_input',
  CONTEXT_OVERFLOW = 'context_overflow',
  TOOL_FAILURE = 'tool_failure',
  // Terminal - cannot proceed
  AUTHENTICATION = 'authentication',
  AUTHORIZATION = 'authorization',
  NOT_FOUND = 'not_found',
  VALIDATION = 'validation',
  // Unknown
  UNKNOWN = 'unknown'
}

/** Normalized error record produced by classifyError. */
interface AgentError {
  category: ErrorCategory;
  code: string;
  message: string;
  retryable: boolean; // whether a retry strategy may attempt this error again
  context: Record<string, unknown>; // strategy-specific extras (e.g. waitTime for rate limits)
  timestamp: Date;
  stackTrace?: string; // captured only for UNKNOWN errors
}

/**
 * Classify a raw Error into an AgentError by inspecting its message.
 *
 * Fixes vs. the previous version:
 * - matching is now case-insensitive ("Request Timeout" no longer falls
 *   through to UNKNOWN);
 * - NETWORK and SERVICE_UNAVAILABLE are now recognized — previously those
 *   categories existed but were never produced, so fallbacks keyed on
 *   SERVICE_UNAVAILABLE could never trigger.
 *
 * Unrecognized errors fall through to UNKNOWN with retryable=false and the
 * stack trace attached for debugging.
 */
function classifyError(error: Error): AgentError {
  const msg = error.message.toLowerCase();
  // Rate limits (HTTP 429)
  if (msg.includes('429') || msg.includes('rate limit')) {
    return {
      category: ErrorCategory.RATE_LIMIT,
      code: 'RATE_LIMITED',
      message: error.message,
      retryable: true,
      context: { waitTime: extractWaitTime(error) },
      timestamp: new Date()
    };
  }
  // Timeouts (generic message or Node ETIMEDOUT code)
  if (msg.includes('timeout') || msg.includes('etimedout')) {
    return {
      category: ErrorCategory.TIMEOUT,
      code: 'TIMEOUT',
      message: error.message,
      retryable: true,
      context: {},
      timestamp: new Date()
    };
  }
  // Service unavailable (HTTP 503)
  if (msg.includes('503') || msg.includes('service unavailable')) {
    return {
      category: ErrorCategory.SERVICE_UNAVAILABLE,
      code: 'SERVICE_UNAVAILABLE',
      message: error.message,
      retryable: true,
      context: {},
      timestamp: new Date()
    };
  }
  // Network failures (Node socket error codes)
  if (msg.includes('econnrefused') || msg.includes('econnreset') || msg.includes('network')) {
    return {
      category: ErrorCategory.NETWORK,
      code: 'NETWORK',
      message: error.message,
      retryable: true,
      context: {},
      timestamp: new Date()
    };
  }
  // Context overflow
  if (msg.includes('context length') || msg.includes('too long')) {
    return {
      category: ErrorCategory.CONTEXT_OVERFLOW,
      code: 'CONTEXT_OVERFLOW',
      message: error.message,
      retryable: true, // Can retry with truncated context
      context: {},
      timestamp: new Date()
    };
  }
  // Default: unknown, not retryable
  return {
    category: ErrorCategory.UNKNOWN,
    code: 'UNKNOWN',
    message: error.message,
    retryable: false,
    context: {},
    timestamp: new Date(),
    stackTrace: error.stack
  };
}
Recovery Strategies
Strategy 1: Retry with Backoff
/** Tunables for exponential-backoff retry. */
interface RetryConfig {
  maxAttempts: number;
  initialDelayMs: number;
  maxDelayMs: number;
  backoffMultiplier: number;
  jitterMs: number;
}

/**
 * Run `operation`, retrying retryable failures (per classifyError) with
 * exponentially growing delay plus random jitter, capped at maxDelayMs.
 * Non-retryable errors and the final failed attempt propagate unchanged.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  let mostRecentFailure: Error | undefined;
  let baseDelay = config.initialDelayMs;
  let attempt = 0;
  while (attempt < config.maxAttempts) {
    attempt++;
    try {
      return await operation();
    } catch (err) {
      mostRecentFailure = err as Error;
      const info = classifyError(mostRecentFailure);
      const outOfAttempts = attempt === config.maxAttempts;
      // Give up immediately on non-retryable errors or once attempts are spent.
      if (!info.retryable || outOfAttempts) {
        throw mostRecentFailure;
      }
      // Jittered delay, clamped to the configured ceiling.
      const waitTime = Math.min(baseDelay + Math.random() * config.jitterMs, config.maxDelayMs);
      console.log(`Attempt ${attempt} failed, retrying in ${waitTime}ms...`);
      await sleep(waitTime);
      // Grow the base delay for the next round.
      baseDelay *= config.backoffMultiplier;
    }
  }
  throw mostRecentFailure!;
}
Strategy 2: Circuit Breaker
/** Lifecycle states for the breaker. */
enum CircuitState {
  CLOSED = 'closed', // Normal operation
  OPEN = 'open', // Failing, reject requests
  HALF_OPEN = 'half_open' // Testing if recovered
}

/**
 * Circuit breaker: trips OPEN after `failureThreshold` failures, rejects
 * calls while OPEN, and after `resetTimeoutMs` moves to HALF_OPEN, where
 * `successThreshold` consecutive successes close the circuit again.
 */
class CircuitBreaker {
  private state = CircuitState.CLOSED;
  private failures = 0;
  private lastFailure?: Date;
  private successCount = 0;

  constructor(
    private config: {
      failureThreshold: number;
      resetTimeoutMs: number;
      successThreshold: number;
    }
  ) {}

  /** Run `operation` through the breaker; rejects fast while OPEN. */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    this.maybeEnterHalfOpen();
    if (this.state === CircuitState.OPEN) {
      throw new Error('Circuit breaker is OPEN');
    }
    try {
      const value = await operation();
      this.recordSuccess();
      return value;
    } catch (err) {
      this.recordFailure();
      throw err;
    }
  }

  /** OPEN -> HALF_OPEN once the reset timeout has elapsed. */
  private maybeEnterHalfOpen(): void {
    if (this.state !== CircuitState.OPEN || this.lastFailure === undefined) {
      return;
    }
    if (Date.now() - this.lastFailure.getTime() >= this.config.resetTimeoutMs) {
      this.state = CircuitState.HALF_OPEN;
      this.successCount = 0;
    }
  }

  /** In HALF_OPEN count successes toward closing; otherwise reset failures. */
  private recordSuccess(): void {
    if (this.state !== CircuitState.HALF_OPEN) {
      this.failures = 0;
      return;
    }
    this.successCount += 1;
    if (this.successCount >= this.config.successThreshold) {
      this.state = CircuitState.CLOSED;
      this.failures = 0;
    }
  }

  /** Count the failure and trip the breaker at the threshold. */
  private recordFailure(): void {
    this.failures += 1;
    this.lastFailure = new Date();
    if (this.failures >= this.config.failureThreshold) {
      this.state = CircuitState.OPEN;
    }
  }
}
Strategy 3: Fallback Chain
/** One alternative execution path, tried when `isApplicable` matches the classified primary failure. */
interface FallbackOption<T> {
  name: string;
  execute: () => Promise<T>;
  isApplicable: (error: AgentError) => boolean;
}

/**
 * Run `primary`; on failure, classify the error and try each applicable
 * fallback in declaration order, returning the first successful result.
 *
 * Always rethrows the ORIGINAL primary error when no fallback applies or
 * all applicable fallbacks fail, so callers see the root cause. Fallback
 * failures are logged with their reason (previously the detail was
 * silently swallowed).
 */
async function executeWithFallbacks<T>(
  primary: () => Promise<T>,
  fallbacks: FallbackOption<T>[]
): Promise<T> {
  try {
    return await primary();
  } catch (error) {
    const classified = classifyError(error as Error);
    for (const fallback of fallbacks) {
      if (!fallback.isApplicable(classified)) {
        continue;
      }
      console.log(`Primary failed, trying fallback: ${fallback.name}`);
      try {
        return await fallback.execute();
      } catch (fallbackError) {
        // Fix: surface WHY the fallback failed instead of dropping the detail.
        const detail =
          fallbackError instanceof Error ? fallbackError.message : String(fallbackError);
        console.log(`Fallback ${fallback.name} also failed: ${detail}`);
      }
    }
    // All fallbacks failed (or none applied) - rethrow the original error
    throw error;
  }
}
// Example usage: wire a primary call to three fallbacks, each gated on the
// one error category it can mitigate. Helpers (callPrimaryAPI etc.) are
// assumed to be defined by the host application — TODO confirm.
const result = await executeWithFallbacks(
  () => callPrimaryAPI(),
  [
    {
      // Mirror endpoint: only useful when the primary service itself is down.
      name: 'backup_api',
      execute: () => callBackupAPI(),
      isApplicable: (e) => e.category === ErrorCategory.SERVICE_UNAVAILABLE
    },
    {
      // Stale-but-fast answer when the live call times out.
      name: 'cached_response',
      execute: () => getCachedResponse(),
      isApplicable: (e) => e.category === ErrorCategory.TIMEOUT
    },
    {
      // Retry with a smaller payload when the request blew the context limit.
      name: 'simplified_request',
      execute: () => callWithReducedParams(),
      isApplicable: (e) => e.category === ErrorCategory.CONTEXT_OVERFLOW
    }
  ]
);
// NOTE(review): top-level await assumes an ES-module context — confirm.
Strategy 4: Self-Healing Agent
/**
 * Agent wrapper that tries to repair the failure's cause ("heal") between
 * retries instead of blindly re-running the task.
 *
 * NOTE(review): relies on `Task`, `Result`, `HealingAction`, and on
 * `this.runTask` / `this.toolRouter` / `this.truncateContext`, none of which
 * are declared in this snippet — presumably supplied by the full class;
 * verify before use.
 */
class SelfHealingAgent {
  /**
   * Run `task`, healing and retrying up to 3 times total. Errors with no
   * known healing action, and attempt exhaustion, propagate to the caller.
   */
  async execute(task: Task): Promise<Result> {
    let attempt = 0;
    const maxAttempts = 3;
    while (attempt < maxAttempts) {
      attempt++;
      try {
        return await this.runTask(task);
      } catch (error) {
        const classified = classifyError(error as Error);
        // Can we heal?
        const healingAction = this.determineHealingAction(classified);
        if (!healingAction) {
          // No remedy for this category: surface the original error.
          throw error;
        }
        console.log(`Attempting self-healing: ${healingAction.description}`);
        // Execute healing
        await healingAction.execute();
        // Modify task if needed (healing may e.g. shrink the context)
        task = healingAction.modifyTask?.(task) || task;
      }
    }
    throw new Error('Max healing attempts exceeded');
  }

  /** Map an error category to a remedy, or null when nothing can be done. */
  private determineHealingAction(error: AgentError): HealingAction | null {
    switch (error.category) {
      case ErrorCategory.CONTEXT_OVERFLOW:
        return {
          description: 'Truncating context to fit limits',
          execute: async () => {},
          // Shrink the task's context before the retry.
          modifyTask: (task) => ({
            ...task,
            context: this.truncateContext(task.context)
          })
        };
      case ErrorCategory.TOOL_FAILURE:
        return {
          description: 'Switching to alternative tool',
          execute: async () => {
            // assumes error.context.toolName was set by the thrower — TODO confirm
            this.toolRouter.excludeTool(error.context.toolName as string);
          }
        };
      case ErrorCategory.RATE_LIMIT:
        return {
          description: `Waiting ${error.context.waitTime}ms for rate limit`,
          execute: async () => {
            // waitTime comes from classifyError's rate-limit context.
            await sleep(error.context.waitTime as number);
          }
        };
      default:
        return null;
    }
  }
}
Error Recovery Workflow
┌─────────────────────────────────────────────────────────────┐
│                        Error Occurs                         │
└─────────────────────────────────────────────────────────────┘
                              │
                              ▼
┌─────────────────────────────────────────────────────────────┐
│                       Classify Error                        │
└─────────────────────────────────────────────────────────────┘
                              │
            ┌─────────────────┼─────────────────┐
            ▼                 ▼                 ▼
     ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
     │  Transient  │   │ Recoverable │   │  Terminal   │
     └──────┬──────┘   └──────┬──────┘   └──────┬──────┘
            │                 │                 │
            ▼                 ▼                 ▼
     ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
     │    Retry    │   │     Try     │   │    Log &    │
     │  w/Backoff  │   │  Fallback   │   │    Alert    │
     └──────┬──────┘   └──────┬──────┘   └──────┬──────┘
            │                 │                 │
            ▼                 ▼                 ▼
     ┌─────────────────────────────────────────────────┐
     │               Success or Escalate               │
     └─────────────────────────────────────────────────┘
Monitoring and Alerting
/** Aggregate error/recovery statistics over a trailing window. */
interface ErrorMetrics {
  totalErrors: number;
  errorsByCategory: Map<ErrorCategory, number>;
  errorRate: number; // errors per minute
  recoveryRate: number; // successful recoveries
  mttr: number; // mean time to recover (ms)
}

/**
 * In-memory error/recovery tracker with threshold-based alerting.
 *
 * Fix vs. previous version: `getRecentErrors` and `sendAlert` were called
 * but never defined, so the class could not run. Both are implemented below
 * (sendAlert as a console placeholder to be wired to a real channel).
 */
class ErrorMonitor {
  private errors: AgentError[] = [];
  private recoveries: { error: AgentError; recoveredAt: Date }[] = [];

  /** Record a classified error and evaluate alert thresholds. */
  recordError(error: AgentError): void {
    this.errors.push(error);
    this.checkAlerts();
  }

  /** Record that a previously-seen error was successfully recovered from. */
  recordRecovery(error: AgentError): void {
    this.recoveries.push({ error, recoveredAt: new Date() });
  }

  /** Errors whose timestamp falls within the last `windowMs` milliseconds. */
  private getRecentErrors(windowMs: number): AgentError[] {
    const cutoff = Date.now() - windowMs;
    return this.errors.filter(e => e.timestamp.getTime() >= cutoff);
  }

  /** Placeholder alert sink — replace with a real channel (pager, chat, ...). */
  private sendAlert(alert: { severity: string; message: string; errors: AgentError[] }): void {
    console.warn(`[ALERT:${alert.severity}] ${alert.message}`);
  }

  /** Fire alerts for a high overall rate or a repeated identical error. */
  private checkAlerts(): void {
    const recentErrors = this.getRecentErrors(60000); // Last minute
    // High error rate alert
    if (recentErrors.length > 10) {
      this.sendAlert({
        severity: 'high',
        message: `High error rate: ${recentErrors.length} errors in last minute`,
        errors: recentErrors
      });
    }
    // Repeated same error alert
    const errorCounts = new Map<string, number>();
    for (const e of recentErrors) {
      const key = `${e.category}:${e.code}`;
      errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
    }
    for (const [key, count] of errorCounts) {
      if (count >= 5) {
        this.sendAlert({
          severity: 'medium',
          message: `Repeated error: ${key} occurred ${count} times`,
          errors: recentErrors.filter(e => `${e.category}:${e.code}` === key)
        });
      }
    }
  }

  /** Metrics over the trailing 5-minute window. */
  getMetrics(): ErrorMetrics {
    const window = 5 * 60 * 1000; // 5 minutes
    const recent = this.getRecentErrors(window);
    const byCategory = new Map<ErrorCategory, number>();
    for (const e of recent) {
      byCategory.set(e.category, (byCategory.get(e.category) || 0) + 1);
    }
    const recentRecoveries = this.recoveries.filter(
      r => Date.now() - r.recoveredAt.getTime() < window
    );
    const recoveryTimes = recentRecoveries.map(
      r => r.recoveredAt.getTime() - r.error.timestamp.getTime()
    );
    return {
      totalErrors: recent.length,
      errorsByCategory: byCategory,
      errorRate: recent.length / (window / 60000),
      // Recoveries per error, clamped against divide-by-zero.
      recoveryRate: recentRecoveries.length / Math.max(recent.length, 1),
      mttr: recoveryTimes.length > 0
        ? recoveryTimes.reduce((a, b) => a + b, 0) / recoveryTimes.length
        : 0
    };
  }
}
Best Practices
- Classify all errors - Know what you're dealing with
- Don't retry everything - Some errors won't recover
- Use exponential backoff - Avoid hammering failing services
- Set circuit breakers - Protect downstream systems
- Log everything - Debugging is hard without logs
- Have fallbacks - Always have a Plan B
- Alert on patterns - Single errors may be noise, patterns matter
- Test failure scenarios - Chaos engineering
Weekly Installs
3
Repository
latestaiagents/…t-skillsGitHub Stars
2
First Seen
Feb 4, 2026
Installed on
mcpjam3
claude-code3
replit3
junie3
windsurf3
zencoder3