agent-testing-harness
SKILL.md
Agent Testing Harness
Design and implement comprehensive testing for AI agent systems.
When to Use
- Building reliable agent systems
- Validating agent behavior before deployment
- Creating regression tests for agents
- Implementing continuous testing for agent updates
- Evaluating agent performance metrics
Testing Pyramid for Agents
┌─────────────────────┐
│ End-to-End │ Full workflow tests
│ Agent Tests │ (expensive, few)
├─────────────────────┤
│ Integration │ Multi-agent interaction
│ Tests │ (medium cost, some)
├─────────────────────┤
│ Component │ Individual agent behavior
│ Tests │ (cheap, many)
├─────────────────────┤
│ Unit Tests │ Tools, utilities, helpers
│ │ (cheapest, most)
└─────────────────────┘
Unit Testing Agent Components
Testing Tools
import { describe, it, expect, vi } from 'vitest';
describe('SearchTool', () => {
const searchTool = new SearchTool();
it('returns results for valid query', async () => {
const result = await searchTool.execute({ query: 'test query' });
expect(result.success).toBe(true);
expect(result.data.results).toBeInstanceOf(Array);
expect(result.data.results.length).toBeGreaterThan(0);
});
it('handles empty query gracefully', async () => {
const result = await searchTool.execute({ query: '' });
expect(result.success).toBe(false);
expect(result.error.code).toBe('INVALID_INPUT');
});
it('respects rate limits', async () => {
// Make many requests quickly
const promises = Array(10).fill(null).map(() =>
searchTool.execute({ query: 'test' })
);
const results = await Promise.all(promises);
const rateLimited = results.filter(r => r.error?.code === 'RATE_LIMITED');
expect(rateLimited.length).toBeGreaterThan(0);
});
});
Testing Prompts
describe('SystemPrompt', () => {
it('includes all required sections', () => {
const prompt = generateSystemPrompt(config);
expect(prompt).toContain('## Your Role');
expect(prompt).toContain('## Available Tools');
expect(prompt).toContain('## Constraints');
});
it('correctly formats tool descriptions', () => {
const prompt = generateSystemPrompt({
tools: [
{ name: 'search', description: 'Search the web' },
{ name: 'calculate', description: 'Do math' }
]
});
expect(prompt).toContain('- search: Search the web');
expect(prompt).toContain('- calculate: Do math');
});
});
Component Testing Agents
Mock LLM Responses
class MockLLM {
private responses: Map<string, string> = new Map();
setResponse(inputPattern: RegExp | string, response: string): void {
const key = inputPattern instanceof RegExp ? inputPattern.source : inputPattern;
this.responses.set(key, response);
}
async complete(input: string): Promise<string> {
for (const [pattern, response] of this.responses) {
const regex = new RegExp(pattern, 'i');
if (regex.test(input)) {
return response;
}
}
return 'Default mock response';
}
}
describe('ResearchAgent', () => {
let agent: ResearchAgent;
let mockLLM: MockLLM;
beforeEach(() => {
mockLLM = new MockLLM();
agent = new ResearchAgent({ llm: mockLLM });
});
it('formulates search queries from task', async () => {
mockLLM.setResponse(
/formulate.*search/i,
JSON.stringify({ queries: ['query 1', 'query 2'] })
);
const result = await agent.planResearch('Find information about X');
expect(result.queries).toHaveLength(2);
});
it('synthesizes findings into report', async () => {
mockLLM.setResponse(
/synthesize/i,
'Based on the research, here are the key findings...'
);
const result = await agent.synthesize([
{ source: 'source1', content: 'finding 1' },
{ source: 'source2', content: 'finding 2' }
]);
expect(result).toContain('key findings');
});
});
Testing Agent Decision Making
describe('Agent Decision Making', () => {
it('selects appropriate tool for task', async () => {
const agent = new Agent({
tools: [searchTool, calculatorTool, fileReaderTool]
});
// Math task should use calculator
const mathDecision = await agent.decideTool('Calculate 15% of 200');
expect(mathDecision.tool).toBe('calculator');
// Search task should use search
const searchDecision = await agent.decideTool('Find the latest news about AI');
expect(searchDecision.tool).toBe('search');
});
it('handles ambiguous tasks appropriately', async () => {
const agent = new Agent({ tools: [searchTool, fileReaderTool] });
const decision = await agent.decideTool('Read about quantum computing');
// Should clarify or make reasonable choice
expect(['search', 'clarify']).toContain(decision.action);
});
});
Integration Testing Multi-Agent Systems
describe('Multi-Agent Workflow', () => {
let supervisor: SupervisorAgent;
let researcher: ResearchAgent;
let writer: WriterAgent;
let reviewer: ReviewerAgent;
beforeEach(() => {
researcher = new ResearchAgent();
writer = new WriterAgent();
reviewer = new ReviewerAgent();
supervisor = new SupervisorAgent({
workers: [researcher, writer, reviewer]
});
});
it('coordinates agents to complete task', async () => {
const result = await supervisor.execute(
'Write a blog post about renewable energy'
);
expect(result.success).toBe(true);
expect(result.steps).toContainEqual(
expect.objectContaining({ agent: 'researcher', status: 'completed' })
);
expect(result.steps).toContainEqual(
expect.objectContaining({ agent: 'writer', status: 'completed' })
);
});
it('handles agent failure gracefully', async () => {
// Make researcher fail
vi.spyOn(researcher, 'execute').mockRejectedValue(new Error('API Error'));
const result = await supervisor.execute('Research topic X');
expect(result.success).toBe(false);
expect(result.error).toContain('researcher failed');
expect(result.recoveryAttempts).toBeGreaterThan(0);
});
it('respects budget constraints', async () => {
const result = await supervisor.execute(
'Complex research task',
{ budgetUSD: 0.01 } // Very low budget
);
expect(result.totalCost).toBeLessThanOrEqual(0.01);
});
});
End-to-End Agent Tests
describe('E2E: Content Creation Pipeline', () => {
// Use real LLM but with test account
const agent = new ContentCreationAgent({
llm: new OpenAI({ apiKey: process.env.TEST_API_KEY })
});
it('creates blog post from topic', async () => {
const result = await agent.createContent({
type: 'blog_post',
topic: 'Benefits of unit testing',
targetLength: 500
});
// Structure validation
expect(result.title).toBeDefined();
expect(result.content.length).toBeGreaterThan(400);
expect(result.content.length).toBeLessThan(600);
// Content validation
expect(result.content.toLowerCase()).toContain('test');
expect(result.sections.length).toBeGreaterThanOrEqual(3);
}, 60000); // Long timeout for real API calls
it('handles user feedback loop', async () => {
const draft = await agent.createContent({
type: 'blog_post',
topic: 'AI in healthcare'
});
const revised = await agent.reviseContent(draft, {
feedback: 'Make it more technical and add statistics'
});
expect(revised.content).not.toEqual(draft.content);
// Check for more technical language
expect(revised.content).toMatch(/\d+%|\d+ percent/);
}, 120000);
});
Evaluation Metrics
interface AgentEvaluation {
taskCompletion: number; // 0-1: Did it complete the task?
accuracy: number; // 0-1: Is the output correct?
efficiency: number; // 0-1: Token/time efficiency
safety: number; // 0-1: No harmful outputs
reliability: number; // 0-1: Consistent results
}
class AgentEvaluator {
async evaluate(
agent: Agent,
testCases: TestCase[]
): Promise<EvaluationReport> {
const results: EvaluationResult[] = [];
for (const testCase of testCases) {
const startTime = Date.now();
const result = await agent.execute(testCase.input);
const duration = Date.now() - startTime;
const evaluation: AgentEvaluation = {
taskCompletion: this.assessCompletion(result, testCase.expected),
accuracy: this.assessAccuracy(result, testCase.expected),
efficiency: this.assessEfficiency(result, duration),
safety: this.assessSafety(result),
reliability: 1 // Will be calculated across runs
};
results.push({ testCase, result, evaluation, duration });
}
// Run reliability tests (same input multiple times)
const reliabilityScores = await this.testReliability(agent, testCases);
return this.compileReport(results, reliabilityScores);
}
private assessCompletion(result: Result, expected: Expected): number {
if (!result.success) return 0;
// Check required outputs are present
const requiredKeys = Object.keys(expected.requiredOutputs || {});
const presentKeys = requiredKeys.filter(k => result.output[k] !== undefined);
return presentKeys.length / Math.max(requiredKeys.length, 1);
}
private assessAccuracy(result: Result, expected: Expected): number {
if (!expected.groundTruth) return 1; // No ground truth to compare
// Use LLM to judge similarity
return this.llmJudge(result.output, expected.groundTruth);
}
}
Test Data Management
interface TestCase {
id: string;
name: string;
category: string;
input: AgentInput;
expected: {
success: boolean;
requiredOutputs?: Record<string, any>;
groundTruth?: string;
maxDurationMs?: number;
maxCostUSD?: number;
};
tags: string[];
}
// Test case factory
function createTestCase(
name: string,
input: string,
expected: Partial<TestCase['expected']>
): TestCase {
return {
id: generateId(),
name,
category: 'default',
input: { task: input },
expected: {
success: true,
...expected
},
tags: []
};
}
// Example test suite
const codeReviewTestCases: TestCase[] = [
createTestCase(
'Identifies security vulnerability',
'Review this code: eval(userInput)',
{
requiredOutputs: {
vulnerabilities: expect.arrayContaining([
expect.objectContaining({ type: 'code_injection' })
])
}
}
),
createTestCase(
'Catches null pointer risk',
'Review: const name = user.profile.name',
{
requiredOutputs: {
warnings: expect.arrayContaining([
expect.objectContaining({ type: 'null_safety' })
])
}
}
)
];
Continuous Testing Pipeline
# .github/workflows/agent-tests.yml
name: Agent Tests
on:
push:
paths:
- 'agents/**'
- 'prompts/**'
schedule:
- cron: '0 0 * * *' # Daily regression
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: npm test -- --grep "unit"
component-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: npm test -- --grep "component"
env:
MOCK_LLM: true
integration-tests:
runs-on: ubuntu-latest
needs: [unit-tests, component-tests]
steps:
- uses: actions/checkout@v4
- run: npm test -- --grep "integration"
env:
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
e2e-tests:
runs-on: ubuntu-latest
needs: integration-tests
if: github.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v4
- run: npm test -- --grep "e2e"
env:
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
Best Practices
- Mock at boundaries - Mock LLM, not agent logic
- Test deterministically - Set seeds, use fixed responses
- Measure what matters - Completion, accuracy, safety
- Automate regression - Catch prompt regressions
- Test failure modes - Agents should fail gracefully
- Budget for tests - Real API tests cost money
- Version test cases - Track alongside prompt changes
- Use golden outputs - Compare against known-good results
Weekly Installs
3
Repository
latestaiagents/…t-skillsGitHub Stars
2
First Seen
Feb 4, 2026
Installed on
mcpjam3
claude-code3
replit3
junie3
windsurf3
zencoder3