ai-integration
SKILL.md
AI Integration Expert
You are a senior AI engineer who integrates LLMs into production applications. You build reliable AI features with proper streaming, error handling, caching, and cost management.
Core Principles
- Stream Everything — Never block the UI waiting for a full response. Stream tokens.
- Anthropic First — Default to the Claude (Anthropic) API. Use OpenAI only when it is explicitly requested.
- RAG Over Fine-tuning — Use retrieval augmented generation before considering fine-tuning.
- Cost Control — Track token usage, cache responses, use the cheapest model that works.
- Structured Output — Use tool_use (Claude) or function_calling (OpenAI) for reliable structured data.
Vercel AI SDK (Recommended for Web)
npm install ai @ai-sdk/anthropic @ai-sdk/openai
// app/api/chat/route.ts — Streaming chat with Claude
import { anthropic } from '@ai-sdk/anthropic';
import { streamText } from 'ai';
/**
 * Chat route handler: streams a Claude reply for the conversation in the request body.
 *
 * Expects a JSON body shaped like `{ messages: [...] }`. The messages are passed
 * to `streamText` unchanged and the handler returns whatever
 * `result.toDataStreamResponse()` produces.
 *
 * @param req - Incoming request whose body carries the chat history.
 * @returns The data-stream response, or a 400 JSON response when the body is not
 *          valid JSON or `messages` is not a non-empty array.
 */
export async function POST(req: Request) {
  const badRequest = (error: string) =>
    new Response(JSON.stringify({ error }), {
      status: 400,
      headers: { 'Content-Type': 'application/json' },
    });

  // req.json() rejects on malformed input; report that as a client error instead of a 500.
  let body: unknown;
  try {
    body = await req.json();
  } catch {
    return badRequest('Request body must be valid JSON.');
  }

  // Validate before spending tokens: the model call needs at least one message.
  const messages = (body as { messages?: unknown } | null)?.messages;
  if (!Array.isArray(messages) || messages.length === 0) {
    return badRequest('"messages" must be a non-empty array.');
  }

  const result = streamText({
    model: anthropic('claude-sonnet-4-20250514'),
    system: 'You are a helpful coding assistant. Be concise.',
    messages,
    maxTokens: 4096,
  });
  return result.toDataStreamResponse();
}
// components/Chat.tsx — Client component
'use client';
import { useChat } from 'ai/react';
/**
 * Minimal chat UI wired to the `/api/chat` endpoint through `useChat`.
 *
 * Renders every message (user messages right-aligned, all others left-aligned)
 * above a single-input form; the input is disabled while `isLoading` is true.
 */
export function Chat() {
  const chat = useChat({ api: '/api/chat' });
  const { messages, input, handleInputChange, handleSubmit, isLoading } = chat;

  // One row per message, aligned by the role of its author.
  const rows = messages.map((message) => {
    const alignment = message.role === 'user' ? 'text-right' : 'text-left';
    return (
      <div key={message.id} className={alignment}>
        <p>{message.content}</p>
      </div>
    );
  });

  return (
    <div>
      {rows}
      <form onSubmit={handleSubmit}>
        <input value={input} onChange={handleInputChange} disabled={isLoading} />
      </form>
    </div>
  );
}
Anthropic SDK (Direct)
// lib/anthropic.ts
import Anthropic from '@anthropic-ai/sdk';
const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
// Simple completion
/**
 * Sends a single user prompt to Claude and returns the reply as plain text.
 *
 * @param prompt - Text sent as the only (user) message of the conversation.
 * @returns The text of every `text` block in the reply, concatenated in order;
 *          `''` when the reply contains no text block.
 */
export async function generateText(prompt: string): Promise<string> {
  const message = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  // `content` is a list of blocks that may be empty or start with a non-text
  // block, so collect the text blocks instead of indexing [0].
  let text = '';
  for (const block of message.content) {
    if (block.type === 'text') {
      text += block.text;
    }
  }
  return text;
}
// Streaming
/**
 * Starts a streaming Claude completion for a single user prompt.
 *
 * @param prompt - Text sent as the only (user) message of the conversation.
 * @returns Whatever `client.messages.stream(...)` returns, wrapped in a promise
 *          because this function is `async`.
 */
export async function streamText(prompt: string) {
  const request = {
    model: 'claude-sonnet-4-20250514',
    max_tokens: 4096,
    messages: [{ role: 'user' as const, content: prompt }],
  };
  return client.messages.stream(request);
}
// Tool Use (Structured Output)
/**
 * Extracts contact details from free text as structured data via Claude tool use.
 *
 * The request declares one tool, `extract_contact`, and forces the model to call
 * it with `tool_choice`, so the reply carries a `tool_use` block rather than a
 * prose answer.
 *
 * @param text - Free text to extract contact information from.
 * @returns The `input` of the first `tool_use` block in the reply, or `null`
 *          when the reply contains none. The schema asks for `name` (required),
 *          `email`, `phone` and `company`; the value is not validated here.
 */
export async function extractData(text: string) {
  const toolName = 'extract_contact';

  const response = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 1024,
    tools: [{
      name: toolName,
      description: 'Extract contact information from text',
      input_schema: {
        type: 'object' as const,
        properties: {
          name: { type: 'string', description: 'Full name' },
          email: { type: 'string', description: 'Email address' },
          phone: { type: 'string', description: 'Phone number' },
          company: { type: 'string', description: 'Company name' },
        },
        required: ['name'],
      },
    }],
    // Without this the model is free to answer in plain text and skip the tool.
    tool_choice: { type: 'tool', name: toolName },
    messages: [{ role: 'user', content: `Extract contact info: ${text}` }],
  });

  const toolBlock = response.content.find((b) => b.type === 'tool_use');
  return toolBlock?.type === 'tool_use' ? toolBlock.input : null;
}
RAG (Retrieval Augmented Generation)
// lib/rag.ts — Using pgvector with Prisma
import { db } from '@/lib/db';
import { generateEmbedding } from './embeddings';
// Store document with embedding
/**
 * Embeds a document and inserts it into the `documents` table.
 *
 * @param content - Document text; it is both embedded and stored.
 * @param metadata - String key/value pairs, serialized to JSON and cast to `jsonb`.
 */
export async function indexDocument(content: string, metadata: Record<string, string>) {
  const vector = await generateEmbedding(content);
  const metadataJson = JSON.stringify(metadata);
  await db.$executeRaw`
INSERT INTO documents (content, metadata, embedding)
VALUES (${content}, ${metadataJson}::jsonb, ${vector}::vector)
`;
}
// Semantic search
/** One row returned by {@link searchDocuments}. */
export interface DocumentMatch {
  /** Stored document text. */
  content: string;
  /** Value of the `metadata` column; its shape is not checked here. */
  metadata: unknown;
  /** `1 - (embedding <=> queryEmbedding)` as computed by the query. */
  similarity: number;
}

/**
 * Finds the stored documents most similar to a query.
 *
 * Embeds `query`, then selects rows from `documents` whose similarity
 * (1 minus the pgvector `<=>` distance) exceeds `minSimilarity`, closest first.
 *
 * @param query - Text to search for.
 * @param limit - Maximum number of rows to return (default 5).
 * @param minSimilarity - Rows must score strictly above this value (default 0.7).
 * @returns Matching rows ordered from most to least similar.
 */
export async function searchDocuments(
  query: string,
  limit: number = 5,
  minSimilarity: number = 0.7,
): Promise<DocumentMatch[]> {
  const queryEmbedding = await generateEmbedding(query);
  // The type argument matters: an untyped $queryRaw result is `unknown`,
  // which callers cannot `.map` over under strict type checking.
  const results = await db.$queryRaw<DocumentMatch[]>`
    SELECT content, metadata,
      1 - (embedding <=> ${queryEmbedding}::vector) as similarity
    FROM documents
    WHERE 1 - (embedding <=> ${queryEmbedding}::vector) > ${minSimilarity}
    ORDER BY embedding <=> ${queryEmbedding}::vector
    LIMIT ${limit}
  `;
  return results;
}
// RAG chat
/**
 * Answers a question with retrieval-augmented generation.
 *
 * Fetches up to three documents via `searchDocuments`, joins their `content`
 * with blank lines into the system prompt, and asks Claude the question.
 *
 * @param question - The user's question; used both for retrieval and as the user message.
 * @returns `answer` — the reply's text blocks concatenated in order (`''` if it has none);
 *          `sources` — the retrieved documents as returned by `searchDocuments`;
 *          `tokensUsed` — the `usage` object of the API response.
 */
export async function ragChat(question: string) {
  const context = await searchDocuments(question, 3);
  const contextText = context.map((d: { content: string }) => d.content).join('\n\n');

  const response = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 2048,
    system: `Answer based on this context. If the context doesn't contain the answer, say so.\n\nContext:\n${contextText}`,
    messages: [{ role: 'user', content: question }],
  });

  // `content` may be empty or start with a non-text block, so gather the text
  // blocks rather than indexing [0].
  let answer = '';
  for (const block of response.content) {
    if (block.type === 'text') {
      answer += block.text;
    }
  }

  return {
    answer,
    sources: context,
    tokensUsed: response.usage,
  };
}
Embeddings
// lib/embeddings.ts
import Anthropic from '@anthropic-ai/sdk';
const client = new Anthropic();
// Use Voyage AI, the embeddings provider Anthropic recommends (the Anthropic API has no embeddings endpoint of its own)
/**
 * Requests an embedding vector for `text` from the Voyage AI embeddings endpoint.
 *
 * Authenticates with the `VOYAGE_API_KEY` environment variable and uses the
 * `voyage-3` model.
 *
 * @param text - Text to embed.
 * @returns The `embedding` array of the first item in the response's `data` list.
 * @throws Error when the HTTP status is not 2xx, or when the response body does
 *         not contain an embedding array.
 */
export async function generateEmbedding(text: string): Promise<number[]> {
  const response = await fetch('https://api.voyageai.com/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${process.env.VOYAGE_API_KEY}`,
    },
    body: JSON.stringify({
      input: text,
      model: 'voyage-3',
    }),
  });

  // Surface auth, rate-limit and server errors with their status instead of
  // failing later with an opaque TypeError on a missing field.
  if (!response.ok) {
    const detail = await response.text().catch(() => '');
    throw new Error(
      `Voyage AI embeddings request failed with status ${response.status}${detail ? `: ${detail}` : ''}`,
    );
  }

  const data: unknown = await response.json();
  const embedding = (data as { data?: Array<{ embedding?: unknown }> } | null)?.data?.[0]?.embedding;
  if (!Array.isArray(embedding)) {
    throw new Error('Voyage AI embeddings response did not contain an embedding vector');
  }
  return embedding;
}
Cost Tracking Middleware
// middleware/ai-cost-tracker.ts
/**
 * Token counts and cost recorded for one model call.
 *
 * NOTE(review): nothing in the visible source constructs or consumes this type,
 * so the field meanings below are inferred from their names — confirm with the caller.
 */
interface TokenUsage {
  /** Identifier of the model that was called. */
  model: string;
  /** Presumably the number of input (prompt) tokens. */
  inputTokens: number;
  /** Presumably the number of output (completion) tokens. */
  outputTokens: number;
  /** Presumably the cost of the call in US dollars. */
  costUsd: number;
}
/** Sonnet rates; also the fallback for model IDs missing from the table. */
const SONNET_PRICING = { input: 3.0, output: 15.0 };

/** Price in USD per one million tokens, keyed by model ID. */
const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  'claude-opus-4-20250514': { input: 15.0, output: 75.0 },
  'claude-sonnet-4-20250514': SONNET_PRICING,
  // Haiku 4.5 is priced at $1 / $5 per million tokens; 0.80 / 4.0 was the Haiku 3.5 rate.
  'claude-haiku-4-5-20251001': { input: 1.0, output: 5.0 },
};

/**
 * Computes the USD cost of one model call from its token usage.
 *
 * @param usage - Input and output token counts, as reported by the API response.
 * @param model - Model ID used for the call; unknown IDs are billed at Sonnet rates.
 * @returns Cost in US dollars.
 */
export function calculateCost(usage: { input_tokens: number; output_tokens: number }, model: string): number {
  const pricing = MODEL_PRICING[model] ?? SONNET_PRICING;
  const inputCost = (usage.input_tokens / 1_000_000) * pricing.input;
  const outputCost = (usage.output_tokens / 1_000_000) * pricing.output;
  return inputCost + outputCost;
}
Checklist
- API keys in environment variables, never hardcoded
- Streaming enabled for all user-facing AI responses
- Token usage tracked and logged per request
- Rate limiting on AI endpoints
- Fallback model configured (e.g., Haiku for non-critical tasks)
- Response caching for repeated queries
- Input validation/sanitization before sending to LLM
- Error handling for API rate limits (429) and timeouts
- Cost alerts configured for billing thresholds
- System prompts version-controlled, not hardcoded in handlers
Weekly Installs
2
Repository
thesaifalitai/c…de-setup
GitHub Stars
4
First Seen
5 days ago
Security Audits
Installed on
amp: 2
cline: 2
opencode: 2
cursor: 2
kimi-cli: 2
codex: 2