apollo-observability

SKILL.md

Apollo Observability

Overview

Comprehensive observability setup for Apollo.io integrations including metrics, logging, tracing, and alerting.

Metrics with Prometheus

// src/lib/apollo/metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client';

/**
 * Dedicated registry for the Apollo metrics below. Every metric in this
 * module registers itself here (via `registers: [register]`), and the
 * registry is re-exported at the bottom of the module.
 */
const register = new Registry();

// Request metrics

/** Counts Apollo API requests, labelled by endpoint, HTTP method and outcome status. */
export const apolloRequestsTotal = new Counter({
  name: 'apollo_requests_total',
  help: 'Total number of Apollo API requests',
  labelNames: ['endpoint', 'method', 'status'],
  registers: [register],
});

/** Histogram of Apollo API request durations in seconds, labelled by endpoint and method. */
export const apolloRequestDuration = new Histogram({
  name: 'apollo_request_duration_seconds',
  help: 'Duration of Apollo API requests in seconds',
  labelNames: ['endpoint', 'method'],
  // Bucket upper bounds in seconds, from 100 ms up to 10 s.
  buckets: [0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
});

// Rate limit metrics

/** Gauge holding the remaining rate-limit allowance, one series per endpoint. */
export const apolloRateLimitRemaining = new Gauge({
  name: 'apollo_rate_limit_remaining',
  help: 'Remaining Apollo API rate limit',
  labelNames: ['endpoint'],
  registers: [register],
});

/** Counts how often the rate limit was hit; deliberately has no labels. */
export const apolloRateLimitHits = new Counter({
  name: 'apollo_rate_limit_hits_total',
  help: 'Number of times rate limit was hit',
  registers: [register],
});

// Cache metrics

/** Counts cache hits per endpoint. */
export const apolloCacheHits = new Counter({
  name: 'apollo_cache_hits_total',
  help: 'Number of Apollo cache hits',
  labelNames: ['endpoint'],
  registers: [register],
});

/** Counts cache misses per endpoint. */
export const apolloCacheMisses = new Counter({
  name: 'apollo_cache_misses_total',
  help: 'Number of Apollo cache misses',
  labelNames: ['endpoint'],
  registers: [register],
});

// Credit usage

/** Counts Apollo credits consumed, labelled by the operation that spent them. */
export const apolloCreditsUsed = new Counter({
  name: 'apollo_credits_used_total',
  help: 'Total Apollo credits consumed',
  labelNames: ['operation'],
  registers: [register],
});

// Error tracking

/** Counts Apollo API errors, labelled by endpoint and error type. */
export const apolloErrors = new Counter({
  name: 'apollo_errors_total',
  help: 'Total Apollo API errors',
  labelNames: ['endpoint', 'error_type'],
  registers: [register],
});

// Exported so that other modules can serialize this registry.
export { register };

Instrumented Client

// src/lib/apollo/instrumented-client.ts
import {
  apolloErrors,
  apolloRateLimitHits,
  apolloRateLimitRemaining,
  apolloRequestDuration,
  apolloRequestsTotal,
} from './metrics';

/**
 * Minimal contract the wrapped client must satisfy: a `request` method that
 * resolves to an object exposing the response `data` and `headers`.
 */
export interface ApolloBaseClient {
  request<T>(
    endpoint: string,
    options: RequestOptions
  ): Promise<{ data: T; headers: Record<string, unknown> }>;
}

/**
 * Wraps an Apollo API client and records Prometheus metrics for every call:
 * request count and duration, rate-limit headroom, classified errors, and
 * rate-limit hits.
 */
export class InstrumentedApolloClient {
  /**
   * @param baseClient The underlying client that actually performs requests.
   */
  constructor(private readonly baseClient: ApolloBaseClient) {}

  /**
   * Performs a request through the wrapped client while recording metrics.
   *
   * @param endpoint Apollo endpoint; used as the `endpoint` metric label.
   * @param options Request options; `options.method` defaults to `'POST'` for labelling.
   * @returns The `data` field of the underlying response.
   * @throws Rethrows whatever the wrapped client throws, after recording it.
   */
  async request<T>(endpoint: string, options: RequestOptions): Promise<T> {
    const labels = { endpoint, method: options.method || 'POST' };
    const endTimer = apolloRequestDuration.startTimer(labels);

    try {
      const response = await this.baseClient.request<T>(endpoint, options);

      apolloRequestsTotal.inc({ ...labels, status: 'success' });

      // Track rate limit from headers. A missing or non-numeric header is
      // skipped so the gauge is never overwritten with NaN.
      // NOTE(review): confirm the exact header name against Apollo's API docs.
      const rawRemaining = response.headers['x-ratelimit-remaining'];
      const remaining = Number.parseInt(String(rawRemaining ?? ''), 10);
      if (!Number.isNaN(remaining)) {
        apolloRateLimitRemaining.set({ endpoint }, remaining);
      }

      return response.data;
    } catch (error: unknown) {
      const errorType = this.classifyError(error);
      apolloRequestsTotal.inc({ ...labels, status: 'error' });
      apolloErrors.inc({ endpoint, error_type: errorType });

      // classifyError returns 'rate_limit' exactly when the status is 429.
      if (errorType === 'rate_limit') {
        apolloRateLimitHits.inc();
      }

      throw error;
    } finally {
      // Runs on success and failure so every request contributes a duration sample.
      endTimer();
    }
  }

  /**
   * Maps a thrown value to the `error_type` label. HTTP status (read from
   * `error.response.status`) takes precedence over the socket-level
   * `error.code`; anything unrecognised, including null or undefined,
   * becomes `'unknown'`.
   */
  private classifyError(error: unknown): string {
    // Thrown values are untyped; read the two fields defensively.
    const { response, code } = (error ?? {}) as {
      response?: { status?: unknown };
      code?: unknown;
    };
    const status = response?.status;

    if (status === 401) return 'auth_error';
    if (status === 403) return 'permission_error';
    if (status === 422) return 'validation_error';
    if (status === 429) return 'rate_limit';
    if (typeof status === 'number' && status >= 500) return 'server_error';
    if (code === 'ECONNREFUSED') return 'connection_error';
    if (code === 'ETIMEDOUT') return 'timeout';
    return 'unknown';
  }
}

Structured Logging

// src/lib/apollo/logger.ts
import pino from 'pino';

/**
 * Root pino logger for the Apollo integration.
 *
 * The redaction list covers secrets and PII both at the log root and one
 * level down. The nested forms matter because `logApolloRequest` places
 * caller-supplied parameters under `params`, so a root-only `api_key` path
 * would leave `params.api_key` in clear text.
 */
const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level as its label under the `level` key.
    level: (label) => ({ level: label }),
  },
  redact: {
    paths: [
      'api_key',
      '*.api_key',
      '*.email',
      '*.phone',
      'headers.authorization',
      'headers["x-api-key"]',
    ],
    censor: '[REDACTED]',
  },
  base: {
    service: 'apollo-integration',
    environment: process.env.NODE_ENV,
  },
});

/** Child logger that tags every entry with `component: 'apollo'`. */
export const apolloLogger = logger.child({ component: 'apollo' });

// Request/response logging
/**
 * Writes an info-level `apollo_request` entry describing an outgoing call.
 *
 * @param context Endpoint, HTTP method, optional parameters and the
 *   correlation `requestId`; all fields are copied into the log entry.
 */
export function logApolloRequest(context: {
  endpoint: string;
  method: string;
  params?: object;
  requestId: string;
}): void {
  const entry = {
    type: 'apollo_request',
    ...context,
    timestamp: new Date().toISOString(),
  };

  apolloLogger.info(entry);
}

/**
 * Writes an info-level `apollo_response` entry for a completed call.
 *
 * @param context Endpoint, response status, duration in milliseconds, the
 *   correlation `requestId` and an optional result count; all fields are
 *   copied into the log entry.
 */
export function logApolloResponse(context: {
  endpoint: string;
  status: number;
  durationMs: number;
  requestId: string;
  resultCount?: number;
}): void {
  const record = {
    type: 'apollo_response',
    ...context,
    timestamp: new Date().toISOString(),
  };

  apolloLogger.info(record);
}

/**
 * Writes an error-level `apollo_error` entry for a failed call.
 *
 * Only the error's `name`, `message` and `stack` are logged, not the whole
 * error object.
 *
 * @param context Endpoint, the thrown error, the correlation `requestId`
 *   and an optional retry count.
 */
export function logApolloError(context: {
  endpoint: string;
  error: Error;
  requestId: string;
  retryCount?: number;
}): void {
  const { endpoint, error: failure, requestId, retryCount } = context;
  const { name, message, stack } = failure;

  apolloLogger.error({
    type: 'apollo_error',
    endpoint,
    error: { name, message, stack },
    requestId,
    retryCount,
    timestamp: new Date().toISOString(),
  });
}

Distributed Tracing (OpenTelemetry)

// src/lib/apollo/tracing.ts
import { trace, Span, SpanStatusCode, context as otelContext } from '@opentelemetry/api';
import { W3CTraceContextPropagator } from '@opentelemetry/core';

// Tracer used to start every span created in this module.
const tracer = trace.getTracer('apollo-integration');
// NOTE(review): not referenced anywhere in the visible code — confirm whether
// context propagation was meant to be wired up here, or remove it.
const propagator = new W3CTraceContextPropagator();

/**
 * Starts a span named `apollo.<name>`.
 *
 * Only `attributes.endpoint` and `attributes.method` are read; they are
 * attached as `apollo.endpoint` and `apollo.method`, together with a fixed
 * `service.name`. The caller is responsible for ending the returned span.
 *
 * @param name Suffix appended to the `apollo.` span-name prefix.
 * @param attributes Bag from which `endpoint` and `method` are taken.
 * @returns The started span.
 */
export function createApolloSpan(
  name: string,
  attributes: Record<string, any>
): Span {
  const { endpoint, method } = attributes;
  const spanName = `apollo.${name}`;

  return tracer.startSpan(spanName, {
    attributes: {
      'apollo.endpoint': endpoint,
      'apollo.method': method,
      'service.name': 'apollo-integration',
    },
  });
}

/**
 * Runs `requestFn` inside an `apollo.request` span.
 *
 * The span is made active for the duration of the call, marked OK on
 * success, and on failure marked ERROR with the exception recorded before
 * the error is rethrown. The span is always ended.
 *
 * @param endpoint Endpoint recorded on the span.
 * @param requestFn The operation to execute within the span's context.
 * @returns Whatever `requestFn` resolves to.
 */
export async function traceApolloRequest<T>(
  endpoint: string,
  requestFn: () => Promise<T>
): Promise<T> {
  const requestSpan = createApolloSpan('request', { endpoint });

  try {
    const contextWithSpan = trace.setSpan(otelContext.active(), requestSpan);
    const outcome = await otelContext.with(contextWithSpan, requestFn);

    requestSpan.setStatus({ code: SpanStatusCode.OK });
    return outcome;
  } catch (failure: any) {
    const { message } = failure;
    requestSpan.setStatus({ code: SpanStatusCode.ERROR, message });
    requestSpan.recordException(failure);
    throw failure;
  } finally {
    requestSpan.end();
  }
}

// Middleware for Express
/**
 * Express middleware that opens an `apollo.http_request` span per incoming
 * request and exposes it to downstream handlers as `req.apolloSpan`.
 *
 * The span is ended exactly once: on `finish` when the response was fully
 * sent (recording the status code), or on `close` when the connection went
 * away first. Listening only for `finish` would leave the span open forever
 * for aborted requests.
 *
 * @param req Express request; `path` and `method` are recorded on the span.
 * @param res Express response; its `finish`/`close` events end the span.
 * @param next Continuation invoked once the span is attached.
 */
export function apolloTracingMiddleware(req: any, res: any, next: any) {
  const span = createApolloSpan('http_request', {
    endpoint: req.path,
    method: req.method,
  });

  req.apolloSpan = span;

  let ended = false;
  const endSpan = (responseSent: boolean): void => {
    // `close` can fire after `finish`; make sure the span ends only once.
    if (ended) return;
    ended = true;

    // The status code is only meaningful when the response was actually sent.
    if (responseSent) {
      span.setAttribute('http.status_code', res.statusCode);
    }
    span.end();
  };

  res.once('finish', () => endSpan(true));
  res.once('close', () => endSpan(false));

  next();
}

Alerting Rules

# prometheus/apollo-alerts.yml
# Alerting rules built on the apollo_* metrics.
groups:
  - name: apollo-alerts
    rules:
      # High error rate: errors exceed 5% of requests (5m rate windows),
      # sustained for 5 minutes.
      - alert: ApolloHighErrorRate
        expr: |
          sum(rate(apollo_errors_total[5m])) /
          sum(rate(apollo_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Apollo API error rate"
          description: "Apollo error rate is {{ $value | humanizePercentage }}"

      # Rate limit warnings: any endpoint's remaining allowance stays
      # below 20 for 2 minutes.
      - alert: ApolloRateLimitApproaching
        expr: apollo_rate_limit_remaining < 20
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Apollo rate limit approaching"
          description: "Only {{ $value }} requests remaining"

      # The rate-limit-hit counter increased within the last 5 minutes.
      # No `for:` clause, so there is no hold period before firing.
      - alert: ApolloRateLimitHit
        expr: increase(apollo_rate_limit_hits_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Apollo rate limit hit"
          description: "Rate limit was hit {{ $value }} times in last 5 minutes"

      # Latency alerts: P95 request duration above 5 seconds for 5 minutes.
      # There is no aggregation, so this is evaluated per label combination.
      - alert: ApolloHighLatency
        expr: |
          histogram_quantile(0.95, rate(apollo_request_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Apollo API latency"
          description: "P95 latency is {{ $value | humanizeDuration }}"

      # Credit usage: more than 8000 credits consumed over a rolling 24h window.
      - alert: ApolloHighCreditUsage
        expr: |
          increase(apollo_credits_used_total[24h]) > 8000
        labels:
          severity: warning
        annotations:
          summary: "High Apollo credit consumption"
          description: "{{ $value }} credits used in last 24 hours"

Grafana Dashboard

{
  "dashboard": {
    "title": "Apollo.io Integration",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(apollo_requests_total[5m])) by (endpoint)",
            "legendFormat": "{{ endpoint }}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(apollo_errors_total[5m])) by (error_type)",
            "legendFormat": "{{ error_type }}"
          }
        ]
      },
      {
        "title": "Request Duration (P95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(apollo_request_duration_seconds_bucket[5m]))",
            "legendFormat": "P95"
          }
        ]
      },
      {
        "title": "Rate Limit Status",
        "type": "gauge",
        "targets": [
          {
            "expr": "apollo_rate_limit_remaining",
            "legendFormat": "Remaining"
          }
        ],
        "thresholds": [
          { "value": 0, "color": "red" },
          { "value": 20, "color": "yellow" },
          { "value": 50, "color": "green" }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(apollo_cache_hits_total[5m])) / (sum(rate(apollo_cache_hits_total[5m])) + sum(rate(apollo_cache_misses_total[5m])))",
            "legendFormat": "Hit Rate"
          }
        ]
      },
      {
        "title": "Credits Used (Last 24h)",
        "type": "stat",
        "targets": [
          {
            "expr": "increase(apollo_credits_used_total[24h])"
          }
        ]
      }
    ]
  }
}

Health Check Endpoint

// src/routes/health/apollo.ts
import { Router } from 'express';
import { apolloRateLimitRemaining, register } from '../../lib/apollo/metrics';

// NOTE(review): `apollo` (the API client) and `apolloCache` are used below but
// are not imported in this snippet — import them from wherever the project
// defines them.

const router = Router();

/**
 * GET /health/apollo — reports API connectivity, rate-limit headroom and
 * cache state.
 *
 * Responds 200 with `status: 'healthy'` when every check passes, 503 with
 * `status: 'degraded'` when any check fails, and 503 with
 * `status: 'unhealthy'` when a check throws.
 */
router.get('/health/apollo', async (req, res) => {
  const checks = {
    api: false,
    rateLimit: false,
    cache: false,
  };

  try {
    // Check API connectivity
    await apollo.healthCheck();
    checks.api = true;

    // Check rate limit status. `get()` resolves to the metric object, not a
    // number, so read the per-endpoint samples and use the lowest one.
    // With no samples yet, Math.min() is Infinity and the check passes.
    const rateLimit = await apolloRateLimitRemaining.get();
    const lowestRemaining = Math.min(
      ...rateLimit.values.map((sample) => sample.value)
    );
    checks.rateLimit = lowestRemaining > 10;

    // Check cache health
    // NOTE(review): an empty cache (e.g. right after startup) fails this
    // check and yields a 503 — confirm that is intended.
    const cacheStats = apolloCache.getStats();
    checks.cache = cacheStats.size > 0;

    const healthy = Object.values(checks).every(Boolean);

    res.status(healthy ? 200 : 503).json({
      status: healthy ? 'healthy' : 'degraded',
      checks,
      timestamp: new Date().toISOString(),
    });
  } catch (error: unknown) {
    // `checks` shows how far the probe got before the failure.
    res.status(503).json({
      status: 'unhealthy',
      error: error instanceof Error ? error.message : String(error),
      checks,
    });
  }
});

/**
 * GET /metrics — serves the Apollo registry in Prometheus exposition format.
 *
 * Serialization failures are answered with a 500 instead of being left as
 * an unhandled rejection inside the async handler, which could leave the
 * scrape request without a response.
 */
router.get('/metrics', async (req, res) => {
  try {
    // Serialize first so the content type is only set for a successful scrape.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error: unknown) {
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});

export default router;

Output

  • Prometheus metrics for all Apollo operations
  • Structured JSON logging with PII redaction
  • OpenTelemetry distributed tracing
  • Alerting rules for errors, rate limits, latency
  • Grafana dashboard configuration
  • Health check endpoints

Error Handling

| Issue | Resolution |
|-------|------------|
| Missing metrics | Verify instrumentation |
| Alert noise | Tune thresholds |
| Log volume | Adjust log levels |
| Trace gaps | Check propagation |

Resources

Next Steps

Proceed to apollo-incident-runbook for incident response.

Weekly Installs
1
Installed on
antigravity: 1