grafana-prometheus
Grafana + Prometheus Monitoring
Complete observability stack with Prometheus metrics collection, PromQL queries, and Grafana visualization dashboards.
Direct Control (CLI / API / Scripting)
Prometheus Configuration
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'production'
environment: 'prod'
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules
rule_files:
- "alerts/*.yml"
- "recording_rules/*.yml"
# Scrape configurations
scrape_configs:
# Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Node exporter (system metrics)
- job_name: 'node'
static_configs:
- targets:
- 'node1:9100'
- 'node2:9100'
- 'node3:9100'
labels:
env: 'production'
# Kubernetes service discovery
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
# Application metrics
- job_name: 'app'
static_configs:
- targets:
- 'app1:8080'
- 'app2:8080'
metrics_path: '/metrics'
scrape_interval: 10s
# Blackbox exporter (endpoint monitoring)
- job_name: 'blackbox'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- https://example.com
- https://api.example.com/health
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
PromQL Queries
# CPU Usage
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory Usage Percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Disk Space Available
node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100
# HTTP Request Rate
rate(http_requests_total[5m])
# HTTP Request Rate by Status Code
sum by (status) (rate(http_requests_total[5m]))
# 95th Percentile Request Duration
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
# Error Rate
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100
# Pod Restarts (Kubernetes)
sum by (namespace, pod) (kube_pod_container_status_restarts_total)
# Network Traffic
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])
# Active Alerts
sum by (alertname, severity) (ALERTS{alertstate="firing"})
# Query per second by database
sum by (database) (rate(mysql_global_status_queries[5m]))
# Container CPU Usage
sum by (pod_name) (rate(container_cpu_usage_seconds_total[5m]))
# Top 10 endpoints by request count
topk(10, sum by (endpoint) (rate(http_requests_total[1h])))
# SLA calculation (uptime percentage)
avg_over_time((up{job="api"}[30d])) * 100
Prometheus HTTP API (cURL)
# Query instant value
curl -G http://localhost:9090/api/v1/query \
--data-urlencode 'query=up' \
--data-urlencode 'time=2024-01-01T20:10:30.781Z'
# Query range
curl -G http://localhost:9090/api/v1/query_range \
--data-urlencode 'query=rate(http_requests_total[5m])' \
--data-urlencode 'start=2024-01-01T00:00:00Z' \
--data-urlencode 'end=2024-01-01T23:59:59Z' \
--data-urlencode 'step=15s'
# Get series labels
curl -G http://localhost:9090/api/v1/series \
--data-urlencode 'match[]=up' \
--data-urlencode 'start=2024-01-01T00:00:00Z' \
--data-urlencode 'end=2024-01-01T23:59:59Z'
# Get label values
curl http://localhost:9090/api/v1/label/job/values
# Get targets
curl http://localhost:9090/api/v1/targets
# Get alerts
curl http://localhost:9090/api/v1/alerts
# Get rules
curl http://localhost:9090/api/v1/rules
# Health check
curl http://localhost:9090/-/healthy
# Reload configuration
curl -X POST http://localhost:9090/-/reload
Grafana HTTP API
# Authentication
export GRAFANA_TOKEN="your-api-token"
export GRAFANA_URL="http://localhost:3000"
# Create dashboard
curl -X POST "$GRAFANA_URL/api/dashboards/db" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d @dashboard.json
# Get dashboard by UID
curl "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
-H "Authorization: Bearer $GRAFANA_TOKEN"
# Search dashboards
curl "$GRAFANA_URL/api/search?query=production&type=dash-db" \
-H "Authorization: Bearer $GRAFANA_TOKEN"
# Delete dashboard
curl -X DELETE "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
-H "Authorization: Bearer $GRAFANA_TOKEN"
# Create data source
curl -X POST "$GRAFANA_URL/api/datasources" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"name": "Prometheus",
"type": "prometheus",
"url": "http://prometheus:9090",
"access": "proxy",
"isDefault": true
}'
# List data sources
curl "$GRAFANA_URL/api/datasources" \
-H "Authorization: Bearer $GRAFANA_TOKEN"
# Create organization
curl -X POST "$GRAFANA_URL/api/orgs" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{"name": "Production Org"}'
# Add user to organization
curl -X POST "$GRAFANA_URL/api/orgs/1/users" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{"loginOrEmail": "user@example.com", "role": "Viewer"}'
# Create API key
curl -X POST "$GRAFANA_URL/api/auth/keys" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{"name": "automation-key", "role": "Admin", "secondsToLive": 86400}'
# Create alert notification channel
curl -X POST "$GRAFANA_URL/api/alert-notifications" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"name": "Slack Alerts",
"type": "slack",
"isDefault": true,
"settings": {
"url": "https://hooks.slack.com/services/xxx/yyy/zzz",
"recipient": "#alerts"
}
}'
# Create folder
curl -X POST "$GRAFANA_URL/api/folders" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{"title": "Production Dashboards"}'
# Snapshot dashboard
curl -X POST "$GRAFANA_URL/api/snapshots" \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d @snapshot.json
# Health check
curl "$GRAFANA_URL/api/health"
Alert Rules Configuration
# alerts/app_alerts.yml
groups:
- name: application
interval: 30s
rules:
# High error rate
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/ sum(rate(http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
team: backend
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"
# API latency
- alert: HighLatency
expr: |
histogram_quantile(0.95,
sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
) > 1
for: 10m
labels:
severity: warning
team: backend
annotations:
summary: "API latency is high"
description: "95th percentile latency is {{ $value }}s"
# Service down
- alert: ServiceDown
expr: up{job="app"} == 0
for: 2m
labels:
severity: critical
team: sre
annotations:
summary: "Service {{ $labels.instance }} is down"
description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes"
- name: infrastructure
interval: 1m
rules:
# High CPU
- alert: HighCPU
expr: |
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
team: sre
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is {{ $value }}%"
# Low disk space
- alert: LowDiskSpace
expr: |
(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
for: 5m
labels:
severity: warning
team: sre
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: "Only {{ $value }}% disk space remaining"
# High memory
- alert: HighMemory
expr: |
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
for: 5m
labels:
severity: critical
team: sre
annotations:
summary: "High memory usage on {{ $labels.instance }}"
description: "Memory usage is {{ $value }}%"
Python Client (Prometheus & Grafana)
import requests
from datetime import datetime, timedelta
class PrometheusClient:
def __init__(self, url="http://localhost:9090"):
self.url = url
def query(self, promql):
"""Execute instant query"""
response = requests.get(
f"{self.url}/api/v1/query",
params={"query": promql}
)
return response.json()
def query_range(self, promql, start, end, step="15s"):
"""Execute range query"""
response = requests.get(
f"{self.url}/api/v1/query_range",
params={
"query": promql,
"start": start.isoformat(),
"end": end.isoformat(),
"step": step
}
)
return response.json()
def get_targets(self):
"""Get scrape targets"""
response = requests.get(f"{self.url}/api/v1/targets")
return response.json()
class GrafanaClient:
def __init__(self, url, token):
self.url = url
self.headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json"
}
def create_dashboard(self, dashboard_json):
"""Create or update dashboard"""
response = requests.post(
f"{self.url}/api/dashboards/db",
headers=self.headers,
json={"dashboard": dashboard_json, "overwrite": True}
)
return response.json()
def get_dashboard(self, uid):
"""Get dashboard by UID"""
response = requests.get(
f"{self.url}/api/dashboards/uid/{uid}",
headers=self.headers
)
return response.json()
def search_dashboards(self, query="", tags=None):
"""Search dashboards"""
params = {"query": query}
if tags:
params["tag"] = tags
response = requests.get(
f"{self.url}/api/search",
headers=self.headers,
params=params
)
return response.json()
def create_annotation(self, dashboard_uid, time, text, tags=None):
"""Create annotation"""
data = {
"dashboardUID": dashboard_uid,
"time": int(time.timestamp() * 1000),
"text": text,
"tags": tags or []
}
response = requests.post(
f"{self.url}/api/annotations",
headers=self.headers,
json=data
)
return response.json()
# Usage example
prom = PrometheusClient("http://prometheus:9090")
grafana = GrafanaClient("http://grafana:3000", "your-api-token")
# Query CPU usage
result = prom.query('100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)')
print(f"CPU Usage: {result['data']['result'][0]['value'][1]}%")
# Get dashboard
dashboard = grafana.get_dashboard("my-dashboard-uid")
MCP Server Integration
Configuration (.codebuddy/mcp.json)
{
"mcpServers": {
"grafana": {
"command": "npx",
"args": ["-y", "@grafana/mcp-server"],
"env": {
"GRAFANA_URL": "http://localhost:3000",
"GRAFANA_API_TOKEN": "your-api-token-here"
}
}
}
}
Available MCP Tools
grafana_create_dashboard
- Creates or updates a Grafana dashboard
- Parameters:
dashboard(JSON object),folder_id(optional),overwrite(boolean) - Returns: Dashboard UID, URL, version
grafana_get_dashboard
- Retrieves dashboard by UID
- Parameters:
uid(string) - Returns: Full dashboard JSON with metadata
grafana_search_dashboards
- Searches dashboards by query and tags
- Parameters:
query(string),tags(array),folder_ids(array) - Returns: List of matching dashboards
grafana_delete_dashboard
- Deletes a dashboard
- Parameters:
uid(string) - Returns: Deletion confirmation
grafana_create_datasource
- Creates a new data source
- Parameters:
name(string),type(string),url(string),settings(object) - Returns: Data source ID and details
grafana_query_datasource
- Executes query against data source
- Parameters:
datasource_uid(string),query(string),time_range(object) - Returns: Query results
grafana_create_alert
- Creates alert rule
- Parameters:
rule_name(string),folder_uid(string),condition(object),notifications(array) - Returns: Alert rule ID
grafana_list_alerts
- Lists all alert rules
- Parameters:
folder_uid(optional),state(optional) - Returns: Array of alert rules
grafana_create_annotation
- Creates dashboard annotation
- Parameters:
dashboard_uid(string),time(timestamp),text(string),tags(array) - Returns: Annotation ID
prometheus_query
- Executes PromQL instant query
- Parameters:
query(string),time(optional timestamp) - Returns: Query result values
prometheus_query_range
- Executes PromQL range query
- Parameters:
query(string),start(timestamp),end(timestamp),step(string) - Returns: Time series data
prometheus_get_metrics
- Lists available metrics
- Parameters:
filter(optional string) - Returns: Array of metric names
prometheus_get_targets
- Lists scrape targets and their status
- Parameters: None
- Returns: Target health and labels
Common Workflows
1. Complete Monitoring Stack Setup
# Docker Compose setup
cat > docker-compose.yml <<'EOF'
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./alerts:/etc/prometheus/alerts
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-lifecycle'
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
node-exporter:
image: prom/node-exporter:latest
ports:
- "9100:9100"
command:
- '--path.rootfs=/host'
volumes:
- '/:/host:ro,rslave'
alertmanager:
image: prom/alertmanager:latest
ports:
- "9093:9093"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
volumes:
prometheus-data:
grafana-data:
EOF
# Start stack
docker-compose up -d
# Wait for services
sleep 10
# Create Grafana API token
GRAFANA_TOKEN=$(curl -X POST http://admin:admin@localhost:3000/api/auth/keys \
-H "Content-Type: application/json" \
-d '{"name": "automation", "role": "Admin"}' | jq -r '.key')
# Add Prometheus data source
curl -X POST http://localhost:3000/api/datasources \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"name": "Prometheus",
"type": "prometheus",
"url": "http://prometheus:9090",
"access": "proxy",
"isDefault": true
}'
echo "Monitoring stack ready!"
echo "Grafana: http://localhost:3000 (admin/admin)"
echo "Prometheus: http://localhost:9090"
2. Create Custom Application Dashboard
# Generate dashboard JSON
cat > app-dashboard.json <<'EOF'
{
"dashboard": {
"title": "Application Metrics",
"tags": ["application", "production"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (status)",
"legendFormat": "{{status}}",
"refId": "A"
}
],
"yaxes": [
{"format": "reqps", "label": "Requests/sec"},
{"format": "short"}
]
},
{
"id": 2,
"title": "Error Rate",
"type": "singlestat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 8},
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
"refId": "A"
}
],
"format": "percent",
"thresholds": "1,5",
"colors": ["#299c46", "#e5ac0e", "#bf1b00"]
},
{
"id": 3,
"title": "Response Time (p95)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))",
"legendFormat": "{{endpoint}}",
"refId": "A"
}
],
"yaxes": [
{"format": "s", "label": "Duration"},
{"format": "short"}
]
},
{
"id": 4,
"title": "Active Connections",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(app_active_connections) by (instance)",
"legendFormat": "{{instance}}",
"refId": "A"
}
]
}
],
"refresh": "30s",
"time": {"from": "now-6h", "to": "now"}
},
"overwrite": true
}
EOF
# Upload dashboard
curl -X POST http://localhost:3000/api/dashboards/db \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d @app-dashboard.json
3. Set Up Alerting Pipeline
# Configure Alertmanager
cat > alertmanager.yml <<'EOF'
global:
resolve_timeout: 5m
slack_api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'
route:
group_by: ['alertname', 'cluster']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
routes:
- match:
severity: critical
receiver: 'pagerduty'
continue: true
- match:
severity: warning
receiver: 'slack'
receivers:
- name: 'default'
slack_configs:
- channel: '#alerts'
title: 'Alert: {{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
- name: 'slack'
slack_configs:
- channel: '#alerts-warning'
title: 'Warning: {{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
send_resolved: true
- name: 'pagerduty'
pagerduty_configs:
- service_key: 'your-pagerduty-key'
description: '{{ .GroupLabels.alertname }}'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
EOF
# Reload Alertmanager
curl -X POST http://localhost:9093/-/reload
# Test alert
curl -X POST http://localhost:9093/api/v1/alerts \
-H "Content-Type: application/json" \
-d '[
{
"labels": {
"alertname": "TestAlert",
"severity": "warning"
},
"annotations": {
"summary": "This is a test alert"
}
}
]'
# Verify alert rules
curl http://localhost:9090/api/v1/rules | jq '.data.groups[].rules[] | select(.type=="alerting")'
# Check active alerts
curl http://localhost:9090/api/v1/alerts | jq '.data.alerts[] | select(.state=="firing")'
4. Instrument Application with Metrics
# Python Flask application with Prometheus metrics
from flask import Flask, request
from prometheus_client import Counter, Histogram, Gauge, generate_latest, REGISTRY
import time
app = Flask(__name__)
# Metrics
REQUEST_COUNT = Counter(
'http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status']
)
REQUEST_DURATION = Histogram(
'http_request_duration_seconds',
'HTTP request duration',
['method', 'endpoint']
)
ACTIVE_REQUESTS = Gauge(
'http_requests_active',
'Active HTTP requests'
)
# Middleware
@app.before_request
def before_request():
request.start_time = time.time()
ACTIVE_REQUESTS.inc()
@app.after_request
def after_request(response):
duration = time.time() - request.start_time
REQUEST_COUNT.labels(
method=request.method,
endpoint=request.endpoint or 'unknown',
status=response.status_code
).inc()
REQUEST_DURATION.labels(
method=request.method,
endpoint=request.endpoint or 'unknown'
).observe(duration)
ACTIVE_REQUESTS.dec()
return response
# Metrics endpoint
@app.route('/metrics')
def metrics():
return generate_latest(REGISTRY)
# Application endpoints
@app.route('/api/users')
def users():
return {'users': []}
@app.route('/health')
def health():
return {'status': 'healthy'}
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8080)
# Add to Prometheus scrape config
cat >> prometheus.yml <<'EOF'
- job_name: 'my-app'
static_configs:
- targets: ['app:8080']
metrics_path: '/metrics'
scrape_interval: 10s
EOF
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
5. Performance Analysis and Troubleshooting
# Check if metrics are being scraped
curl 'http://localhost:9090/api/v1/query?query=up{job="app"}'
# Analyze request patterns
curl -G 'http://localhost:9090/api/v1/query' \
--data-urlencode 'query=topk(10, sum by (endpoint) (rate(http_requests_total[1h])))'
# Find slow endpoints
curl -G 'http://localhost:9090/api/v1/query' \
--data-urlencode 'query=histogram_quantile(0.99, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m]))) > 1'
# Memory leak detection
curl -G 'http://localhost:9090/api/v1/query_range' \
--data-urlencode 'query=process_resident_memory_bytes{job="app"}' \
--data-urlencode 'start=2024-01-01T00:00:00Z' \
--data-urlencode 'end=2024-01-01T23:59:59Z' \
--data-urlencode 'step=1h' | jq '.data.result[0].values'
# CPU spike investigation
curl -G 'http://localhost:9090/api/v1/query_range' \
--data-urlencode 'query=rate(process_cpu_seconds_total{job="app"}[5m])' \
--data-urlencode 'start=2024-01-01T10:00:00Z' \
--data-urlencode 'end=2024-01-01T11:00:00Z' \
--data-urlencode 'step=30s'
# Correlate errors with deployments (using annotations)
curl -X POST http://localhost:3000/api/annotations \
-H "Authorization: Bearer $GRAFANA_TOKEN" \
-H "Content-Type: application/json" \
-d "{
\"time\": $(date +%s)000,
\"text\": \"Deployed v2.1.0\",
\"tags\": [\"deployment\", \"v2.1.0\"]
}"
# Export metrics for analysis
curl -G 'http://localhost:9090/api/v1/query_range' \
--data-urlencode 'query=http_requests_total' \
--data-urlencode 'start=2024-01-01T00:00:00Z' \
--data-urlencode 'end=2024-01-01T23:59:59Z' \
--data-urlencode 'step=5m' | jq -r '.data.result[] | [.metric.instance, .values[][1]] | @csv' > metrics.csv
More from phuetz/code-buddy
blender
Blender 3D modeling, animation, and rendering automation via Python bpy scripting and CLI
19figma
Automate Figma design workflows via REST API, Plugin API, and MCP integration
3github
Interact with GitHub using the gh CLI for issues, PRs, CI runs, releases, and API queries
3gif-search
Search and download GIFs from Tenor and Giphy APIs
3ableton-live
Ableton Live music production automation via OSC protocol, MIDI, and Max for Live
3gitlab
GitLab DevOps platform with CI/CD pipelines, API automation, and glab CLI control
3