infrastructure
Installation
SKILL.md
Grafana Cloud Infrastructure Monitoring
Docs: https://grafana.com/docs/grafana-cloud/monitor-infrastructure/
Kubernetes Monitoring (k8s-monitoring Helm Chart)
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# values.yaml
cluster:
name: production-us-east
externalServices:
prometheus:
host: https://prometheus-prod-xx.grafana.net
basicAuth:
username: "123456"
password:
secretName: grafana-cloud-secret
secretKey: api-key
loki:
host: https://logs-prod-xx.grafana.net
basicAuth:
username: "234567"
password:
secretName: grafana-cloud-secret
secretKey: api-key
tempo:
host: https://tempo-prod-xx.grafana.net:443
basicAuth:
username: "345678"
password:
secretName: grafana-cloud-secret
secretKey: api-key
metrics:
enabled: true
cost:
enabled: true # Kubernetes cost monitoring
podMonitors:
enabled: true
serviceMonitors:
enabled: true
kube-state-metrics:
enabled: true
node-exporter:
enabled: true
cadvisor:
enabled: true
logs:
pod_logs:
enabled: true
cluster_events:
enabled: true
traces:
enabled: true
profiles:
enabled: false
receivers:
grpc:
enabled: true
port: 4317
http:
enabled: true
port: 4318
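# The namespace must exist before the secret can be created in it
kubectl create namespace monitoring
kubectl create secret generic grafana-cloud-secret \
--from-literal=api-key='<your-api-key>' \
-n monitoring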
helm install k8s-monitoring grafana/k8s-monitoring \
-n monitoring --create-namespace \
-f values.yaml
Key Kubernetes Metrics
# CPU usage by pod
sum(rate(container_cpu_usage_seconds_total{
namespace="$namespace", container!=""}[5m])) by (pod)
# Memory usage by pod
sum(container_memory_working_set_bytes{
namespace="$namespace", container!=""}) by (pod)
# Node CPU utilization (fraction of time not idle, per instance)
1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
# Pod restarts
increase(kube_pod_container_status_restarts_total[1h])
# Deployment readiness
kube_deployment_status_replicas_ready / kube_deployment_spec_replicas
# PVC usage
kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
AWS CloudWatch Integration
// Alloy config for AWS CloudWatch scraping
prometheus.scrape "cloudwatch" {
targets = [{__address__ = "cloudwatch-exporter:9106"}]
forward_to = [prometheus.remote_write.cloud.receiver]
}
Or use the CloudWatch datasource directly:
# provisioning/datasources/cloudwatch.yaml
apiVersion: 1
datasources:
- name: CloudWatch
type: cloudwatch
jsonData:
defaultRegion: us-east-1
authType: default # uses EC2 instance role / ECS task role
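# Or explicit access key / secret key (uses secureJsonData below;
# "credentials" would instead select a shared credentials-file profile):
# authType: keys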
secureJsonData:
accessKey: AKIAIOSFODNN7EXAMPLE
secretKey: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
Azure Monitor Integration
# provisioning/datasources/azure.yaml
apiVersion: 1
datasources:
- name: Azure Monitor
type: grafana-azure-monitor-datasource
jsonData:
cloudName: AzureCloud
tenantId: your-tenant-id
clientId: your-client-id
secureJsonData:
clientSecret: your-client-secret
GCP / Google Cloud Monitoring
# provisioning/datasources/google.yaml
apiVersion: 1
datasources:
- name: Google Cloud Monitoring
type: stackdriver
jsonData:
authenticationType: gce # uses GCE metadata server
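# Or JWT (service account key); set these in jsonData instead of gce:
# authenticationType: jwt
# tokenUri: https://oauth2.googleapis.com/token
# clientEmail: your-sa@your-project.iam.gserviceaccount.com
# defaultProject: your-project-id
secureJsonData:
# Only the "private_key" field of the service account JSON, not the whole file
privateKey: |
-----BEGIN PRIVATE KEY-----
...
-----END PRIVATE KEY-----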
Node Exporter / Linux Host Monitoring
// Alloy config for Linux host metrics
prometheus.exporter.unix "host" {
rootfs_path = "/"
enable_collectors = ["cpu", "diskstats", "filesystem", "loadavg", "meminfo", "netdev", "stat", "time", "uname"]
}
prometheus.scrape "node" {
targets = prometheus.exporter.unix.host.targets
forward_to = [prometheus.remote_write.cloud.receiver]
scrape_interval = "60s"
}
Docker / Container Monitoring
// cAdvisor metrics via Alloy
prometheus.scrape "cadvisor" {
targets = [{"__address__" = "localhost:8080"}]
metrics_path = "/metrics"
forward_to = [prometheus.remote_write.cloud.receiver]
}
// Docker container logs
loki.source.docker "containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.docker.containers.targets
forward_to = [loki.write.cloud.receiver]
}
discovery.docker "containers" {
host = "unix:///var/run/docker.sock"
}
Common Infrastructure Dashboards (Grafana Cloud)
Pre-built dashboards available from the integrations catalog:
- Kubernetes / Cluster (ID: 15520)
- Kubernetes / Namespace (ID: 15521)
- Kubernetes / Pod (ID: 15522)
- Node Exporter Full (ID: 1860)
- cAdvisor (ID: 14282)
- AWS EC2 (via CloudWatch integration)
- Azure VMs (via Azure Monitor integration)
Alerting for Infrastructure
# Common infrastructure alert rules
groups:
- name: kubernetes-alerts
rules:
- alert: PodCrashLooping
expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} crash looping"
- alert: NodeMemoryPressure
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "Node {{ $labels.instance }} low memory (<10% free)"
- alert: PersistentVolumeAlmostFull
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes < 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} almost full"
Related skills