---
# Prometheus alerting rules.
#
# Rule groups in this file:
#   service_down   - blackbox probe failures and container liveness
#   host_resources - CPU, memory, root-filesystem and load thresholds
#   database       - PostgreSQL availability and connection count
#   ssl            - TLS certificate expiry
#   latency        - slow HTTP probes and nginx 5xx rate
#
# Every rule sets a `severity` label of either `warning` or `critical`.
# NOTE(review): metric names suggest blackbox_exporter, cAdvisor,
# node_exporter, postgres_exporter and an nginx exporter as sources;
# confirm against the scrape configuration.
groups:
  # ══════════════════════════════════════════════════════════════════════
  # Service Down Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: service_down
    rules:
      - alert: EndpointDown
        expr: probe_success{job="blackbox_http"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} is DOWN"
          description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."

      - alert: TCPPortDown
        expr: probe_success{job="blackbox_tcp"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TCP port {{ $labels.instance }} is DOWN"
          description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."

      # One absent() per container, each with an equality matcher.
      # absent() over a regex matcher only returns a result when *no*
      # matching series exists at all (i.e. every container is gone) and
      # it cannot carry the `name` label, which would leave
      # {{ $labels.name }} empty. With an equality matcher the label is
      # propagated and each container is checked individually.
      # The final clause catches series that still exist but have a
      # last-seen timestamp older than 60 seconds.
      # Keep the explicit list and the regex in sync when adding services.
      - alert: ContainerDown
        expr: |
          absent(container_last_seen{name="performancewest-api-1"})
            or absent(container_last_seen{name="performancewest-site-1"})
            or absent(container_last_seen{name="performancewest-workers-1"})
            or absent(container_last_seen{name="performancewest-erpnext-1"})
            or absent(container_last_seen{name="performancewest-listmonk-1"})
            or absent(container_last_seen{name="performancewest-minio-1"})
            or absent(container_last_seen{name="performancewest-umami-1"})
            or time() - container_last_seen{
              name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"
            } > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is DOWN"
          description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."

      # container_start_time_seconds is a gauge holding a Unix timestamp,
      # so increase() would return elapsed seconds rather than a restart
      # count. changes() counts how many times the value changed inside
      # the window, i.e. the number of restarts.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is restart-looping"
          description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."

  # ══════════════════════════════════════════════════════════════════════
  # Host Resource Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: host_resources
    rules:
      # Busy percentage = 100 minus the per-instance average idle rate.
      - alert: HighCPU
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
          description: "CPU usage has been above 85% for 10 minutes."

      # Uses the node_exporter >= 0.16 metric names, matching the other
      # node_* metrics in this file (node_cpu_seconds_total,
      # node_filesystem_avail_bytes).
      - alert: HighMemory
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
          description: "Memory usage has been above 90% for 5 minutes."

      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
          description: "Root filesystem is more than 80% full."

      # Same expression as DiskSpaceLow with a higher threshold and a
      # shorter hold time; both alerts fire together above 92%.
      - alert: DiskSpaceCritical
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
          description: "Root filesystem is more than 92% full. Immediate action required."

      # NOTE(review): the threshold of 8 is absolute, not scaled by CPU
      # count; confirm it suits the hosts being monitored.
      - alert: HighLoadAverage
        expr: node_load15 > 8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average ({{ $value | printf \"%.1f\" }})"
          description: "15-minute load average has been above 8 for 10 minutes."

  # ══════════════════════════════════════════════════════════════════════
  # Database Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: database
    rules:
      - alert: PostgresDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is DOWN"
          description: "PostgreSQL exporter cannot connect to the database."

      # NOTE(review): the comparison is applied to each series on its own.
      # If the exporter splits this metric by database/state, the total
      # can exceed 80 without any single series doing so; consider
      # wrapping it in sum() - verify against the exporter's labels.
      - alert: PostgresHighConnections
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High PostgreSQL connections ({{ $value }})"
          description: "PostgreSQL active connections exceeding 80."

  # ══════════════════════════════════════════════════════════════════════
  # SSL Certificate Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: ssl
    rules:
      # $value is the number of seconds left until expiry, which is why
      # it is rendered with humanizeDuration.
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}"
          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."

      - alert: SSLCertExpiryCritical
        expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}"
          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."

  # ══════════════════════════════════════════════════════════════════════
  # Response Time Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: latency
    rules:
      # NOTE(review): only the "transfer" phase is compared against the
      # 5 s threshold, while the description speaks of overall response
      # time - confirm whether the other phases should be included.
      - alert: SlowHTTPResponse
        expr: probe_http_duration_seconds{phase="transfer", job="blackbox_http"} > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
          description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."

      # NOTE(review): the rate is evaluated per series (e.g. per status
      # code), not summed, and relies on the exporter exposing a `status`
      # label - verify both against the nginx exporter in use.
      - alert: HighNginx5xxRate
        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High nginx 5xx error rate"
          description: "More than 0.5 req/s returning 5xx errors."