Add Prometheus + Grafana + Alertmanager monitoring stack

Full observability stack with Telegram alerting: Components: - Prometheus: metrics collection, 90-day retention - Grafana: dashboards at monitoring.performancewest.net - Alertmanager: routes alerts to Telegram bot - node-exporter: OS metrics (CPU, RAM, disk, network) - cAdvisor: container metrics (CPU, memory, restarts) - postgres-exporter: PostgreSQL connection/query metrics - nginx-exporter: request rate, 5xx errors, connections - blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks Alert rules: - Service down (HTTP probe, TCP port, container missing) - Container restart loops - High CPU/memory/disk/load - PostgreSQL down or high connections - SSL cert expiring (14d warning, 3d critical) - Slow HTTP responses, high 5xx rate Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-01 02:08:39 -05:00 · 2026-05-01 02:08:39 -05:00 · a4a5500bfc
commit a4a5500bfc
parent 97e8664cbf
13 changed files with 581 additions and 0 deletions
--- a/monitoring/alert_rules.yml
+++ b/monitoring/alert_rules.yml
@ -0,0 +1,162 @@
+groups:
+  # ══════════════════════════════════════════════════════════════════════
+  # Service Down Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: service_down
+    rules:
+      # Fires when the blackbox HTTP probe for a target has reported failure
+      # (probe_success == 0) continuously for 2 minutes.
+      - alert: EndpointDown
+        expr: >-
+          probe_success{job="blackbox_http"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: '{{ $labels.instance }} is DOWN'
+          description: 'HTTP probe failed for {{ $labels.instance }} for more than 2 minutes.'
+
+      # Fires when the blackbox TCP probe for a target has reported failure
+      # (probe_success == 0) continuously for 1 minute.
+      - alert: TCPPortDown
+        expr: >-
+          probe_success{job="blackbox_tcp"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'TCP port {{ $labels.instance }} is DOWN'
+          description: 'TCP connection failed to {{ $labels.instance }} for more than 1 minute.'
+
+      # Fires when an expected container has no container_last_seen series at all,
+      # or when its last-seen timestamp is more than 60 seconds old.
+      #
+      # Each container gets its own absent() with an equality matcher on purpose:
+      # absent() over a regex selector only returns a result when *no* series match,
+      # so a single missing container would go unnoticed while its siblings are
+      # still reported. Equality matchers also make absent() emit the `name` label
+      # that the annotations below rely on.
+      - alert: ContainerDown
+        expr: |
+          absent(container_last_seen{name="performancewest-api-1"})
+          or absent(container_last_seen{name="performancewest-site-1"})
+          or absent(container_last_seen{name="performancewest-workers-1"})
+          or absent(container_last_seen{name="performancewest-erpnext-1"})
+          or absent(container_last_seen{name="performancewest-listmonk-1"})
+          or absent(container_last_seen{name="performancewest-minio-1"})
+          or absent(container_last_seen{name="performancewest-umami-1"})
+          or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container {{ $labels.name }} is DOWN"
+          description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
+
+      # Fires when a container's start timestamp changed more than twice within
+      # 15 minutes and that condition has held for 5 minutes.
+      #
+      # container_start_time_seconds is a gauge holding a timestamp, so restarts are
+      # counted with changes(). increase() is meant for counters and would return the
+      # jump in seconds between two start times rather than the number of restarts.
+      - alert: ContainerRestarting
+        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container {{ $labels.name }} is restart-looping"
+          description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Host Resource Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: host_resources
+    rules:
+      # Fires when the per-instance average of non-idle CPU time (100 minus the
+      # idle rate over 5 minutes, in percent) stays above 85 for 10 minutes.
+      - alert: HighCPU
+        expr: >-
+          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High CPU usage ({{ $value | printf "%.1f" }}%)'
+          description: 'CPU usage has been above 85% for 10 minutes.'
+
+      # Fires when less than 10% of total memory is available for 5 minutes.
+      #
+      # node_exporter exposes the /proc/meminfo fields as node_memory_<Field>_bytes,
+      # hence node_memory_MemAvailable_bytes and node_memory_MemTotal_bytes. A
+      # selector with a misspelled metric name matches nothing and the alert would
+      # silently never fire.
+      - alert: HighMemory
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
+          description: "Memory usage has been above 90% for 5 minutes."
+
+      # Warning tier: the filesystem mounted at "/" has been more than 80% used
+      # (computed from available vs. total bytes) for 5 minutes.
+      - alert: DiskSpaceLow
+        expr: >-
+          (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Disk usage high ({{ $value | printf "%.1f" }}%)'
+          description: 'Root filesystem is more than 80% full.'
+
+      # Critical tier: the filesystem mounted at "/" has been more than 92% used
+      # for 2 minutes. Same formula as DiskSpaceLow with a higher threshold and a
+      # shorter hold time.
+      - alert: DiskSpaceCritical
+        expr: >-
+          (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Disk usage CRITICAL ({{ $value | printf "%.1f" }}%)'
+          description: 'Root filesystem is more than 92% full. Immediate action required.'
+
+      # Fires when node_load15 stays above 8 for 10 minutes. The threshold is an
+      # absolute value; it is not scaled by the number of CPU cores.
+      - alert: HighLoadAverage
+        expr: 'node_load15 > 8'
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High load average ({{ $value | printf "%.1f" }})'
+          description: '15-minute load average has been above 8 for 10 minutes.'
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Database Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: database
+    rules:
+      # Fires when pg_up has been 0 for 1 minute. The comparison only matches an
+      # existing pg_up series, so it yields nothing when no such series is scraped.
+      - alert: PostgresDown
+        expr: 'pg_up == 0'
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL is DOWN'
+          description: 'PostgreSQL exporter cannot connect to the database.'
+
+      # Fires when the total connection count of a scraped PostgreSQL instance stays
+      # above 80 for 5 minutes.
+      #
+      # pg_stat_activity_count is reported as separate series (per database and
+      # connection state), so the series are summed per instance before comparing.
+      # Without the sum the threshold would apply to each series on its own and a
+      # server with many moderately busy databases would never trip it.
+      - alert: PostgresHighConnections
+        expr: sum by (instance) (pg_stat_activity_count) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High PostgreSQL connections ({{ $value }})"
+          description: "PostgreSQL active connections exceeding 80."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # SSL Certificate Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: ssl
+    rules:
+      # Warning tier: the earliest certificate expiry reported by the probe is less
+      # than 14 days (14 * 24 * 3600 seconds) away, sustained for 1 hour.
+      # $value is the remaining lifetime in seconds, rendered via humanizeDuration.
+      - alert: SSLCertExpiringSoon
+        expr: 'probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600'
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: 'SSL cert expiring in < 14 days for {{ $labels.instance }}'
+          description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.'
+
+      # Critical tier: the earliest certificate expiry reported by the probe is less
+      # than 3 days (3 * 24 * 3600 seconds) away, sustained for 10 minutes.
+      # $value is the remaining lifetime in seconds, rendered via humanizeDuration.
+      - alert: SSLCertExpiryCritical
+        expr: 'probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600'
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'SSL cert expiring in < 3 days for {{ $labels.instance }}'
+          description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken.'
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Response Time Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: latency
+    rules:
+      # Fires when a complete HTTP probe takes longer than 5 seconds for 5 minutes.
+      #
+      # probe_duration_seconds covers the whole probe. Selecting only the "transfer"
+      # phase of probe_http_duration_seconds measures just the body download and
+      # misses time spent in DNS resolution, connect, TLS handshake and server
+      # processing, which is where a slow backend usually shows up.
+      - alert: SlowHTTPResponse
+        expr: probe_duration_seconds{job="blackbox_http"} > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
+          description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
+
+      # Fires when the 5-minute rate of nginx_http_requests_total series whose
+      # `status` label matches 5.. stays above 0.5 per second for 5 minutes.
+      # The comparison is applied to each matching series separately.
+      # NOTE(review): confirm the deployed nginx exporter attaches a `status` label
+      # to nginx_http_requests_total; if it does not, this selector matches nothing
+      # and the alert can never fire.
+      - alert: HighNginx5xxRate
+        expr: >-
+          rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High nginx 5xx error rate'
+          description: 'More than 0.5 req/s returning 5xx errors.'