Add Prometheus + Grafana + Alertmanager monitoring stack
Full observability stack with Telegram alerting: Components: - Prometheus: metrics collection, 90-day retention - Grafana: dashboards at monitoring.performancewest.net - Alertmanager: routes alerts to Telegram bot - node-exporter: OS metrics (CPU, RAM, disk, network) - cAdvisor: container metrics (CPU, memory, restarts) - postgres-exporter: PostgreSQL connection/query metrics - nginx-exporter: request rate, 5xx errors, connections - blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks Alert rules: - Service down (HTTP probe, TCP port, container missing) - Container restart loops - High CPU/memory/disk/load - PostgreSQL down or high connections - SSL cert expiring (14d warning, 3d critical) - Slow HTTP responses, high 5xx rate Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
97e8664cbf
commit
a4a5500bfc
13 changed files with 581 additions and 0 deletions
162
monitoring/alert_rules.yml
Normal file
162
monitoring/alert_rules.yml
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
groups:
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Service Down Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: service_down
|
||||
rules:
|
||||
# Critical: probe_success from the blackbox_http job has been 0 for 2 minutes.
- alert: EndpointDown
  expr: 'probe_success{job="blackbox_http"} == 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: '{{ $labels.instance }} is DOWN'
    description: >-
      HTTP probe failed for {{ $labels.instance }} for more than 2 minutes.
|
||||
|
||||
# Critical: probe_success from the blackbox_tcp job has been 0 for 1 minute.
- alert: TCPPortDown
  expr: 'probe_success{job="blackbox_tcp"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'TCP port {{ $labels.instance }} is DOWN'
    description: >-
      TCP connection failed to {{ $labels.instance }} for more than 1 minute.
|
||||
|
||||
# Critical: an expected performancewest container is missing or has gone stale.
#
# absent() over a regex selector only returns a result when *no* series match,
# so one stopped container would go unnoticed while its siblings were still
# reporting, and the result carries no `name` label for the annotations.
# Each container therefore gets its own equality-matched absent(), which fires
# individually and propagates the `name` label into the alert.
# The final clause catches containers whose series still exist but whose
# container_last_seen timestamp is more than 60 seconds old.
- alert: ContainerDown
  expr: |
    absent(container_last_seen{name="performancewest-api-1"})
    or absent(container_last_seen{name="performancewest-site-1"})
    or absent(container_last_seen{name="performancewest-workers-1"})
    or absent(container_last_seen{name="performancewest-erpnext-1"})
    or absent(container_last_seen{name="performancewest-listmonk-1"})
    or absent(container_last_seen{name="performancewest-minio-1"})
    or absent(container_last_seen{name="performancewest-umami-1"})
    or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Container {{ $labels.name }} is DOWN"
    description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
|
||||
|
||||
# Warning: a performancewest container restarted more than twice in 15 minutes.
#
# container_start_time_seconds is a timestamp gauge, not a counter. increase()
# on it yields the growth of the timestamp in seconds, so a single restart
# already exceeded the "> 2" threshold. changes() counts how many times the
# value changed inside the window, which is the number of restarts.
- alert: ContainerRestarting
  expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Container {{ $labels.name }} is restart-looping"
    description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Host Resource Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: host_resources
|
||||
rules:
|
||||
# Warning: per-instance CPU utilisation (100 minus the averaged idle rate over
# 5m, as a percentage) has stayed above 85 for 10 minutes.
- alert: HighCPU
  expr: >-
    100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
    > 85
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'High CPU usage ({{ $value | printf "%.1f" }}%)'
    description: 'CPU usage has been above 85% for 10 minutes.'
|
||||
|
||||
# Warning: memory utilisation has stayed above 90% for 5 minutes.
#
# Utilisation is computed as (1 - available / total) * 100. The metric names
# follow node_exporter's meminfo naming (node_memory_<Field>_bytes); the
# previous names (node_memory_AvailableBytes / node_memory_MemTotalBytes) are
# not exported, so the expression returned no data and the alert never fired.
- alert: HighMemory
  expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
    description: "Memory usage has been above 90% for 5 minutes."
|
||||
|
||||
# Warning: used share of the "/" mountpoint (1 - avail / size, as a percentage)
# has been above 80 for 5 minutes.
- alert: DiskSpaceLow
  expr: >-
    (1 - node_filesystem_avail_bytes{mountpoint="/"}
    / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Disk usage high ({{ $value | printf "%.1f" }}%)'
    description: 'Root filesystem is more than 80% full.'
|
||||
|
||||
# Critical: same used-share calculation as the 80% warning rule, but with a
# 92% threshold and a shorter 2-minute hold.
- alert: DiskSpaceCritical
  expr: >-
    (1 - node_filesystem_avail_bytes{mountpoint="/"}
    / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Disk usage CRITICAL ({{ $value | printf "%.1f" }}%)'
    description: >-
      Root filesystem is more than 92% full.
      Immediate action required.
|
||||
|
||||
# Warning: node_load15 has stayed above the fixed threshold of 8 for 10 minutes.
- alert: HighLoadAverage
  expr: 'node_load15 > 8'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'High load average ({{ $value | printf "%.1f" }})'
    description: >-
      15-minute load average has been above 8 for 10 minutes.
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Database Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: database
|
||||
rules:
|
||||
# Critical: pg_up has been 0 for 1 minute.
- alert: PostgresDown
  expr: 'pg_up == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'PostgreSQL is DOWN'
    description: >-
      PostgreSQL exporter cannot connect to the database.
|
||||
|
||||
# Warning: total PostgreSQL connections on an instance have exceeded 80 for
# 5 minutes.
#
# pg_stat_activity_count is reported as several series per instance
# (NOTE(review): presumably split by database and connection state — confirm
# against the deployed postgres_exporter). Comparing each series to 80 on its
# own lets the combined total pass the threshold without alerting, so the
# series are summed per instance before the comparison.
- alert: PostgresHighConnections
  expr: sum by (instance) (pg_stat_activity_count) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High PostgreSQL connections ({{ $value }})"
    description: "PostgreSQL active connections exceeding 80."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# SSL Certificate Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: ssl
|
||||
rules:
|
||||
# Warning: the earliest certificate expiry reported by the probe is less than
# 14 days (14 * 24 * 3600 seconds) away, sustained for 1 hour.
- alert: SSLCertExpiringSoon
  expr: 'probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'SSL cert expiring in < 14 days for {{ $labels.instance }}'
    description: >-
      Certificate for {{ $labels.instance }} expires in
      {{ $value | humanizeDuration }}.
|
||||
|
||||
# Critical: the earliest certificate expiry reported by the probe is less than
# 3 days (3 * 24 * 3600 seconds) away, sustained for 10 minutes.
- alert: SSLCertExpiryCritical
  expr: 'probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600'
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: 'SSL cert expiring in < 3 days for {{ $labels.instance }}'
    description: >-
      Certificate for {{ $labels.instance }} expires in
      {{ $value | humanizeDuration }}.
      Certbot renewal may be broken.
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Response Time Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: latency
|
||||
rules:
|
||||
# Warning: an HTTP probe has taken longer than 5 seconds end-to-end for
# 5 minutes.
#
# The previous expression looked only at the "transfer" phase of
# probe_http_duration_seconds, i.e. the time spent downloading the body, so
# slowness in DNS, connect, TLS or server processing was not counted.
# probe_duration_seconds is the total probe time and matches the
# "response time" wording in the annotations.
# NOTE(review): this can only fire if the blackbox probe timeout is above
# 5 seconds; otherwise slow probes fail instead — confirm the scrape/module
# timeout.
- alert: SlowHTTPResponse
  expr: probe_duration_seconds{job="blackbox_http"} > 5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
    description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
|
||||
|
||||
# Warning: the 5-minute rate of nginx_http_requests_total series whose status
# label matches 5.. has been above 0.5 per second for 5 minutes.
# NOTE(review): this relies on the nginx exporter attaching a `status` label
# to nginx_http_requests_total — confirm; exporters that only read
# stub_status may not expose it, in which case this selector matches nothing.
- alert: HighNginx5xxRate
  expr: 'rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High nginx 5xx error rate'
    description: >-
      More than 0.5 req/s returning 5xx errors.
|
||||
Loading…
Add table
Add a link
Reference in a new issue