Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
40 lines
1 KiB
YAML
40 lines
1 KiB
YAML
# Alertmanager configuration: every alert is delivered to the single
# "telegram" receiver; severity only changes how often a still-firing
# alert is re-sent.
global:
  # An alert that carries no end time is treated as resolved once it has
  # not been refreshed for this long.
  resolve_timeout: 5m

route:
  # Default receiver and timings for alerts that match no child route.
  receiver: telegram
  # Alerts sharing these labels are batched into one notification.
  group_by: [alertname, instance]
  # Delay before the first notification of a new group.
  group_wait: 30s
  # Minimum gap between notifications when a group gains new alerts.
  group_interval: 5m
  # Re-send interval for alerts that are still firing.
  repeat_interval: 4h
  routes:
    # `matchers` replaces the deprecated `match` key; same equality test.
    - matchers:
        - 'severity="critical"'
      receiver: telegram
      repeat_interval: 1h
    - matchers:
        - 'severity="warning"'
      receiver: telegram
      repeat_interval: 6h

receivers:
  - name: telegram
    telegram_configs:
      # NOTE(review): Alertmanager does not expand environment variables in
      # its config file, so the ${...} placeholders are presumably rendered
      # (envsubst or similar) before this file is loaded - confirm.
      - bot_token: "${TELEGRAM_BOT_TOKEN}"
        # Left unquoted on purpose: chat_id must be an integer once the
        # placeholder has been substituted.
        chat_id: ${TELEGRAM_CHAT_ID}
        parse_mode: HTML
        # One message per group: a status header, then name, summary and
        # optional description for each alert, then a server footer.
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .Status | toUpper }}</b>
          {{ range .Alerts }}
          <b>{{ .Labels.alertname }}</b>
          {{ .Annotations.summary }}
          {{ if .Annotations.description }}<i>{{ .Annotations.description }}</i>{{ end }}
          {{ end }}
          <code>Server: pw-server | {{ .ExternalURL }}</code>

inhibit_rules:
  # While a critical alert is firing, mute warning alerts that carry the
  # same alertname and instance. `source_matchers` / `target_matchers`
  # replace the deprecated `source_match` / `target_match` keys.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: [alertname, instance]