# Full observability stack with Telegram alerting:
#
# Components:
#   - Prometheus: metrics collection, 90-day retention
#   - Grafana: dashboards at monitoring.performancewest.net
#   - Alertmanager: routes alerts to Telegram bot
#   - node-exporter: OS metrics (CPU, RAM, disk, network)
#   - cAdvisor: container metrics (CPU, memory, restarts)
#   - postgres-exporter: PostgreSQL connection/query metrics
#   - nginx-exporter: request rate, 5xx errors, connections
#   - blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks
#
# Alert rules:
#   - Service down (HTTP probe, TCP port, container missing)
#   - Container restart loops
#   - High CPU/memory/disk/load
#   - PostgreSQL down or high connections
#   - SSL cert expiring (14d warning, 3d critical)
#   - Slow HTTP responses, high 5xx rate
#
# Blackbox probes all public endpoints:
#   performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay
#
# Telegram alerts: critical=1h repeat, warning=6h repeat,
# auto-resolve notifications
#
# Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
83 lines
3 KiB
YAML
83 lines
3 KiB
YAML
# Server-wide defaults. Individual scrape jobs inherit these unless they
# set their own values.
global:
  # How often targets are scraped.
  scrape_interval: 30s
  # How often the rules loaded via rule_files are evaluated.
  evaluation_interval: 30s

# Rule files loaded at startup/reload. The path is as written in the
# original config (an absolute path under /etc/prometheus).
rule_files:
  - /etc/prometheus/alert_rules.yml

# Where firing alerts are delivered: a single, statically configured
# Alertmanager endpoint.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Scrape jobs. Every job uses static targets; none override the global
# scrape_interval.
scrape_configs:
  # ── Prometheus self-monitoring ──────────────────────────────────────
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]

  # ── Host OS metrics (node_exporter) ────────────────────────────────
  - job_name: node
    static_configs:
      - targets: ["node-exporter:9100"]

  # ── Docker container metrics (cAdvisor) ────────────────────────────
  - job_name: cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]

  # ── PostgreSQL (prod) ──────────────────────────────────────────────
  - job_name: postgres_prod
    static_configs:
      - targets: ["postgres-exporter:9187"]
        # Explicit instance label: series are tagged "prod" instead of
        # the exporter's host:port address.
        labels:
          instance: prod

  # ── nginx ──────────────────────────────────────────────────────────
  - job_name: nginx
    static_configs:
      - targets: ["nginx-exporter:9113"]

  # ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
  # The listed URLs are not scraped directly. Prometheus calls the
  # blackbox exporter's /probe endpoint and passes each URL as the
  # `target` query parameter (see relabel_configs below).
  - job_name: blackbox_http
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://performancewest.net
          - https://api.performancewest.net/api/v1/fcc/search?q=test
          - https://dev.performancewest.net
          - https://api.dev.performancewest.net/api/v1/fcc/search?q=test
          - https://crm.performancewest.net
          - https://lists.performancewest.net
          - https://analytics.performancewest.net
          - https://minio.performancewest.net/minio/health/live
          - https://crypto.performancewest.net
          - https://pay.performancewest.net
    relabel_configs:
      # 1. Copy the listed URL into the ?target= query parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # 2. Keep the probed URL as the instance label on resulting series.
      - source_labels: [__param_target]
        target_label: instance
      # 3. Point the actual scrape at the blackbox exporter.
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # ── Blackbox TCP probes (port monitoring) ──────────────────────────
  # Same indirection as blackbox_http, using the tcp_connect module
  # against host:port targets.
  - job_name: blackbox_tcp
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - api-postgres:5432
          - erpnext-mariadb:3306
          - erpnext-redis:6379
    relabel_configs:
      # 1. Copy the listed host:port into the ?target= query parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # 2. Keep the probed host:port as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # 3. Point the actual scrape at the blackbox exporter.
      - target_label: __address__
        replacement: blackbox-exporter:9115