Add Prometheus + Grafana + Alertmanager monitoring stack
Full observability stack with Telegram alerting.

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
97e8664cbf
commit
a4a5500bfc
13 changed files with 581 additions and 0 deletions
|
|
@ -285,6 +285,103 @@ services:
|
|||
- umami-pgdata:/var/lib/postgresql/data
|
||||
restart: unless-stopped
|
||||
|
||||
# ── Monitoring Stack ────────────────────────────────────────────────
|
||||
# Prometheus server. The scrape configuration and alert rules are mounted
# read-only from ./monitoring; the prometheus-data volume is mounted at
# /prometheus.
prometheus:
  image: prom/prometheus:latest
  restart: unless-stopped
  command:
    - --config.file=/etc/prometheus/prometheus.yml
    # Keep 90 days of samples.
    - --storage.tsdb.retention.time=90d
    # Turns on the HTTP lifecycle endpoints (config reload / shutdown).
    - --web.enable-lifecycle
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:9090:9090"
  volumes:
    - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
    - prometheus-data:/prometheus
|
||||
|
||||
# Grafana dashboards, served externally as
# https://monitoring.performancewest.net and published locally on
# 127.0.0.1:3200. Starts after the prometheus service.
grafana:
  image: grafana/grafana:latest
  ports:
    - "127.0.0.1:3200:3000"
  environment:
    - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
    # No fallback value on purpose: a default admin password committed to the
    # repository is a known credential. With the `:?` form Compose refuses to
    # start until GRAFANA_ADMIN_PASSWORD is provided (e.g. via .env).
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
    - GF_SERVER_ROOT_URL=https://monitoring.performancewest.net
    - GF_SERVER_DOMAIN=monitoring.performancewest.net
    # Outgoing mail; host, port and credentials come from the environment.
    - GF_SMTP_ENABLED=true
    - GF_SMTP_HOST=${SMTP_HOST}:${SMTP_PORT}
    - GF_SMTP_USER=${SMTP_USER}
    - GF_SMTP_PASSWORD=${SMTP_PASS}
    - GF_SMTP_FROM_ADDRESS=noreply@performancewest.net
    # Self-registration and anonymous access are both switched off.
    - GF_USERS_ALLOW_SIGN_UP=false
    - GF_AUTH_ANONYMOUS_ENABLED=false
  volumes:
    - grafana-data:/var/lib/grafana
    # Datasource provisioning file, mounted read-only.
    - ./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
  depends_on:
    - prometheus
  restart: unless-stopped
|
||||
|
||||
# Alertmanager, published on 127.0.0.1:9093. Its configuration is mounted
# read-only from ./monitoring/alertmanager.yml.
alertmanager:
  image: prom/alertmanager:latest
  restart: unless-stopped
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    # NOTE(review): this service mounts no named volume at /alertmanager, so
    # state written here (silences, notification log) presumably does not
    # survive container re-creation - confirm whether that is intended.
    - --storage.path=/alertmanager
  environment:
    # NOTE(review): confirm how alertmanager.yml consumes these two values;
    # Alertmanager itself is not known to expand environment variables
    # inside its configuration file.
    - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
    - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
  ports:
    - "127.0.0.1:9093:9093"
  volumes:
    - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
|
||||
# Host (OS-level) metrics exporter. The host root filesystem is mounted
# read-only at /host and the container shares the host PID namespace.
node-exporter:
  image: prom/node-exporter:latest
  restart: unless-stopped
  pid: host
  volumes:
    - /:/host:ro,rslave
  command:
    - --path.rootfs=/host
    # `$$` is Compose's escape for a literal `$`, so the exporter receives
    # the regex ^/(sys|proc|dev|host|etc)($|/).
    - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
|
||||
|
||||
# cAdvisor container metrics. Runs privileged with read-only views of the
# host root, /var/run, /sys, the Docker data directory and /dev/disk, plus
# access to the /dev/kmsg device.
cadvisor:
  image: gcr.io/cadvisor/cadvisor:latest
  restart: unless-stopped
  privileged: true
  devices:
    - /dev/kmsg
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /dev/disk/:/dev/disk:ro
|
||||
|
||||
# PostgreSQL metrics exporter for the `performancewest` database on the
# api-postgres service.
postgres-exporter:
  image: prometheuscommunity/postgres-exporter:latest
  environment:
    # Host/database and credentials are passed separately instead of as one
    # DATA_SOURCE_NAME URL: the exporter assembles and escapes the DSN
    # itself, so a password containing URL-reserved characters
    # (@ / : # ?) no longer corrupts the connection string.
    - DATA_SOURCE_URI=api-postgres:5432/performancewest?sslmode=disable
    - DATA_SOURCE_USER=pw
    # NOTE(review): the fallback is kept because the api-postgres service
    # presumably uses the same default - confirm, and prefer requiring
    # DB_PASSWORD in both places for production.
    - DATA_SOURCE_PASS=${DB_PASSWORD:-pw_dev_2026}
  depends_on:
    - api-postgres
  restart: unless-stopped
|
||||
|
||||
# NGINX metrics exporter. Scrapes the status page of the NGINX instance
# listening on the Docker host's port 80.
nginx-exporter:
  image: nginx/nginx-prometheus-exporter:latest
  command:
    # Double-dash form: exporter releases from 1.0 on parse flags with
    # kingpin and reject the single-dash long flag, while older releases
    # accept both spellings.
    - --nginx.scrape-uri=http://host.docker.internal:80/nginx_status
  extra_hosts:
    # Maps host.docker.internal to the host gateway so the container can
    # reach the host's NGINX.
    - "host.docker.internal:host-gateway"
  restart: unless-stopped
|
||||
|
||||
# Blackbox exporter for endpoint probing. Its module configuration is
# mounted read-only from ./monitoring/blackbox.yml.
blackbox-exporter:
  image: prom/blackbox-exporter:latest
  restart: unless-stopped
  volumes:
    - ./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
|
||||
|
||||
volumes:
|
||||
api-pgdata:
|
||||
worker-data:
|
||||
|
|
@ -297,3 +394,5 @@ volumes:
|
|||
erpnext-mariadb-data:
|
||||
listmonk-uploads:
|
||||
umami-pgdata:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue