Add Prometheus + Grafana + Alertmanager monitoring stack

Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate and connection counts (from stub_status; no per-status/5xx breakdown)
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints:
  performancewest.net, api, dev, crm, lists, analytics,
  minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat,
  auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-01 02:08:39 -05:00
parent 97e8664cbf
commit a4a5500bfc
13 changed files with 581 additions and 0 deletions

View file

@ -285,6 +285,103 @@ services:
- umami-pgdata:/var/lib/postgresql/data
restart: unless-stopped
# ── Monitoring Stack ────────────────────────────────────────────────
prometheus:
  # Metrics store and alert-rule evaluator. The web UI/API is published on the
  # host loopback interface only.
  image: prom/prometheus:latest
  restart: unless-stopped
  ports:
    - "127.0.0.1:9090:9090"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    # Keep 90 days of samples.
    - "--storage.tsdb.retention.time=90d"
    # Allows config reloads over HTTP instead of restarting the container.
    - "--web.enable-lifecycle"
  volumes:
    # Scrape config and alert rules are mounted read-only from the repo.
    - "./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro"
    # Named volume that persists the TSDB across container recreation.
    - "prometheus-data:/prometheus"
grafana:
  # Dashboard UI. Container port 3000 is published on host loopback port 3200.
  image: grafana/grafana:latest
  restart: unless-stopped
  depends_on:
    - prometheus
  ports:
    - "127.0.0.1:3200:3000"
  volumes:
    - "grafana-data:/var/lib/grafana"
    # Provisions the Prometheus datasource at startup.
    - "./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro"
  environment:
    # NOTE(review): when GRAFANA_ADMIN_PASSWORD is unset the admin password
    # falls back to the value written in this file — set it in .env.
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-pw_grafana_2026}"
    # Public URL/domain Grafana is served under.
    GF_SERVER_ROOT_URL: "https://monitoring.performancewest.net"
    GF_SERVER_DOMAIN: "monitoring.performancewest.net"
    # Outbound mail settings, taken from the shared SMTP_* variables.
    GF_SMTP_ENABLED: "true"
    GF_SMTP_HOST: "${SMTP_HOST}:${SMTP_PORT}"
    GF_SMTP_USER: "${SMTP_USER}"
    GF_SMTP_PASSWORD: "${SMTP_PASS}"
    GF_SMTP_FROM_ADDRESS: "noreply@performancewest.net"
    # No self-registration and no anonymous access.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_AUTH_ANONYMOUS_ENABLED: "false"
alertmanager:
  # Alert router (Telegram receiver). The web UI/API is published on the host
  # loopback interface only.
  image: prom/alertmanager:latest
  ports:
    - "127.0.0.1:9093:9093"
  volumes:
    - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  environment:
    - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
    - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
  # Alertmanager does not expand environment variables inside its config file,
  # so the ${TELEGRAM_BOT_TOKEN} / ${TELEGRAM_CHAT_ID} placeholders in the
  # mounted file would be read literally (and chat_id would fail to parse as an
  # integer). Render them into a private copy before starting the daemon.
  # `$$` is Compose's escape for a literal `$` passed through to the shell;
  # `[$$]` becomes the regex `[$]`, matching the placeholder's dollar sign.
  entrypoint:
    - /bin/sh
    - -c
    - |
      set -eu
      umask 077
      sed \
        -e "s|[$$]{TELEGRAM_BOT_TOKEN}|$${TELEGRAM_BOT_TOKEN}|g" \
        -e "s|[$$]{TELEGRAM_CHAT_ID}|$${TELEGRAM_CHAT_ID}|g" \
        /etc/alertmanager/alertmanager.yml > /alertmanager/alertmanager.rendered.yml
      exec /bin/alertmanager \
        --config.file=/alertmanager/alertmanager.rendered.yml \
        --storage.path=/alertmanager
  restart: unless-stopped
node-exporter:
  # Host OS metrics. The host root filesystem is mounted read-only at /host and
  # the container shares the host PID namespace.
  image: prom/node-exporter:latest
  restart: unless-stopped
  pid: host
  volumes:
    - "/:/host:ro,rslave"
  command:
    - "--path.rootfs=/host"
    # `$$` is Compose's escape for a literal `$` (regex end-of-string anchor).
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
cadvisor:
  # Per-container resource metrics. Runs privileged with read-only views of the
  # host filesystem, Docker state and sysfs.
  image: gcr.io/cadvisor/cadvisor:latest
  restart: unless-stopped
  privileged: true
  devices:
    - "/dev/kmsg"
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
postgres-exporter:
  # PostgreSQL metrics for the `performancewest` database on api-postgres.
  image: prometheuscommunity/postgres-exporter:latest
  restart: unless-stopped
  depends_on:
    - api-postgres
  environment:
    # NOTE(review): falls back to a password written in this file when
    # DB_PASSWORD is unset. TLS to the database is disabled (sslmode=disable).
    DATA_SOURCE_NAME: "postgresql://pw:${DB_PASSWORD:-pw_dev_2026}@api-postgres:5432/performancewest?sslmode=disable"
nginx-exporter:
  # Scrapes the host nginx stub_status page and re-exposes it as Prometheus
  # metrics. nginx runs on the host, so it is reached via the Docker host
  # gateway alias declared in extra_hosts.
  image: nginx/nginx-prometheus-exporter:latest
  command:
    # Exporter 1.x releases parse flags with a double dash; the single-dash
    # form used before is rejected, and `:latest` resolves to a 1.x image.
    - --nginx.scrape-uri=http://host.docker.internal:80/nginx_status
  extra_hosts:
    - "host.docker.internal:host-gateway"
  restart: unless-stopped
blackbox-exporter:
  # HTTP/TCP prober; probe modules come from the repo's blackbox.yml, mounted
  # read-only.
  image: prom/blackbox-exporter:latest
  restart: unless-stopped
  volumes:
    - "./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro"
volumes:
api-pgdata:
worker-data:
@ -297,3 +394,5 @@ volumes:
erpnext-mariadb-data:
listmonk-uploads:
umami-pgdata:
prometheus-data:
grafana-data:

View file

@ -0,0 +1,4 @@
[defaults]
# Inventory file and role search path (relative paths).
inventory = ./inventory/hosts.yml
roles_path = ./roles
# NOTE(review): SSH host key verification is turned off for every host.
host_key_checking = False

View file

@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
shkeeper_admin_domain: crypto.performancewest.net
minio_domain: minio.performancewest.net
minio_console_domain: minio-console.performancewest.net
monitoring_domain: monitoring.performancewest.net
# Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
docserver_ip: 108.181.102.34

View file

@ -32,4 +32,5 @@
- worker-crons
- shkeeper
- nginx
- monitoring
- security-updates

View file

@ -0,0 +1,13 @@
---
# Defaults for the monitoring role.

# Public hostname Grafana is served under.
monitoring_domain: monitoring.performancewest.net

# Ports of the monitoring services.
grafana_port: 3200
prometheus_port: 9090
alertmanager_port: 9093

# Telegram bot used for alert delivery; values come from the vault and default
# to an empty string when the vault variable is undefined.
telegram_bot_token: '{{ vault_telegram_bot_token | default("") }}'
telegram_chat_id: '{{ vault_telegram_chat_id | default("") }}'

# Grafana admin credentials; values come from the vault.
# NOTE(review): the password falls back to a value committed here when the
# vault variable is undefined.
grafana_admin_user: '{{ vault_grafana_admin_user | default("admin") }}'
grafana_admin_password: '{{ vault_grafana_admin_password | default("pw_grafana_2026") }}'

View file

@ -0,0 +1,5 @@
---
# Handlers for the monitoring role.

# Asks systemd to reload (not restart) the nginx unit.
- name: Reload nginx
  ansible.builtin.systemd:
    state: reloaded
    name: nginx

View file

@ -0,0 +1,91 @@
---
# ══════════════════════════════════════════════════════════════════════════════
# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
# ══════════════════════════════════════════════════════════════════════════════
# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
# Writes a small vhost exposing /nginx_status to localhost and the
# 172.16.0.0/12 range only. The exporter container requests
# http://host.docker.internal:80/nginx_status, i.e. with the Host header
# "host.docker.internal", so that name is listed in server_name; with only
# 127.0.0.1 listed the request reaches this vhost only if it happens to be the
# default server for port 80.
- name: Enable nginx stub_status endpoint
  ansible.builtin.copy:
    content: |
      server {
          listen 80;
          server_name 127.0.0.1 host.docker.internal;
          location /nginx_status {
              stub_status;
              allow 127.0.0.1;
              allow 172.16.0.0/12;
              deny all;
          }
      }
    dest: /etc/nginx/conf.d/stub-status.conf
    owner: root
    group: root
    mode: "0644"
  notify: Reload nginx
# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
# Renders the TLS vhost template into sites-available ...
- name: Deploy Grafana nginx config
  ansible.builtin.template:
    dest: /etc/nginx/sites-available/pw-monitoring.conf
    src: ../../nginx/templates/pw-monitoring-tls.conf.j2
    mode: "0644"
    owner: root
    group: root
  notify: Reload nginx

# ... and activates it with a symlink in sites-enabled.
- name: Enable Grafana nginx config
  ansible.builtin.file:
    state: link
    dest: /etc/nginx/sites-enabled/pw-monitoring.conf
    src: /etc/nginx/sites-available/pw-monitoring.conf
  notify: Reload nginx
# ── 3. Obtain TLS certificate ────────────────────────────────────────
# Look for an existing certificate chain for the monitoring domain.
- name: Check if monitoring cert exists
  ansible.builtin.stat:
    path: /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem
  register: monitoring_cert

# Request a certificate via the webroot challenge only when none was found.
- name: Obtain Let's Encrypt cert for monitoring domain
  ansible.builtin.command:
    cmd: >
      certbot certonly
      --webroot -w {{ certbot_webroot }}
      -d {{ monitoring_domain }}
      --non-interactive
      --agree-tos
      --email {{ certbot_email }}
  when: not monitoring_cert.stat.exists
  notify: Reload nginx
# ── 4. Set env vars for Telegram in .env ─────────────────────────────
# Upserts one KEY=value line per entry in the project's .env file.
# The skip condition is evaluated per item: an entry with an empty value is
# left alone, but it no longer prevents the remaining entries from being
# written (previously an empty Telegram token also skipped the Grafana
# credentials). Values are secrets, hence no_log.
- name: Ensure Telegram vars in .env
  ansible.builtin.lineinfile:
    path: "{{ project_dir }}/.env"
    regexp: "^{{ item.key }}="
    line: "{{ item.key }}={{ item.value }}"
    state: present
  loop:
    - { key: "TELEGRAM_BOT_TOKEN", value: "{{ telegram_bot_token }}" }
    - { key: "TELEGRAM_CHAT_ID", value: "{{ telegram_chat_id }}" }
    - { key: "GRAFANA_ADMIN_USER", value: "{{ grafana_admin_user }}" }
    - { key: "GRAFANA_ADMIN_PASSWORD", value: "{{ grafana_admin_password }}" }
  when: item.value | string | length > 0
  no_log: true
# ── 5. UFW rules ─────────────────────────────────────────────────────
# Permit TCP traffic to the Grafana port from 127.0.0.1 only.
- name: Allow Grafana from localhost only
  community.general.ufw:
    comment: "Grafana (via nginx)"
    rule: allow
    proto: tcp
    from_ip: 127.0.0.1
    port: "{{ grafana_port }}"
# ── 6. Start monitoring stack ────────────────────────────────────────
# Brings up (or updates) the monitoring services from the project's compose
# file. Uses the command module with argv: no shell features are needed, and
# chdir already sets the working directory, so the former `cd ... &&` prefix
# was redundant.
- name: Start monitoring containers
  ansible.builtin.command:
    argv:
      - docker
      - compose
      - up
      - "-d"
      - prometheus
      - grafana
      - alertmanager
      - node-exporter
      - cadvisor
      - postgres-exporter
      - nginx-exporter
      - blackbox-exporter
    chdir: "{{ project_dir }}"
  # The command's output is not inspected, so the task always reports "changed".
  changed_when: true

View file

@ -0,0 +1,58 @@
# {{ ansible_managed }}
# HTTPS config for {{ monitoring_domain }} (Grafana)
#
# The hostname and upstream port come from the role variables
# `monitoring_domain` and `grafana_port`, so the vhost, the certificate paths
# and the certificate requested by the role always refer to the same name.

# Redirect HTTP -> HTTPS (ACME challenges are still answered over HTTP)
server {
    listen 80;
    server_name {{ monitoring_domain }};

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }

    location / {
        return 301 https://{{ monitoring_domain }}$request_uri;
    }
}

# Grafana dashboard
server {
    listen 443 ssl;
    http2 on;
    server_name {{ monitoring_domain }};

    ssl_certificate /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
    include /etc/nginx/snippets/pw-security.conf;

    client_max_body_size 10m;

    location / {
        # Grafana is published on the host loopback interface only.
        proxy_pass http://127.0.0.1:{{ grafana_port }};
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # WebSocket for Grafana Live
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        proxy_connect_timeout 10s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }
}

162
monitoring/alert_rules.yml Normal file
View file

@ -0,0 +1,162 @@
groups:
# ══════════════════════════════════════════════════════════════════════
# Service Down Alerts
# ══════════════════════════════════════════════════════════════════════
- name: service_down
  rules:
    # Blackbox HTTP probe has been failing for 2 minutes.
    - alert: EndpointDown
      expr: probe_success{job="blackbox_http"} == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.instance }} is DOWN"
        description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."

    # Blackbox TCP connect probe has been failing for 1 minute.
    - alert: TCPPortDown
      expr: probe_success{job="blackbox_tcp"} == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "TCP port {{ $labels.instance }} is DOWN"
        description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."

    # A listed container has not been seen for over 60s.
    # NOTE(review): absent() with a regex matcher only returns a result when
    # NO listed container has a series at all; a single missing container is
    # caught by the second clause only while its stale series still exists.
    - alert: ContainerDown
      expr: |
        absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
        or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Container {{ $labels.name }} is DOWN"
        description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."

    # container_start_time_seconds is a gauge holding a Unix timestamp, so
    # increase() on it measures how far the timestamp moved (seconds), not how
    # often the container restarted. changes() counts value changes, i.e.
    # restarts, which is what the threshold and description refer to.
    - alert: ContainerRestarting
      expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container {{ $labels.name }} is restart-looping"
        description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
# ══════════════════════════════════════════════════════════════════════
# Host Resource Alerts
# ══════════════════════════════════════════════════════════════════════
- name: host_resources
  rules:
    # Non-idle CPU share, averaged per instance over 5 minutes.
    - alert: HighCPU
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
        description: "CPU usage has been above 85% for 10 minutes."

    # node_exporter names these metrics node_memory_MemAvailable_bytes and
    # node_memory_MemTotal_bytes; the previous names matched no series, so the
    # alert could never fire.
    - alert: HighMemory
      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
        description: "Memory usage has been above 90% for 5 minutes."

    # Root filesystem usage: warning above 80%, critical above 92%.
    - alert: DiskSpaceLow
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
        description: "Root filesystem is more than 80% full."

    - alert: DiskSpaceCritical
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
        description: "Root filesystem is more than 92% full. Immediate action required."

    # 15-minute load average; the threshold is absolute, not per-core.
    - alert: HighLoadAverage
      expr: node_load15 > 8
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "High load average ({{ $value | printf \"%.1f\" }})"
        description: "15-minute load average has been above 8 for 10 minutes."
# ══════════════════════════════════════════════════════════════════════
# Database Alerts
# ══════════════════════════════════════════════════════════════════════
- name: database
  rules:
    # The exporter is running but cannot reach PostgreSQL.
    # NOTE(review): this does not fire when the exporter itself is down
    # (no pg_up series exists then).
    - alert: PostgresDown
      expr: pg_up == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "PostgreSQL is DOWN"
        description: "PostgreSQL exporter cannot connect to the database."

    # pg_stat_activity_count is exported as several series per server (split
    # by database and connection state), so the threshold has to be applied to
    # their sum; comparing each series on its own under-counts connections.
    - alert: PostgresHighConnections
      expr: sum by (instance) (pg_stat_activity_count) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High PostgreSQL connections ({{ $value }})"
        description: "PostgreSQL active connections exceeding 80."
# ══════════════════════════════════════════════════════════════════════
# SSL Certificate Alerts
# ══════════════════════════════════════════════════════════════════════
# $value is the number of seconds until the earliest certificate in the probed
# chain expires. Summaries spell out "under" rather than using a raw "<":
# annotation text ends up in chat messages that may be sent as HTML, where an
# unescaped "<" can be rejected as malformed markup.
- name: ssl
  rules:
    - alert: SSLCertExpiringSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "SSL cert expiring in under 14 days for {{ $labels.instance }}"
        description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."

    - alert: SSLCertExpiryCritical
      expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: "SSL cert expiring in under 3 days for {{ $labels.instance }}"
        description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."
# ══════════════════════════════════════════════════════════════════════
# Response Time Alerts
# ══════════════════════════════════════════════════════════════════════
- name: latency
  rules:
    # Uses the total probe duration. The previous expression looked only at
    # the "transfer" phase of probe_http_duration_seconds, which excludes
    # connect, TLS and server processing time — a slow backend would not have
    # triggered it.
    - alert: SlowHTTPResponse
      expr: probe_duration_seconds{job="blackbox_http"} > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
        description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."

    # NOTE(review): an exporter fed by nginx stub_status reports only a total
    # request counter without a `status` label, so this selector is expected
    # to match nothing — confirm, and source 5xx counts from access logs or
    # another exporter if this alert is needed.
    - alert: HighNginx5xxRate
      expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High nginx 5xx error rate"
        description: "More than 0.5 req/s returning 5xx errors."

View file

@ -0,0 +1,40 @@
# Alertmanager routing: everything goes to the Telegram receiver; severity only
# changes how often a still-firing alert is re-sent.
global:
  resolve_timeout: 5m

route:
  receiver: telegram
  group_by:
    - alertname
    - instance
  group_wait: 30s
  group_interval: 5m
  # Fallback for alerts that match neither child route.
  repeat_interval: 4h
  routes:
    - receiver: telegram
      matchers:
        - 'severity="critical"'
      repeat_interval: 1h
    - receiver: telegram
      matchers:
        - 'severity="warning"'
      repeat_interval: 6h

receivers:
  - name: telegram
    telegram_configs:
      # NOTE(review): Alertmanager does not expand ${VAR} in this file itself;
      # the two placeholders below must be substituted before the file is
      # loaded, otherwise they are read literally.
      - bot_token: "${TELEGRAM_BOT_TOKEN}"
        chat_id: ${TELEGRAM_CHAT_ID}
        parse_mode: HTML
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .Status | toUpper }}</b>
          {{ range .Alerts }}
          <b>{{ .Labels.alertname }}</b>
          {{ .Annotations.summary }}
          {{ if .Annotations.description }}<i>{{ .Annotations.description }}</i>{{ end }}
          {{ end }}
          <code>Server: pw-server | {{ .ExternalURL }}</code>

# A firing critical alert mutes a warning alert that carries the same
# alertname and instance labels.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal:
      - alertname
      - instance

15
monitoring/blackbox.yml Normal file
View file

@ -0,0 +1,15 @@
# Probe modules referenced by name from the Prometheus scrape config.
modules:
  # HTTP(S) probe over IPv4; redirects are followed and certificates verified.
  http_2xx:
    prober: http
    timeout: 10s
    http:
      preferred_ip_protocol: ip4
      follow_redirects: true
      valid_http_versions:
        - "HTTP/1.1"
        - "HTTP/2.0"
      valid_status_codes:
        - 200
        - 301
        - 302
      tls_config:
        insecure_skip_verify: false

  # Plain TCP connect probe.
  tcp_connect:
    prober: tcp
    timeout: 5s

View file

@ -0,0 +1,9 @@
# Grafana datasource provisioning: a single, read-only default datasource
# pointing at the Prometheus service, queried through the Grafana backend.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    url: "http://prometheus:9090"
    access: proxy
    isDefault: true
    editable: false

83
monitoring/prometheus.yml Normal file
View file

@ -0,0 +1,83 @@
# Scrape targets and evaluate rules every 30 seconds.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

# Alerting rules loaded at startup/reload.
rule_files:
  - "/etc/prometheus/alert_rules.yml"

# Firing alerts are pushed to the Alertmanager service.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
scrape_configs:
# ── Prometheus self-monitoring ──────────────────────────────────────
- job_name: prometheus
  static_configs:
    - targets:
        - "localhost:9090"
# ── Host OS metrics (node_exporter) ────────────────────────────────
- job_name: node
  static_configs:
    - targets:
        - "node-exporter:9100"
# ── Docker container metrics (cAdvisor) ────────────────────────────
- job_name: cadvisor
  static_configs:
    - targets:
        - "cadvisor:8080"
# ── PostgreSQL (prod) ──────────────────────────────────────────────
# The instance label is overridden with a fixed name.
- job_name: postgres_prod
  static_configs:
    - targets:
        - "postgres-exporter:9187"
      labels:
        instance: prod
# ── nginx ──────────────────────────────────────────────────────────
- job_name: nginx
  static_configs:
    - targets:
        - "nginx-exporter:9113"
# ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
# Each listed URL is not scraped directly: the relabeling below hands it to the
# blackbox exporter as the `target` query parameter, keeps it as the `instance`
# label, and points the actual scrape at the exporter.
- job_name: blackbox_http
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - "https://performancewest.net"
        - "https://api.performancewest.net/api/v1/fcc/search?q=test"
        - "https://dev.performancewest.net"
        - "https://api.dev.performancewest.net/api/v1/fcc/search?q=test"
        - "https://crm.performancewest.net"
        - "https://lists.performancewest.net"
        - "https://analytics.performancewest.net"
        - "https://minio.performancewest.net/minio/health/live"
        - "https://crypto.performancewest.net"
        - "https://pay.performancewest.net"
  relabel_configs:
    # URL -> ?target=<URL>
    - source_labels: ["__address__"]
      target_label: "__param_target"
    # ?target=<URL> -> instance label
    - source_labels: ["__param_target"]
      target_label: "instance"
    # Scrape the exporter instead of the URL itself.
    - target_label: "__address__"
      replacement: "blackbox-exporter:9115"
# ── Blackbox TCP probes (port monitoring) ──────────────────────────
# Same relabeling scheme as for HTTP probing: the host:port pair becomes the
# `target` parameter and the `instance` label, and the scrape itself goes to
# the blackbox exporter.
- job_name: blackbox_tcp
  metrics_path: /probe
  params:
    module:
      - tcp_connect
  static_configs:
    - targets:
        - "api-postgres:5432"
        - "erpnext-mariadb:3306"
        - "erpnext-redis:6379"
  relabel_configs:
    - source_labels: ["__address__"]
      target_label: "__param_target"
    - source_labels: ["__param_target"]
      target_label: "instance"
    - target_label: "__address__"
      replacement: "blackbox-exporter:9115"