Add Prometheus + Grafana + Alertmanager monitoring stack

Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints:
  performancewest.net, api, dev, crm, lists, analytics,
  minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat,
  auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-01 02:08:39 -05:00
parent 97e8664cbf
commit a4a5500bfc
13 changed files with 581 additions and 0 deletions

View file

@ -0,0 +1,4 @@
# Ansible configuration for this repository; paths are relative to this file.
[defaults]
# Directory searched for roles.
roles_path = ./roles
# Default inventory file.
inventory = ./inventory/hosts.yml
# NOTE(review): disables SSH host key verification, so a spoofed host would
# not be detected. Confirm this is intentional for production targets.
host_key_checking = False

View file

@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
shkeeper_admin_domain: crypto.performancewest.net
minio_domain: minio.performancewest.net
minio_console_domain: minio-console.performancewest.net
monitoring_domain: monitoring.performancewest.net
# Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
docserver_ip: 108.181.102.34

View file

@ -32,4 +32,5 @@
- worker-crons
- shkeeper
- nginx
- monitoring
- security-updates

View file

@ -0,0 +1,13 @@
---
# Default variables for the monitoring role.

# Hostname used for the monitoring TLS certificate.
monitoring_domain: monitoring.performancewest.net

# Port numbers for the monitoring services.
grafana_port: 3200
prometheus_port: 9090
alertmanager_port: 9093

# Telegram bot for alerts (set in vault). Both values fall back to an empty
# string when the vault variables are undefined.
telegram_bot_token: "{{ vault_telegram_bot_token | default('') }}"
telegram_chat_id: "{{ vault_telegram_chat_id | default('') }}"

# Grafana admin credentials (set in vault).
grafana_admin_user: "{{ vault_grafana_admin_user | default('admin') }}"
# Deliberately no fallback: a default password committed to the repository is
# a known credential. `mandatory` makes templating fail with a clear error if
# vault_grafana_admin_password is not defined.
grafana_admin_password: "{{ vault_grafana_admin_password | mandatory }}"

View file

@ -0,0 +1,5 @@
---
# Handlers for the monitoring role.

# Asks systemd to reload the nginx unit; runs when a task notifies
# "Reload nginx".
- name: Reload nginx
  ansible.builtin.systemd:
    state: reloaded
    name: nginx

View file

@ -0,0 +1,91 @@
---
# ══════════════════════════════════════════════════════════════════════════════
# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
# ══════════════════════════════════════════════════════════════════════════════
# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
# Writes a dedicated server block that exposes stub_status at /nginx_status.
# Access is limited to 127.0.0.1 and 172.16.0.0/12; all other clients are
# denied.
# NOTE(review): 172.16.0.0/12 is presumably the Docker bridge range used by
# the nginx-exporter container — confirm against the compose network.
- name: Enable nginx stub_status endpoint
  notify: Reload nginx
  ansible.builtin.copy:
    dest: /etc/nginx/conf.d/stub-status.conf
    owner: root
    group: root
    mode: "0644"
    content: |
      server {
          listen 80;
          server_name 127.0.0.1;
          location /nginx_status {
              stub_status;
              allow 127.0.0.1;
              allow 172.16.0.0/12;
              deny all;
          }
      }
# ── 2. Obtain TLS certificate ────────────────────────────────────────
# The certificate is requested BEFORE the TLS vhost is installed. The vhost
# references the certificate files, so enabling it first would leave nginx
# with an unloadable config in sites-enabled if certbot fails.
# "Reload nginx" is a handler and does not run between these tasks, so the
# nginx state seen by certbot is unchanged by this ordering.
- name: Check if monitoring cert exists
  ansible.builtin.stat:
    path: "/etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem"
  register: monitoring_cert

# Only runs when no certificate is present yet; uses the webroot challenge.
- name: Obtain Let's Encrypt cert for monitoring domain
  ansible.builtin.command:
    cmd: >-
      certbot certonly --webroot -w {{ certbot_webroot }}
      -d {{ monitoring_domain }}
      --non-interactive --agree-tos
      --email {{ certbot_email }}
  when: not monitoring_cert.stat.exists
  notify: Reload nginx

# ── 3. Deploy nginx config for monitoring.performancewest.net ────────
# Renders the Grafana reverse-proxy vhost from the nginx role's templates.
- name: Deploy Grafana nginx config
  ansible.builtin.template:
    src: ../../nginx/templates/pw-monitoring-tls.conf.j2
    dest: /etc/nginx/sites-available/pw-monitoring.conf
    owner: root
    group: root
    mode: "0644"
  notify: Reload nginx

# Activates the vhost by symlinking it into sites-enabled.
- name: Enable Grafana nginx config
  ansible.builtin.file:
    src: /etc/nginx/sites-available/pw-monitoring.conf
    dest: /etc/nginx/sites-enabled/pw-monitoring.conf
    state: link
  notify: Reload nginx
# ── 4. Set env vars for Telegram and Grafana in .env ─────────────────
# Each task replaces an existing KEY=... line in the project's .env file or
# appends one. no_log keeps the secret values out of Ansible output.

# Telegram settings are written only when a bot token is configured.
- name: Ensure Telegram vars in .env
  ansible.builtin.lineinfile:
    path: "{{ project_dir }}/.env"
    regexp: "^{{ item.key }}="
    line: "{{ item.key }}={{ item.value }}"
    state: present
  loop:
    - key: "TELEGRAM_BOT_TOKEN"
      value: "{{ telegram_bot_token }}"
    - key: "TELEGRAM_CHAT_ID"
      value: "{{ telegram_chat_id }}"
  when: telegram_bot_token != ""
  no_log: true

# Grafana credentials do not depend on Telegram, so they are written
# unconditionally rather than being skipped when the bot token is empty.
- name: Ensure Grafana admin credentials in .env
  ansible.builtin.lineinfile:
    path: "{{ project_dir }}/.env"
    regexp: "^{{ item.key }}="
    line: "{{ item.key }}={{ item.value }}"
    state: present
  loop:
    - key: "GRAFANA_ADMIN_USER"
      value: "{{ grafana_admin_user }}"
    - key: "GRAFANA_ADMIN_PASSWORD"
      value: "{{ grafana_admin_password }}"
  no_log: true
# ── 5. UFW rules ─────────────────────────────────────────────────────
# Adds an allow rule for TCP traffic to the Grafana port from 127.0.0.1.
# NOTE(review): this rule only allows loopback traffic; it does not by itself
# block other sources. Confirm the container port is not published on a
# public interface.
- name: Allow Grafana from localhost only
  community.general.ufw:
    comment: "Grafana (via nginx)"
    rule: allow
    proto: tcp
    port: "{{ grafana_port }}"
    from_ip: 127.0.0.1
# ── 6. Start monitoring stack ────────────────────────────────────────
# Brings up the monitoring services with docker compose from the project
# directory. Uses command + argv instead of shell: no shell features are
# needed, and chdir already sets the working directory, so the former
# `cd ... &&` prefix was redundant.
- name: Start monitoring containers
  ansible.builtin.command:
    argv:
      - docker
      - compose
      - up
      - "-d"
      - prometheus
      - grafana
      - alertmanager
      - node-exporter
      - cadvisor
      - postgres-exporter
      - nginx-exporter
      - blackbox-exporter
    chdir: "{{ project_dir }}"
  # Always reported as changed; the compose output is not inspected here.
  changed_when: true

View file

@ -0,0 +1,58 @@
# {{ ansible_managed }}
# HTTPS config for {{ monitoring_domain }} (Grafana)
#
# The hostname and upstream port come from the monitoring_domain and
# grafana_port variables, so this vhost stays in step with the certificate
# path and firewall rule that use the same variables.

# Redirect HTTP -> HTTPS, except ACME challenges, which are served from the
# certbot webroot.
server {
    listen 80;
    server_name {{ monitoring_domain }};

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }

    location / {
        return 301 https://{{ monitoring_domain }}$request_uri;
    }
}

# Grafana dashboard
server {
    listen 443 ssl;
    http2 on;
    server_name {{ monitoring_domain }};

    ssl_certificate /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
    include /etc/nginx/snippets/pw-security.conf;

    client_max_body_size 10m;

    # Reverse proxy to Grafana on the loopback interface.
    location / {
        proxy_pass http://127.0.0.1:{{ grafana_port }};
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # WebSocket for Grafana Live
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        proxy_connect_timeout 10s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }
}