Add Prometheus + Grafana + Alertmanager monitoring stack
Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
97e8664cbf
commit
a4a5500bfc
13 changed files with 581 additions and 0 deletions
4
infra/ansible/ansible.cfg
Normal file
4
infra/ansible/ansible.cfg
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
[defaults]
# Inventory file and role search path used by this project.
inventory = ./inventory/hosts.yml
roles_path = ./roles

# NOTE(review): SSH host key verification is switched off here, so a changed
# or spoofed host key will not stop a run. Confirm this is intentional, or
# pre-seed known_hosts and enable checking.
host_key_checking = False
|
||||
|
|
@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
|
|||
shkeeper_admin_domain: crypto.performancewest.net
|
||||
minio_domain: minio.performancewest.net
|
||||
minio_console_domain: minio-console.performancewest.net
|
||||
monitoring_domain: monitoring.performancewest.net
|
||||
# Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
|
||||
docserver_ip: 108.181.102.34
|
||||
|
||||
|
|
|
|||
|
|
@ -32,4 +32,5 @@
|
|||
- worker-crons
|
||||
- shkeeper
|
||||
- nginx
|
||||
- monitoring
|
||||
- security-updates
|
||||
|
|
|
|||
13
infra/ansible/roles/monitoring/defaults/main.yml
Normal file
13
infra/ansible/roles/monitoring/defaults/main.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
---
# Default variables for the monitoring role.

# Hostname used for the Grafana TLS certificate lookup/issuance.
monitoring_domain: monitoring.performancewest.net

# Service ports. grafana_port is the one opened in UFW for local access.
grafana_port: 3200
prometheus_port: 9090
alertmanager_port: 9093

# Telegram bot for alerts (set in vault). Both fall back to an empty string
# when the vault variables are not defined.
telegram_bot_token: "{{ vault_telegram_bot_token | default('') }}"
telegram_chat_id: "{{ vault_telegram_chat_id | default('') }}"

# Grafana admin credentials (set in vault).
# The password deliberately has no fallback: a default committed to the
# repository is a known credential. Templating fails with the message below
# if the vault variable is missing.
grafana_admin_user: "{{ vault_grafana_admin_user | default('admin') }}"
grafana_admin_password: "{{ vault_grafana_admin_password | mandatory(msg='vault_grafana_admin_password must be set in the vault') }}"
|
||||
5
infra/ansible/roles/monitoring/handlers/main.yml
Normal file
5
infra/ansible/roles/monitoring/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
# Handlers for the monitoring role.

# Asks systemd to reload the nginx unit. Notified by the tasks in this role
# that write nginx configuration or obtain the certificate.
- name: Reload nginx
  ansible.builtin.systemd:
    state: reloaded
    name: nginx
|
||||
91
infra/ansible/roles/monitoring/tasks/main.yml
Normal file
91
infra/ansible/roles/monitoring/tasks/main.yml
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
---
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
|
||||
# Drops a dedicated server block that exposes nginx's stub_status page at
# /nginx_status. Access is limited to 127.0.0.1 and 172.16.0.0/12 (presumably
# the Docker bridge range the exporter connects from - confirm); every other
# client is denied.
- name: Enable nginx stub_status endpoint
  ansible.builtin.copy:
    dest: /etc/nginx/conf.d/stub-status.conf
    owner: root
    group: root
    mode: "0644"
    content: |
      server {
          listen 80;
          server_name 127.0.0.1;
          location /nginx_status {
              stub_status;
              allow 127.0.0.1;
              allow 172.16.0.0/12;
              deny all;
          }
      }
  notify: Reload nginx
|
||||
|
||||
# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
|
||||
# Renders the Grafana vhost into sites-available. The template is referenced
# by relative path from the nginx role's templates directory.
# NOTE(review): this task runs before the certificate task further down, so on
# a first run the rendered config may reference certificate files that do not
# exist yet - confirm the reload ordering is safe.
- name: Deploy Grafana nginx config
  ansible.builtin.template:
    dest: /etc/nginx/sites-available/pw-monitoring.conf
    src: ../../nginx/templates/pw-monitoring-tls.conf.j2
    mode: "0644"
    owner: root
    group: root
  notify: Reload nginx
|
||||
|
||||
# Activates the vhost by symlinking it from sites-available into sites-enabled.
- name: Enable Grafana nginx config
  ansible.builtin.file:
    state: link
    dest: /etc/nginx/sites-enabled/pw-monitoring.conf
    src: /etc/nginx/sites-available/pw-monitoring.conf
  notify: Reload nginx
|
||||
|
||||
# ── 3. Obtain TLS certificate ────────────────────────────────────────
|
||||
# Records whether a certificate chain is already present for the monitoring
# domain; the result decides whether certbot runs in the next task.
- name: Check if monitoring cert exists
  ansible.builtin.stat:
    path: "/etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem"
  register: monitoring_cert
|
||||
|
||||
# Requests a certificate through certbot's webroot plugin, only when the
# preceding stat found no fullchain.pem for the domain.
# certbot_webroot and certbot_email are not defined in this role; they are
# expected to come from elsewhere in the inventory/roles.
- name: Obtain Let's Encrypt cert for monitoring domain
  when: not monitoring_cert.stat.exists
  ansible.builtin.command:
    cmd: >
      certbot certonly
      --webroot -w {{ certbot_webroot }}
      -d {{ monitoring_domain }}
      --non-interactive
      --agree-tos
      --email {{ certbot_email }}
  notify: Reload nginx
|
||||
|
||||
# ── 4. Set env vars for Telegram in .env ─────────────────────────────
|
||||
# Writes the monitoring-related KEY=value lines into the project's .env file,
# replacing any existing line for the same key.
#
# The Telegram entries are only written when a bot token is configured. The
# Grafana admin credentials are written regardless: previously a task-level
# `when` on the Telegram token also skipped them, so Grafana credentials never
# reached .env on hosts without Telegram alerting.
#
# no_log keeps the token and password out of the Ansible output.
- name: Ensure Telegram vars in .env
  ansible.builtin.lineinfile:
    path: "{{ project_dir }}/.env"
    regexp: "^{{ item.key }}="
    line: "{{ item.key }}={{ item.value }}"
    state: present
  loop:
    - key: TELEGRAM_BOT_TOKEN
      value: "{{ telegram_bot_token }}"
    - key: TELEGRAM_CHAT_ID
      value: "{{ telegram_chat_id }}"
    - key: GRAFANA_ADMIN_USER
      value: "{{ grafana_admin_user }}"
    - key: GRAFANA_ADMIN_PASSWORD
      value: "{{ grafana_admin_password }}"
  # Evaluated per loop item: only TELEGRAM_* keys depend on the bot token.
  when: telegram_bot_token != '' or not item.key.startswith('TELEGRAM_')
  no_log: true
|
||||
|
||||
# ── 5. UFW rules ─────────────────────────────────────────────────────
|
||||
# Adds a UFW allow rule for TCP traffic from 127.0.0.1 to the Grafana port.
# NOTE(review): no matching deny rule is added here, so "localhost only"
# relies on the firewall's default incoming policy - confirm.
- name: Allow Grafana from localhost only
  community.general.ufw:
    comment: "Grafana (via nginx)"
    rule: allow
    from_ip: 127.0.0.1
    proto: tcp
    port: "{{ grafana_port }}"
|
||||
|
||||
# ── 6. Start monitoring stack ────────────────────────────────────────
|
||||
# Brings up the monitoring services with Docker Compose from the project
# directory.
#
# Runs through the command module with an explicit argv: nothing here needs a
# shell, and `chdir` already sets the working directory, so the former
# `cd ... &&` prefix was redundant.
- name: Start monitoring containers
  ansible.builtin.command:
    argv:
      - docker
      - compose
      - up
      - "-d"
      - prometheus
      - grafana
      - alertmanager
      - node-exporter
      - cadvisor
      - postgres-exporter
      - nginx-exporter
      - blackbox-exporter
    chdir: "{{ project_dir }}"
  # The task always reports "changed", whether or not any container was
  # actually created or restarted.
  changed_when: true
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
# {{ ansible_managed }}
# HTTPS config for {{ monitoring_domain }} (Grafana)
#
# The hostname and upstream port come from the monitoring role variables
# (monitoring_domain, grafana_port), so this file stays in step with the
# certificate path the role checks and the port it opens in UFW.

# Redirect HTTP -> HTTPS
server {
    listen 80;
    server_name {{ monitoring_domain }};

    # Served over plain HTTP so certbot's webroot challenge can complete.
    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }

    location / {
        return 301 https://{{ monitoring_domain }}$request_uri;
    }
}

# Grafana dashboard
server {
    listen 443 ssl;
    http2 on;
    server_name {{ monitoring_domain }};

    ssl_certificate /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;

    include /etc/nginx/snippets/pw-security.conf;

    client_max_body_size 10m;

    location / {
        # Grafana is reached on the loopback interface only.
        proxy_pass http://127.0.0.1:{{ grafana_port }};
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # WebSocket for Grafana Live
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        proxy_connect_timeout 10s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue