Add Prometheus + Grafana + Alertmanager monitoring stack
Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints: performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat, auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
97e8664cbf
commit
a4a5500bfc
13 changed files with 581 additions and 0 deletions
|
|
@ -285,6 +285,103 @@ services:
|
|||
- umami-pgdata:/var/lib/postgresql/data
|
||||
restart: unless-stopped
|
||||
|
||||
# ── Monitoring Stack ────────────────────────────────────────────────
|
||||
prometheus:
  # Prometheus server: loads the scrape config and alert rules mounted below.
  image: "prom/prometheus:latest"
  restart: unless-stopped
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    # Retain 90 days of samples.
    - "--storage.tsdb.retention.time=90d"
    # Turns on the HTTP lifecycle endpoints (e.g. config reload).
    - "--web.enable-lifecycle"
  ports:
    # Published on the host loopback interface only.
    - "127.0.0.1:9090:9090"
  volumes:
    - "./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    - "./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro"
    - "prometheus-data:/prometheus"
|
||||
|
||||
grafana:
  # Grafana UI; host port 3200 maps to the container's port 3000.
  image: "grafana/grafana:latest"
  restart: unless-stopped
  depends_on:
    - prometheus
  ports:
    # Loopback only on the host side.
    - "127.0.0.1:3200:3000"
  volumes:
    - "grafana-data:/var/lib/grafana"
    # Provisioned datasource definition, mounted read-only.
    - "./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro"
  environment:
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    # NOTE(review): when GRAFANA_ADMIN_PASSWORD is unset this falls back to a
    # password literal committed in this file - consider requiring the variable.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-pw_grafana_2026}"
    GF_SERVER_ROOT_URL: "https://monitoring.performancewest.net"
    GF_SERVER_DOMAIN: "monitoring.performancewest.net"
    # SMTP settings are taken from the SMTP_* variables of the Compose
    # environment; there is no fallback if they are unset.
    GF_SMTP_ENABLED: "true"
    GF_SMTP_HOST: "${SMTP_HOST}:${SMTP_PORT}"
    GF_SMTP_USER: "${SMTP_USER}"
    GF_SMTP_PASSWORD: "${SMTP_PASS}"
    GF_SMTP_FROM_ADDRESS: "noreply@performancewest.net"
    # No self-registration and no anonymous access.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_AUTH_ANONYMOUS_ENABLED: "false"
|
||||
|
||||
alertmanager:
  # Alertmanager does not expand ${VAR} references inside its configuration
  # file, so passing TELEGRAM_* as container environment alone has no effect
  # on the config. The repo file is therefore mounted as a template and the
  # two placeholders are substituted by a small shell wrapper before the real
  # binary is exec'd.
  image: prom/alertmanager:latest
  ports:
    # Loopback only on the host side.
    - "127.0.0.1:9093:9093"
  volumes:
    - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml.tmpl:ro
  environment:
    - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
    - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
  # `$$` is Compose's escape for a literal `$`, so the script below sees
  # `[$]{NAME}` (regex for the literal placeholder) and `${NAME}` (shell
  # expansion of the container environment variable).
  # NOTE(review): assumes the image ships /bin/sh and sed, that
  # /etc/alertmanager is writable by the container user, and that the binary
  # lives at /bin/alertmanager - confirm against the image in use.
  entrypoint:
    - /bin/sh
    - "-c"
    - |
      set -eu
      sed -e "s|[$$]{TELEGRAM_BOT_TOKEN}|$${TELEGRAM_BOT_TOKEN}|g" \
          -e "s|[$$]{TELEGRAM_CHAT_ID}|$${TELEGRAM_CHAT_ID}|g" \
          /etc/alertmanager/alertmanager.yml.tmpl \
          > /etc/alertmanager/alertmanager.yml
      exec /bin/alertmanager "$$@"
    # Becomes $0 of the wrapper; the `command` items below become "$@".
    - alertmanager
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    - --storage.path=/alertmanager
  restart: unless-stopped
|
||||
|
||||
node-exporter:
  # Host metrics exporter; reads the host filesystem through the read-only
  # /host mount and shares the host PID namespace.
  image: "prom/node-exporter:latest"
  restart: unless-stopped
  pid: host
  volumes:
    - "/:/host:ro,rslave"
  command:
    - "--path.rootfs=/host"
    # `$$` is Compose's escape for a literal `$` (end-of-string anchor in the
    # regex); mount points under these prefixes are excluded.
    - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
|
||||
cadvisor:
  # Container metrics exporter. Runs privileged with read-only views of the
  # host root, /var/run, /sys, the Docker data directory and /dev/disk.
  image: "gcr.io/cadvisor/cadvisor:latest"
  restart: unless-stopped
  privileged: true
  devices:
    - "/dev/kmsg"
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
|
||||
|
||||
postgres-exporter:
  # PostgreSQL metrics exporter for the api-postgres service.
  image: prometheuscommunity/postgres-exporter:latest
  environment:
    # The connection is given as URI + user + password instead of a single
    # DATA_SOURCE_NAME URL, so a DB_PASSWORD containing URL-reserved
    # characters (@, /, :, ?, #, %) does not need manual percent-encoding.
    - DATA_SOURCE_URI=api-postgres:5432/performancewest?sslmode=disable
    - DATA_SOURCE_USER=pw
    # NOTE(review): falls back to a password literal committed in this file
    # when DB_PASSWORD is unset.
    - DATA_SOURCE_PASS=${DB_PASSWORD:-pw_dev_2026}
  depends_on:
    - api-postgres
  restart: unless-stopped
|
||||
|
||||
nginx-exporter:
  # Scrapes the host nginx stub_status page and re-exports it for Prometheus.
  image: nginx/nginx-prometheus-exporter:latest
  command:
    # Double-dash form: 1.x exporter releases reject single-dash long flags,
    # while older releases accept both spellings.
    - --nginx.scrape-uri=http://host.docker.internal:80/nginx_status
  extra_hosts:
    # Makes host.docker.internal resolve to the Docker host gateway.
    - "host.docker.internal:host-gateway"
  restart: unless-stopped
|
||||
|
||||
blackbox-exporter:
  # Probe exporter; its module definitions come from the mounted blackbox.yml.
  image: "prom/blackbox-exporter:latest"
  restart: unless-stopped
  volumes:
    - "./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro"
|
||||
|
||||
volumes:
|
||||
api-pgdata:
|
||||
worker-data:
|
||||
|
|
@ -297,3 +394,5 @@ volumes:
|
|||
erpnext-mariadb-data:
|
||||
listmonk-uploads:
|
||||
umami-pgdata:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
|
|
|
|||
4
infra/ansible/ansible.cfg
Normal file
4
infra/ansible/ansible.cfg
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
[defaults]
# Role and inventory locations.
roles_path = ./roles
inventory = ./inventory/hosts.yml
# NOTE(review): disables SSH host key verification for managed hosts -
# confirm this is intentional outside of first-time provisioning.
host_key_checking = False
|
||||
|
|
@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
|
|||
shkeeper_admin_domain: crypto.performancewest.net
|
||||
minio_domain: minio.performancewest.net
|
||||
minio_console_domain: minio-console.performancewest.net
|
||||
monitoring_domain: monitoring.performancewest.net
|
||||
# Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
|
||||
docserver_ip: 108.181.102.34
|
||||
|
||||
|
|
|
|||
|
|
@ -32,4 +32,5 @@
|
|||
- worker-crons
|
||||
- shkeeper
|
||||
- nginx
|
||||
- monitoring
|
||||
- security-updates
|
||||
|
|
|
|||
13
infra/ansible/roles/monitoring/defaults/main.yml
Normal file
13
infra/ansible/roles/monitoring/defaults/main.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
---
# Defaults for the monitoring role.

# Public hostname and the ports used by the monitoring services.
monitoring_domain: monitoring.performancewest.net
grafana_port: 3200
prometheus_port: 9090
alertmanager_port: 9093

# Telegram bot for alerts; read from the vault, empty string when not defined.
telegram_bot_token: '{{ vault_telegram_bot_token | default("") }}'
telegram_chat_id: '{{ vault_telegram_chat_id | default("") }}'

# Grafana admin credentials; read from the vault.
# NOTE(review): the password falls back to a literal committed in this file
# when the vault variable is missing.
grafana_admin_user: '{{ vault_grafana_admin_user | default("admin") }}'
grafana_admin_password: '{{ vault_grafana_admin_password | default("pw_grafana_2026") }}'
|
||||
5
infra/ansible/roles/monitoring/handlers/main.yml
Normal file
5
infra/ansible/roles/monitoring/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
# Handlers for the monitoring role.

# Notified by the tasks that change nginx configuration or certificates.
- name: Reload nginx
  ansible.builtin.systemd:
    state: reloaded
    name: nginx
|
||||
91
infra/ansible/roles/monitoring/tasks/main.yml
Normal file
91
infra/ansible/roles/monitoring/tasks/main.yml
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
---
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
|
||||
# Publishes nginx's stub_status page for the nginx-exporter container.
# The exporter requests http://host.docker.internal:80/nginx_status, so its
# Host header is "host.docker.internal"; that name must be listed in
# server_name or nginx routes the request to whichever server is the default
# for port 80 instead of this one.
- name: Enable nginx stub_status endpoint
  ansible.builtin.copy:
    content: |
      server {
          listen 80;
          server_name 127.0.0.1 host.docker.internal;

          location /nginx_status {
              stub_status;
              # Loopback plus the 172.16.0.0/12 private range; everything
              # else is refused.
              allow 127.0.0.1;
              allow 172.16.0.0/12;
              deny all;
          }
      }
    dest: /etc/nginx/conf.d/stub-status.conf
    owner: root
    group: root
    mode: "0644"
  notify: Reload nginx
|
||||
|
||||
# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
|
||||
# Renders the TLS vhost template into sites-available.
# NOTE(review): the rendered config references Let's Encrypt certificate files,
# while the certificate is only requested by a later task - confirm a first
# run on a fresh host can reload nginx successfully.
- name: Deploy Grafana nginx config
  ansible.builtin.template:
    dest: /etc/nginx/sites-available/pw-monitoring.conf
    src: ../../nginx/templates/pw-monitoring-tls.conf.j2
    mode: "0644"
    owner: root
    group: root
  notify: Reload nginx

# Activates the vhost by symlinking it into sites-enabled.
- name: Enable Grafana nginx config
  ansible.builtin.file:
    state: link
    dest: /etc/nginx/sites-enabled/pw-monitoring.conf
    src: /etc/nginx/sites-available/pw-monitoring.conf
  notify: Reload nginx
|
||||
|
||||
# ── 3. Obtain TLS certificate ────────────────────────────────────────
|
||||
# Records whether a certificate chain already exists for the monitoring domain.
- name: Check if monitoring cert exists
  ansible.builtin.stat:
    path: "/etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem"
  register: monitoring_cert

# Requests a certificate through the webroot challenge, but only when the
# stat above found no existing chain.
- name: Obtain Let's Encrypt cert for monitoring domain
  ansible.builtin.command:
    argv:
      - certbot
      - certonly
      - "--webroot"
      - "-w"
      - "{{ certbot_webroot }}"
      - "-d"
      - "{{ monitoring_domain }}"
      - "--non-interactive"
      - "--agree-tos"
      - "--email"
      - "{{ certbot_email }}"
  when: not monitoring_cert.stat.exists
  notify: Reload nginx
|
||||
|
||||
# ── 4. Set env vars for Telegram in .env ─────────────────────────────
|
||||
# Writes KEY=value lines for the monitoring stack into the project's .env,
# replacing an existing line for the same key.
# The condition is evaluated per item: an empty value is skipped, while the
# Grafana credentials are still written when no Telegram token is configured
# (previously the whole loop was gated on the Telegram token).
# no_log keeps the secret values out of Ansible's output.
- name: Ensure Telegram and Grafana vars in .env
  ansible.builtin.lineinfile:
    path: "{{ project_dir }}/.env"
    regexp: "^{{ item.key }}="
    line: "{{ item.key }}={{ item.value }}"
    state: present
  loop:
    - { key: "TELEGRAM_BOT_TOKEN", value: "{{ telegram_bot_token }}" }
    - { key: "TELEGRAM_CHAT_ID", value: "{{ telegram_chat_id }}" }
    - { key: "GRAFANA_ADMIN_USER", value: "{{ grafana_admin_user }}" }
    - { key: "GRAFANA_ADMIN_PASSWORD", value: "{{ grafana_admin_password }}" }
  when: item.value | string | length > 0
  no_log: true
|
||||
|
||||
# ── 5. UFW rules ─────────────────────────────────────────────────────
|
||||
# Adds a UFW allow rule for the Grafana port with source 127.0.0.1.
# NOTE(review): an allow rule does not itself block other sources -
# presumably the host's default policy denies them; confirm.
- name: Allow Grafana from localhost only
  community.general.ufw:
    comment: "Grafana (via nginx)"
    from_ip: 127.0.0.1
    proto: tcp
    port: "{{ grafana_port }}"
    rule: allow
|
||||
|
||||
# ── 6. Start monitoring stack ────────────────────────────────────────
|
||||
# Brings up the monitoring services from the project's Compose file.
# Uses the command module with an argv list: no shell is needed, and `chdir`
# already sets the working directory, so the former `cd ... &&` prefix was
# redundant.
# changed_when is forced to true, so this task always reports "changed".
- name: Start monitoring containers
  ansible.builtin.command:
    argv:
      - docker
      - compose
      - up
      - "-d"
      - prometheus
      - grafana
      - alertmanager
      - node-exporter
      - cadvisor
      - postgres-exporter
      - nginx-exporter
      - blackbox-exporter
    chdir: "{{ project_dir }}"
  changed_when: true
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
# {{ ansible_managed }}
# HTTPS config for {{ monitoring_domain }} (Grafana)
#
# The hostname comes from `monitoring_domain` everywhere, so the server
# names, redirect target and certificate paths always agree with the domain
# the certificate is requested for.

# Redirect HTTP -> HTTPS (ACME challenges are still served over HTTP)
server {
    listen 80;
    server_name {{ monitoring_domain }};

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }

    location / {
        return 301 https://{{ monitoring_domain }}$request_uri;
    }
}

# Grafana dashboard
server {
    listen 443 ssl;
    http2 on;
    server_name {{ monitoring_domain }};

    ssl_certificate     /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;

    include /etc/nginx/snippets/pw-security.conf;

    client_max_body_size 10m;

    location / {
        # Grafana is published on the host loopback interface; the port
        # falls back to 3200 when `grafana_port` is not defined.
        proxy_pass http://127.0.0.1:{{ grafana_port | default(3200) }};
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # WebSocket for Grafana Live
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        proxy_connect_timeout 10s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }

    location /.well-known/acme-challenge/ {
        root {{ certbot_webroot }};
    }
}
|
||||
162
monitoring/alert_rules.yml
Normal file
162
monitoring/alert_rules.yml
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
groups:
# ══════════════════════════════════════════════════════════════════════
# Service Down Alerts
# ══════════════════════════════════════════════════════════════════════
- name: service_down
  rules:
  # A blackbox HTTP probe has been failing for 2 minutes.
  - alert: EndpointDown
    expr: probe_success{job="blackbox_http"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is DOWN"
      description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."

  # A blackbox TCP connect probe has been failing for 1 minute.
  - alert: TCPPortDown
    expr: probe_success{job="blackbox_tcp"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "TCP port {{ $labels.instance }} is DOWN"
      description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."

  # Fires when none of the listed containers report container_last_seen at
  # all, or when one of them has not been seen for more than 60 seconds.
  # NOTE(review): the absent() branch carries no `name` label, so the
  # annotations render an empty container name in that case.
  - alert: ContainerDown
    expr: |
      absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
      or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Container {{ $labels.name }} is DOWN"
      description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."

  # container_start_time_seconds is a gauge holding the start timestamp, so
  # restarts are counted with changes() - the number of times the value
  # changed in the window. increase() is meant for counters and would yield
  # an extrapolated timestamp difference instead of a restart count.
  - alert: ContainerRestarting
    expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} is restart-looping"
      description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Host Resource Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: host_resources
  rules:
  # Non-idle CPU share, averaged per instance over 5 minutes.
  - alert: HighCPU
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
      description: "CPU usage has been above 85% for 10 minutes."

  # node_exporter publishes memory as node_memory_MemAvailable_bytes and
  # node_memory_MemTotal_bytes; the former *AvailableBytes / *MemTotalBytes
  # names match no series, so the alert could never fire.
  - alert: HighMemory
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
      description: "Memory usage has been above 90% for 5 minutes."

  # Root filesystem usage, warning threshold.
  - alert: DiskSpaceLow
    expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
      description: "Root filesystem is more than 80% full."

  # Root filesystem usage, critical threshold.
  - alert: DiskSpaceCritical
    expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
      description: "Root filesystem is more than 92% full. Immediate action required."

  # NOTE(review): the threshold of 8 is absolute and not scaled by CPU count.
  - alert: HighLoadAverage
    expr: node_load15 > 8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High load average ({{ $value | printf \"%.1f\" }})"
      description: "15-minute load average has been above 8 for 10 minutes."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Database Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: database
  rules:
  # The exporter reports pg_up == 0 when it cannot reach the database.
  - alert: PostgresDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is DOWN"
      description: "PostgreSQL exporter cannot connect to the database."

  # pg_stat_activity_count is exported as several series (split by labels
  # such as database and connection state), so the threshold is applied to
  # the per-instance sum rather than to each series on its own.
  - alert: PostgresHighConnections
    expr: sum by (instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High PostgreSQL connections ({{ $value }})"
      description: "PostgreSQL active connections exceeding 80."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# SSL Certificate Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: ssl
  rules:
  # Warning tier: the earliest certificate expiry seen by the probe is less
  # than 14 days away ($value is the remaining time in seconds).
  - alert: SSLCertExpiringSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'SSL cert expiring in < 14 days for {{ $labels.instance }}'
      description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.'

  # Critical tier: less than 3 days remain.
  - alert: SSLCertExpiryCritical
    expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: 'SSL cert expiring in < 3 days for {{ $labels.instance }}'
      description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken.'
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Response Time Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: latency
  rules:
  # Uses the total probe duration. The former selector looked only at
  # phase="transfer" (body download), so a slow connect, TLS handshake or
  # server processing time would never trigger the alert.
  - alert: SlowHTTPResponse
    expr: probe_duration_seconds{job="blackbox_http"} > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
      description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."

  # NOTE(review): the nginx exporter in this stack scrapes the stub_status
  # page; stub_status-based metrics are not expected to carry a `status`
  # label, so this selector may match no series - confirm against the
  # exporter's /metrics output before relying on this alert.
  - alert: HighNginx5xxRate
    expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High nginx 5xx error rate"
      description: "More than 0.5 req/s returning 5xx errors."
|
||||
40
monitoring/alertmanager.yml
Normal file
40
monitoring/alertmanager.yml
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Alertmanager routing: every alert goes to the Telegram receiver; only the
# repeat interval differs by severity.
global:
  resolve_timeout: 5m

route:
  receiver: telegram
  group_by: [alertname, instance]
  group_wait: 30s
  group_interval: 5m
  # Repeat interval for alerts that match neither child route below.
  repeat_interval: 4h
  routes:
    # `matchers` replaces the deprecated `match` mapping.
    - matchers:
        - 'severity = "critical"'
      receiver: telegram
      repeat_interval: 1h
    - matchers:
        - 'severity = "warning"'
      receiver: telegram
      repeat_interval: 6h

receivers:
  - name: telegram
    telegram_configs:
      # NOTE(review): Alertmanager does not expand ${VAR} references in this
      # file by itself; these two placeholders must be substituted before
      # Alertmanager reads the file (chat_id has to end up as an integer).
      - bot_token: "${TELEGRAM_BOT_TOKEN}"
        chat_id: ${TELEGRAM_CHAT_ID}
        parse_mode: HTML
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .Status | toUpper }}</b>
          {{ range .Alerts }}
          <b>{{ .Labels.alertname }}</b>
          {{ .Annotations.summary }}
          {{ if .Annotations.description }}<i>{{ .Annotations.description }}</i>{{ end }}
          {{ end }}
          <code>Server: pw-server | {{ .ExternalURL }}</code>

inhibit_rules:
  # A firing critical alert mutes a warning alert with the same alertname
  # and instance. `source_matchers` / `target_matchers` replace the
  # deprecated `source_match` / `target_match` mappings.
  # NOTE(review): a critical and a warning alert must share `alertname` for
  # this rule to apply - check that the rule files define such pairs.
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: [alertname, instance]
|
||||
15
monitoring/blackbox.yml
Normal file
15
monitoring/blackbox.yml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
# Probe modules referenced by the Prometheus blackbox scrape jobs.
modules:
  # HTTP probe: IPv4 preferred, redirects followed, TLS certificates verified.
  http_2xx:
    prober: http
    timeout: 10s
    http:
      preferred_ip_protocol: ip4
      follow_redirects: true
      valid_http_versions:
        - "HTTP/1.1"
        - "HTTP/2.0"
      valid_status_codes:
        - 200
        - 301
        - 302
      tls_config:
        insecure_skip_verify: false

  # Plain TCP connect probe.
  tcp_connect:
    prober: tcp
    timeout: 5s
|
||||
9
monitoring/grafana-datasources.yml
Normal file
9
monitoring/grafana-datasources.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# Grafana datasource provisioning: a single, default, non-editable Prometheus
# datasource reached at the `prometheus` host name.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: "http://prometheus:9090"
    access: proxy
    isDefault: true
    editable: false
|
||||
83
monitoring/prometheus.yml
Normal file
83
monitoring/prometheus.yml
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
# Scrape targets and evaluate rules every 30 seconds.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

# Alert rules file, as mounted into the Prometheus container.
rule_files:
  - "/etc/prometheus/alert_rules.yml"

# Firing alerts are sent to the Alertmanager service.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
|
||||
|
||||
scrape_configs:
  # ── Prometheus self-monitoring ──────────────────────────────────────
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"

  # ── Host OS metrics (node_exporter) ────────────────────────────────
  - job_name: node
    static_configs:
      - targets:
          - "node-exporter:9100"

  # ── Docker container metrics (cAdvisor) ────────────────────────────
  - job_name: cadvisor
    static_configs:
      - targets:
          - "cadvisor:8080"

  # ── PostgreSQL (prod) ──────────────────────────────────────────────
  # The target's `instance` label is set to "prod".
  - job_name: postgres_prod
    static_configs:
      - targets:
          - "postgres-exporter:9187"
        labels:
          instance: prod

  # ── nginx ──────────────────────────────────────────────────────────
  - job_name: nginx
    static_configs:
      - targets:
          - "nginx-exporter:9113"

  # ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
  # Each listed URL is passed to the blackbox exporter as the `target`
  # parameter and kept as the `instance` label; the scrape itself is sent to
  # blackbox-exporter:9115/probe.
  - job_name: blackbox_http
    metrics_path: /probe
    params:
      module:
        - http_2xx
    static_configs:
      - targets:
          - "https://performancewest.net"
          - "https://api.performancewest.net/api/v1/fcc/search?q=test"
          - "https://dev.performancewest.net"
          - "https://api.dev.performancewest.net/api/v1/fcc/search?q=test"
          - "https://crm.performancewest.net"
          - "https://lists.performancewest.net"
          - "https://analytics.performancewest.net"
          - "https://minio.performancewest.net/minio/health/live"
          - "https://crypto.performancewest.net"
          - "https://pay.performancewest.net"
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "blackbox-exporter:9115"

  # ── Blackbox TCP probes (port monitoring) ──────────────────────────
  # Same relabelling as above, using the tcp_connect module.
  - job_name: blackbox_tcp
    metrics_path: /probe
    params:
      module:
        - tcp_connect
    static_configs:
      - targets:
          - "api-postgres:5432"
          - "erpnext-mariadb:3306"
          - "erpnext-redis:6379"
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "blackbox-exporter:9115"
|
||||
Loading…
Add table
Add a link
Reference in a new issue