diff --git a/docker-compose.yml b/docker-compose.yml
index f8b4297..10ab469 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -285,6 +285,103 @@ services:
- umami-pgdata:/var/lib/postgresql/data
restart: unless-stopped
+ # ── Monitoring Stack ────────────────────────────────────────────────
+ prometheus:  # Prometheus server: loads the scrape config and alert rules mounted below
+ image: prom/prometheus:latest  # NOTE(review): floating tag; consider pinning a version for reproducible deploys
+ ports:
+ - "127.0.0.1:9090:9090"  # published on the host loopback interface only
+ volumes:
+ - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+ - ./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
+ - prometheus-data:/prometheus  # named volume declared in the top-level volumes section
+ command:
+ - --config.file=/etc/prometheus/prometheus.yml
+ - --storage.tsdb.retention.time=90d  # keep 90 days of samples
+ - --web.enable-lifecycle  # turns on the HTTP reload/quit endpoints
+ restart: unless-stopped
+
+ grafana:  # Grafana UI; the nginx vhost for monitoring.performancewest.net proxies to host port 3200
+ image: grafana/grafana:latest  # NOTE(review): floating tag; consider pinning a version
+ ports:
+ - "127.0.0.1:3200:3000"  # host loopback 3200 -> container 3000
+ environment:
+ - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
+ - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-pw_grafana_2026}  # NOTE(review): committed fallback password applies whenever the variable is unset
+ - GF_SERVER_ROOT_URL=https://monitoring.performancewest.net
+ - GF_SERVER_DOMAIN=monitoring.performancewest.net
+ - GF_SMTP_ENABLED=true
+ - GF_SMTP_HOST=${SMTP_HOST}:${SMTP_PORT}  # no defaults: SMTP_* must be provided by the environment or .env
+ - GF_SMTP_USER=${SMTP_USER}
+ - GF_SMTP_PASSWORD=${SMTP_PASS}
+ - GF_SMTP_FROM_ADDRESS=noreply@performancewest.net
+ - GF_USERS_ALLOW_SIGN_UP=false  # no self-service account creation
+ - GF_AUTH_ANONYMOUS_ENABLED=false  # every viewer must log in
+ volumes:
+ - grafana-data:/var/lib/grafana  # named volume declared in the top-level volumes section
+ - ./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
+ depends_on:
+ - prometheus  # start order only; the provisioned datasource points at this service
+ restart: unless-stopped
+
+ alertmanager:  # Receives alerts from Prometheus and forwards them to the Telegram receiver
+ image: prom/alertmanager:latest  # NOTE(review): floating tag; consider pinning a version
+ ports:
+ - "127.0.0.1:9093:9093"  # published on the host loopback interface only
+ volumes:
+ - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+ command:
+ - --config.file=/etc/alertmanager/alertmanager.yml
+ - --storage.path=/alertmanager  # NOTE(review): no volume is mounted at this path — confirm losing silences/notification state on recreate is acceptable
+ environment:
+ - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}  # NOTE(review): Alertmanager does not expand env vars in its config file — confirm alertmanager.yml is templated before use
+ - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
+ restart: unless-stopped
+
+ node-exporter:  # Host OS metrics; scraped by the `node` job at node-exporter:9100
+ image: prom/node-exporter:latest  # NOTE(review): floating tag; consider pinning a version
+ command:
+ - --path.rootfs=/host  # read host filesystem data through the /host bind mount below
+ - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)  # `$$` is Compose escaping for a literal `$`
+ volumes:
+ - /:/host:ro,rslave  # host root, read-only
+ pid: host  # share the host PID namespace
+ restart: unless-stopped
+
+ cadvisor:  # Per-container metrics; scraped by the `cadvisor` job at cadvisor:8080
+ image: gcr.io/cadvisor/cadvisor:latest  # NOTE(review): floating tag; consider pinning a version
+ volumes:  # all host paths are mounted read-only
+ - /:/rootfs:ro
+ - /var/run:/var/run:ro
+ - /sys:/sys:ro
+ - /var/lib/docker/:/var/lib/docker:ro
+ - /dev/disk/:/dev/disk:ro
+ devices:
+ - /dev/kmsg
+ privileged: true  # NOTE(review): grants full host privileges — confirm this is required rather than a narrower capability set
+ restart: unless-stopped
+
+ postgres-exporter:  # PostgreSQL metrics; scraped by the `postgres_prod` job at postgres-exporter:9187
+ image: prometheuscommunity/postgres-exporter:latest  # NOTE(review): floating tag; consider pinning a version
+ environment:
+ - DATA_SOURCE_NAME=postgresql://pw:${DB_PASSWORD:-pw_dev_2026}@api-postgres:5432/performancewest?sslmode=disable  # NOTE(review): connects as the `pw` user with a committed fallback password — consider a dedicated read-only monitoring role
+ depends_on:
+ - api-postgres  # start order only
+ restart: unless-stopped
+
+ nginx-exporter:  # Converts the host nginx stub_status page into Prometheus metrics (scraped at nginx-exporter:9113)
+ image: nginx/nginx-prometheus-exporter:latest  # NOTE(review): floating tag; consider pinning a version
+ command:
+ - --nginx.scrape-uri=http://host.docker.internal:80/nginx_status  # double dash: exporter 1.x rejects single-dash flags, and 0.x accepts both forms
+ extra_hosts:
+ - "host.docker.internal:host-gateway"  # lets the container resolve the Docker host
+ restart: unless-stopped
+
+ blackbox-exporter:  # Runs the HTTP/TCP probes requested by the blackbox_* scrape jobs (blackbox-exporter:9115)
+ image: prom/blackbox-exporter:latest  # NOTE(review): floating tag; consider pinning a version
+ volumes:
+ - ./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro  # presumably the image's default config path, since no --config.file is passed — confirm
+ restart: unless-stopped
+
volumes:
api-pgdata:
worker-data:
@@ -297,3 +394,5 @@ volumes:
erpnext-mariadb-data:
listmonk-uploads:
umami-pgdata:
+ prometheus-data:
+ grafana-data:
diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg
new file mode 100644
index 0000000..7f78352
--- /dev/null
+++ b/infra/ansible/ansible.cfg
@@ -0,0 +1,4 @@
+[defaults]
+roles_path = ./roles ; where Ansible looks for roles
+inventory = ./inventory/hosts.yml ; default inventory, so -i can be omitted
+host_key_checking = False ; NOTE(review): disables SSH host key verification — confirm this is acceptable for production hosts
diff --git a/infra/ansible/inventory/group_vars/all.yml b/infra/ansible/inventory/group_vars/all.yml
index 83497a0..1e78060 100644
--- a/infra/ansible/inventory/group_vars/all.yml
+++ b/infra/ansible/inventory/group_vars/all.yml
@@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
shkeeper_admin_domain: crypto.performancewest.net
minio_domain: minio.performancewest.net
minio_console_domain: minio-console.performancewest.net
+monitoring_domain: monitoring.performancewest.net  # public hostname for Grafana; the monitoring role defaults define the same value
# Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
docserver_ip: 108.181.102.34
diff --git a/infra/ansible/playbooks/site.yml b/infra/ansible/playbooks/site.yml
index c66031c..02500fc 100644
--- a/infra/ansible/playbooks/site.yml
+++ b/infra/ansible/playbooks/site.yml
@@ -32,4 +32,5 @@
- worker-crons
- shkeeper
- nginx
+ - monitoring
- security-updates
diff --git a/infra/ansible/roles/monitoring/defaults/main.yml b/infra/ansible/roles/monitoring/defaults/main.yml
new file mode 100644
index 0000000..9540dec
--- /dev/null
+++ b/infra/ansible/roles/monitoring/defaults/main.yml
@@ -0,0 +1,13 @@
+---
+monitoring_domain: monitoring.performancewest.net  # hostname used for the certificate path and certbot request
+grafana_port: 3200  # host port Grafana is published on; used by the UFW rule
+prometheus_port: 9090  # not referenced by the role tasks visible here
+alertmanager_port: 9093  # not referenced by the role tasks visible here
+
+# Telegram bot credentials for alert delivery; empty unless the vault_* variables are defined
+telegram_bot_token: "{{ vault_telegram_bot_token | default('') }}"
+telegram_chat_id: "{{ vault_telegram_chat_id | default('') }}"
+
+# Grafana admin credentials; NOTE(review): the password falls back to a committed literal when the vault variable is missing
+grafana_admin_user: "{{ vault_grafana_admin_user | default('admin') }}"
+grafana_admin_password: "{{ vault_grafana_admin_password | default('pw_grafana_2026') }}"
diff --git a/infra/ansible/roles/monitoring/handlers/main.yml b/infra/ansible/roles/monitoring/handlers/main.yml
new file mode 100644
index 0000000..7419154
--- /dev/null
+++ b/infra/ansible/roles/monitoring/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Reload nginx  # notified by the role's nginx config and certificate tasks
+ ansible.builtin.systemd:
+ name: nginx
+ state: reloaded  # reload rather than restart
diff --git a/infra/ansible/roles/monitoring/tasks/main.yml b/infra/ansible/roles/monitoring/tasks/main.yml
new file mode 100644
index 0000000..f7a5aa6
--- /dev/null
+++ b/infra/ansible/roles/monitoring/tasks/main.yml
@@ -0,0 +1,91 @@
+---
+# ══════════════════════════════════════════════════════════════════════════════
+# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
+# ══════════════════════════════════════════════════════════════════════════════
+
+# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
+- name: Enable nginx stub_status endpoint  # writes a small vhost that serves /nginx_status for the nginx-exporter container
+ ansible.builtin.copy:
+ content: |
+ server {
+ listen 80;
+ server_name 127.0.0.1 host.docker.internal;
+ location /nginx_status {
+ stub_status;
+ allow 127.0.0.1;
+ allow 172.16.0.0/12;
+ deny all;
+ }
+ }
+ dest: /etc/nginx/conf.d/stub-status.conf
+ owner: root
+ group: root
+ mode: "0644"
+ notify: Reload nginx  # host.docker.internal is the Host header the exporter sends; without it this vhost only matches by default-server fallback
+
+# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
+- name: Deploy Grafana nginx config  # renders the monitoring vhost into sites-available
+ ansible.builtin.template:
+ src: ../../nginx/templates/pw-monitoring-tls.conf.j2  # relative path reaching into the sibling nginx role's templates
+ dest: /etc/nginx/sites-available/pw-monitoring.conf
+ owner: root
+ group: root
+ mode: "0644"
+ notify: Reload nginx  # NOTE(review): the vhost references the Let's Encrypt cert that a later task obtains — confirm a first run reloads cleanly
+
+- name: Enable Grafana nginx config  # activates the vhost by symlinking it into sites-enabled
+ ansible.builtin.file:
+ src: /etc/nginx/sites-available/pw-monitoring.conf
+ dest: /etc/nginx/sites-enabled/pw-monitoring.conf
+ state: link
+ notify: Reload nginx
+
+# ── 3. Obtain TLS certificate ────────────────────────────────────────
+- name: Check if monitoring cert exists  # result gates the certbot task so the cert is requested only once
+ ansible.builtin.stat:
+ path: /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem
+ register: monitoring_cert
+
+- name: Obtain Let's Encrypt cert for monitoring domain  # webroot challenge; certbot_webroot and certbot_email are defined outside this role
+ ansible.builtin.command:
+ cmd: >
+ certbot certonly --webroot -w {{ certbot_webroot }}
+ -d {{ monitoring_domain }}
+ --non-interactive --agree-tos
+ --email {{ certbot_email }}
+ when: not monitoring_cert.stat.exists  # skip when fullchain.pem is already present
+ notify: Reload nginx
+
+# ── 4. Set env vars for Telegram in .env ─────────────────────────────
+- name: Ensure Telegram vars in .env  # also writes the Grafana admin credentials read by docker-compose
+ ansible.builtin.lineinfile:
+ path: "{{ project_dir }}/.env"
+ regexp: "^{{ item.key }}="  # replace an existing KEY= line instead of appending a duplicate
+ line: "{{ item.key }}={{ item.value }}"
+ state: present
+ loop:
+ - { key: "TELEGRAM_BOT_TOKEN", value: "{{ telegram_bot_token }}" }
+ - { key: "TELEGRAM_CHAT_ID", value: "{{ telegram_chat_id }}" }
+ - { key: "GRAFANA_ADMIN_USER", value: "{{ grafana_admin_user }}" }
+ - { key: "GRAFANA_ADMIN_PASSWORD", value: "{{ grafana_admin_password }}" }
+ when: item.value | string | length > 0  # evaluated per item: only unset values are skipped, so Grafana credentials no longer depend on the Telegram token
+ no_log: true  # keep secret values out of Ansible output
+
+# ── 5. UFW rules ─────────────────────────────────────────────────────
+- name: Allow Grafana from localhost only  # adds an allow rule for loopback traffic to the Grafana port
+ community.general.ufw:
+ rule: allow
+ port: "{{ grafana_port }}"
+ proto: tcp
+ from_ip: 127.0.0.1  # NOTE(review): this only adds an allow; it does not itself deny other sources — confirm the default policy covers that
+ comment: "Grafana (via nginx)"
+
+# ── 6. Start monitoring stack ────────────────────────────────────────
+# The command module's chdir sets the working directory, so no shell `cd` is needed.
+- name: Start monitoring containers  # brings up only the monitoring services from the project's compose file
+ ansible.builtin.command:
+ cmd: >
+ docker compose up -d prometheus grafana alertmanager
+ node-exporter cadvisor postgres-exporter nginx-exporter blackbox-exporter
+ chdir: "{{ project_dir }}"  # handles paths with spaces, unlike the former unquoted inline cd
+ changed_when: true  # always reported as changed; compose output is not inspected
diff --git a/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2 b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
new file mode 100644
index 0000000..c4078d5
--- /dev/null
+++ b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
@@ -0,0 +1,58 @@
+# {{ ansible_managed }}
+# HTTPS config for {{ monitoring_domain }} (Grafana)
+
+# Redirect HTTP -> HTTPS
+server {
+ listen 80;
+ server_name {{ monitoring_domain }};
+
+ location /.well-known/acme-challenge/ {  # served over plain HTTP for certbot's webroot challenge
+ root {{ certbot_webroot }};
+ }
+
+ location / {
+ return 301 https://{{ monitoring_domain }}$request_uri;
+ }
+}
+
+# Grafana dashboard
+server {
+ listen 443 ssl;
+ http2 on;
+ server_name {{ monitoring_domain }};
+
+ ssl_certificate /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;  # same path the monitoring role checks before running certbot
+ ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
+ ssl_protocols TLSv1.2 TLSv1.3;
+ ssl_ciphers HIGH:!aNULL:!MD5;
+ ssl_prefer_server_ciphers on;
+ ssl_session_cache shared:SSL:10m;
+ ssl_session_timeout 10m;
+
+ add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
+
+ include /etc/nginx/snippets/pw-security.conf;
+
+ client_max_body_size 10m;
+
+ location / {
+ proxy_pass http://127.0.0.1:3200;  # host loopback port Grafana is published on in docker-compose
+ proxy_set_header Host $host;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto $scheme;
+
+ # WebSocket for Grafana Live
+ proxy_http_version 1.1;
+ proxy_set_header Upgrade $http_upgrade;
+ proxy_set_header Connection "upgrade";
+
+ proxy_connect_timeout 10s;
+ proxy_send_timeout 60s;
+ proxy_read_timeout 60s;
+ }
+
+ location /.well-known/acme-challenge/ {
+ root {{ certbot_webroot }};
+ }
+}
diff --git a/monitoring/alert_rules.yml b/monitoring/alert_rules.yml
new file mode 100644
index 0000000..8f84920
--- /dev/null
+++ b/monitoring/alert_rules.yml
@@ -0,0 +1,162 @@
+groups:
+ # ══════════════════════════════════════════════════════════════════════
+ # Service Down Alerts
+ # ══════════════════════════════════════════════════════════════════════
+ - name: service_down
+ rules:
+ - alert: EndpointDown  # an HTTP blackbox probe has been failing for 2 minutes
+ expr: probe_success{job="blackbox_http"} == 0
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: "{{ $labels.instance }} is DOWN"
+ description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."
+
+ - alert: TCPPortDown  # a TCP blackbox probe has been failing for 1 minute
+ expr: probe_success{job="blackbox_tcp"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "TCP port {{ $labels.instance }} is DOWN"
+ description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."
+
+ - alert: ContainerDown  # NOTE(review): absent() over a regex matcher fires only when no listed container reports at all and carries no `name` label — confirm $labels.name renders
+ expr: |
+ absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
+ or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Container {{ $labels.name }} is DOWN"
+ description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
+
+ - alert: ContainerRestarting  # counts restarts as value changes of the start-time gauge
+ expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
+ for: 5m  # changes() suits a gauge; increase() would return elapsed seconds and exceed 2 after one restart
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container {{ $labels.name }} is restart-looping"
+ description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
+
+ # ══════════════════════════════════════════════════════════════════════
+ # Host Resource Alerts
+ # ══════════════════════════════════════════════════════════════════════
+ - name: host_resources
+ rules:
+ - alert: HighCPU  # busy % = 100 minus the 5m average idle rate across CPUs, per instance
+ expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
+ description: "CPU usage has been above 85% for 10 minutes."
+
+ - alert: HighMemory  # used % = 1 - MemAvailable / MemTotal
+ expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+ for: 5m  # metric names follow node_exporter's node_memory_<field>_bytes scheme; the previous names matched no series
+ labels:
+ severity: warning
+ annotations:
+ summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
+ description: "Memory usage has been above 90% for 5 minutes."
+
+ - alert: DiskSpaceLow  # warning tier: root filesystem above 80% used
+ expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
+ description: "Root filesystem is more than 80% full."
+
+ - alert: DiskSpaceCritical  # critical tier of the same expression, threshold 92%
+ expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
+ description: "Root filesystem is more than 92% full. Immediate action required."
+
+ - alert: HighLoadAverage  # NOTE(review): fixed threshold of 8 is not scaled by CPU count — confirm it suits the host
+ expr: node_load15 > 8
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High load average ({{ $value | printf \"%.1f\" }})"
+ description: "15-minute load average has been above 8 for 10 minutes."
+
+ # ══════════════════════════════════════════════════════════════════════
+ # Database Alerts
+ # ══════════════════════════════════════════════════════════════════════
+ - name: database
+ rules:
+ - alert: PostgresDown  # the exporter reports pg_up == 0
+ expr: pg_up == 0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "PostgreSQL is DOWN"
+ description: "PostgreSQL exporter cannot connect to the database."
+
+ - alert: PostgresHighConnections  # NOTE(review): pg_stat_activity_count is presumably one series per database/state, so 80 applies per series, not to the total — confirm
+ expr: pg_stat_activity_count > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High PostgreSQL connections ({{ $value }})"
+ description: "PostgreSQL active connections exceeding 80."
+
+ # ══════════════════════════════════════════════════════════════════════
+ # SSL Certificate Alerts
+ # ══════════════════════════════════════════════════════════════════════
+ - name: ssl
+ rules:
+ - alert: SSLCertExpiringSoon  # $value is the number of seconds left until the earliest cert in the chain expires
+ expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
+ for: 1h
+ labels:
+ severity: warning
+ annotations:
+ summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}"
+ description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
+
+ - alert: SSLCertExpiryCritical  # critical tier of the same expression, threshold 3 days
+ expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}"
+ description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."
+
+ # ══════════════════════════════════════════════════════════════════════
+ # Response Time Alerts
+ # ══════════════════════════════════════════════════════════════════════
+ - name: latency
+ rules:
+ - alert: SlowHTTPResponse  # total probe time; the former phase="transfer" filter excluded connect, TLS and server processing time
+ expr: probe_duration_seconds{job="blackbox_http"} > 5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
+ description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
+
+ - alert: HighNginx5xxRate  # NOTE(review): the stub_status-based exporter presumably exposes nginx_http_requests_total without a `status` label — confirm this can match any series
+ expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High nginx 5xx error rate"
+ description: "More than 0.5 req/s returning 5xx errors."
diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml
new file mode 100644
index 0000000..82907c2
--- /dev/null
+++ b/monitoring/alertmanager.yml
@@ -0,0 +1,40 @@
+global:
+ resolve_timeout: 5m
+
+route:  # every route delivers to the telegram receiver; the child routes differ only in repeat_interval
+ receiver: telegram
+ group_by: [alertname, instance]
+ group_wait: 30s
+ group_interval: 5m
+ repeat_interval: 4h  # applies to alerts that match neither child route
+ routes:
+ - match:  # NOTE(review): `match` is the older matcher syntax; newer Alertmanager releases prefer `matchers`
+ severity: critical
+ receiver: telegram
+ repeat_interval: 1h  # re-notify critical alerts hourly
+ - match:
+ severity: warning
+ receiver: telegram
+ repeat_interval: 6h  # re-notify warnings every 6 hours
+
+receivers:
+ - name: telegram  # single receiver used by every route
+ telegram_configs:
+ - bot_token: "${TELEGRAM_BOT_TOKEN}"  # NOTE(review): Alertmanager does not expand ${...} in its config — confirm this file is templated before the container reads it
+ chat_id: ${TELEGRAM_CHAT_ID}  # NOTE(review): chat_id must parse as an integer; an unexpanded placeholder would presumably fail config loading
+ parse_mode: HTML
+ message: |
+ {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} {{ .Status | toUpper }}
+ {{ range .Alerts }}
+ {{ .Labels.alertname }}
+ {{ .Annotations.summary }}
+ {{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}
+ {{ end }}
+ Server: pw-server | {{ .ExternalURL }}
+
+inhibit_rules:  # mute a warning while a critical alert with the same alertname and instance is firing
+ - source_match:
+ severity: critical
+ target_match:
+ severity: warning
+ equal: [alertname, instance]  # NOTE(review): warning/critical pairs such as DiskSpaceLow/DiskSpaceCritical have different alertnames, so this rule would not link them — confirm intent
diff --git a/monitoring/blackbox.yml b/monitoring/blackbox.yml
new file mode 100644
index 0000000..ab91b22
--- /dev/null
+++ b/monitoring/blackbox.yml
@@ -0,0 +1,15 @@
+modules:
+ http_2xx:  # module selected by the blackbox_http scrape job
+ prober: http
+ timeout: 10s
+ http:
+ valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+ valid_status_codes: [200, 301, 302]  # NOTE(review): with follow_redirects on, the final response is presumably what gets checked — confirm 301/302 are still needed
+ follow_redirects: true
+ preferred_ip_protocol: ip4
+ tls_config:
+ insecure_skip_verify: false  # certificate verification stays enabled
+
+ tcp_connect:  # module selected by the blackbox_tcp scrape job
+ prober: tcp
+ timeout: 5s
diff --git a/monitoring/grafana-datasources.yml b/monitoring/grafana-datasources.yml
new file mode 100644
index 0000000..bb009bb
--- /dev/null
+++ b/monitoring/grafana-datasources.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:  # provisioning file mounted into Grafana's provisioning/datasources directory
+ - name: Prometheus
+ type: prometheus
+ access: proxy
+ url: http://prometheus:9090  # Compose service name and container port
+ isDefault: true
+ editable: false  # datasource cannot be changed from the Grafana UI
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
new file mode 100644
index 0000000..88c2dcc
--- /dev/null
+++ b/monitoring/prometheus.yml
@@ -0,0 +1,83 @@
+global:
+ scrape_interval: 30s  # default for every job below; none overrides it
+ evaluation_interval: 30s  # how often the alert rules are evaluated
+
+rule_files:
+ - /etc/prometheus/alert_rules.yml  # container path where docker-compose mounts monitoring/alert_rules.yml
+
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets:
+ - alertmanager:9093  # Compose service name and container port
+
+scrape_configs:  # targets use Compose service names, so they resolve only on the compose network
+ # ── Prometheus self-monitoring ──────────────────────────────────────
+ - job_name: prometheus
+ static_configs:
+ - targets: ["localhost:9090"]
+
+ # ── Host OS metrics (node_exporter) ────────────────────────────────
+ - job_name: node
+ static_configs:
+ - targets: ["node-exporter:9100"]
+
+ # ── Docker container metrics (cAdvisor) ────────────────────────────
+ - job_name: cadvisor
+ static_configs:
+ - targets: ["cadvisor:8080"]
+
+ # ── PostgreSQL (prod) ──────────────────────────────────────────────
+ - job_name: postgres_prod
+ static_configs:
+ - targets: ["postgres-exporter:9187"]
+ labels:
+ instance: prod  # replaces the default host:port instance label with a fixed name
+
+ # ── nginx ──────────────────────────────────────────────────────────
+ - job_name: nginx
+ static_configs:
+ - targets: ["nginx-exporter:9113"]
+
+ # ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
+ - job_name: blackbox_http  # referenced by the EndpointDown alert via job="blackbox_http"
+ metrics_path: /probe
+ params:
+ module: [http_2xx]  # module defined in monitoring/blackbox.yml
+ static_configs:
+ - targets:  # URLs to probe, not scrape addresses; the relabeling below redirects the scrape
+ - https://performancewest.net
+ - https://api.performancewest.net/api/v1/fcc/search?q=test
+ - https://dev.performancewest.net
+ - https://api.dev.performancewest.net/api/v1/fcc/search?q=test
+ - https://crm.performancewest.net
+ - https://lists.performancewest.net
+ - https://analytics.performancewest.net
+ - https://minio.performancewest.net/minio/health/live
+ - https://crypto.performancewest.net
+ - https://pay.performancewest.net
+ relabel_configs:
+ - source_labels: [__address__]  # 1. pass the listed URL as the ?target= query parameter
+ target_label: __param_target
+ - source_labels: [__param_target]  # 2. use the probed URL as the instance label
+ target_label: instance
+ - target_label: __address__  # 3. send the actual scrape to the blackbox exporter
+ replacement: blackbox-exporter:9115
+
+ # ── Blackbox TCP probes (port monitoring) ──────────────────────────
+ - job_name: blackbox_tcp  # referenced by the TCPPortDown alert via job="blackbox_tcp"
+ metrics_path: /probe
+ params:
+ module: [tcp_connect]  # module defined in monitoring/blackbox.yml
+ static_configs:
+ - targets:  # host:port pairs probed from inside the blackbox-exporter container
+ - api-postgres:5432
+ - erpnext-mariadb:3306
+ - erpnext-redis:6379
+ relabel_configs:
+ - source_labels: [__address__]  # 1. pass the listed host:port as the ?target= query parameter
+ target_label: __param_target
+ - source_labels: [__param_target]  # 2. use the probed host:port as the instance label
+ target_label: instance
+ - target_label: __address__  # 3. send the actual scrape to the blackbox exporter
+ replacement: blackbox-exporter:9115