From a4a5500bfc4a7c25bd903c9e794cbbb68c87b7d3 Mon Sep 17 00:00:00 2001
From: justin
Date: Fri, 1 May 2026 02:08:39 -0500
Subject: [PATCH] Add Prometheus + Grafana + Alertmanager monitoring stack

Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints:
performancewest.net, api, dev, crm, lists, analytics, minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat,
auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Reviewer notes (ignored by git-am, not part of the commit message):
- alertmanager service: Alertmanager does not expand environment variables
  in its configuration file, so the TELEGRAM_* placeholders are rendered
  with sed in an entrypoint wrapper before the daemon starts.
- The docker-compose.yml hunk grew by 10 lines; hunk headers and diffstat
  are adjusted, the blob ids on the "index" line are not recomputed.

 docker-compose.yml                            | 109 ++++++++++++
 infra/ansible/ansible.cfg                     |   4 +
 infra/ansible/inventory/group_vars/all.yml    |   1 +
 infra/ansible/playbooks/site.yml              |   1 +
 .../roles/monitoring/defaults/main.yml        |  13 ++
 .../roles/monitoring/handlers/main.yml        |   5 +
 infra/ansible/roles/monitoring/tasks/main.yml |  91 ++++++++++
 .../nginx/templates/pw-monitoring-tls.conf.j2 |  58 +++++++
 monitoring/alert_rules.yml                    | 162 ++++++++++++++++++
 monitoring/alertmanager.yml                   |  40 +++++
 monitoring/blackbox.yml                       |  15 ++
 monitoring/grafana-datasources.yml            |   9 +
 monitoring/prometheus.yml                     |  83 +++++++++
 13 files changed, 591 insertions(+)
 create mode 100644 infra/ansible/ansible.cfg
 create mode 100644 infra/ansible/roles/monitoring/defaults/main.yml
 create mode 100644 infra/ansible/roles/monitoring/handlers/main.yml
 create mode 100644 infra/ansible/roles/monitoring/tasks/main.yml
 create mode 100644 infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
 create mode 100644 monitoring/alert_rules.yml
 create mode 100644 monitoring/alertmanager.yml
 create mode 100644 monitoring/blackbox.yml
 create mode 100644 monitoring/grafana-datasources.yml
 create mode 100644 monitoring/prometheus.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index f8b4297..10ab469 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -285,6 +285,113 @@ services:
       - umami-pgdata:/var/lib/postgresql/data
     restart: unless-stopped
 
+  # ── Monitoring Stack ────────────────────────────────────────────────
+  prometheus:
+    image: prom/prometheus:latest
+    ports:
+      - "127.0.0.1:9090:9090"
+    volumes:
+      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
+      - prometheus-data:/prometheus
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.retention.time=90d
+      - --web.enable-lifecycle
+    restart: unless-stopped
+
+  grafana:
+    image: grafana/grafana:latest
+    ports:
+      - "127.0.0.1:3200:3000"
+    environment:
+      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-pw_grafana_2026}
+      - GF_SERVER_ROOT_URL=https://monitoring.performancewest.net
+      - GF_SERVER_DOMAIN=monitoring.performancewest.net
+      - GF_SMTP_ENABLED=true
+      - GF_SMTP_HOST=${SMTP_HOST}:${SMTP_PORT}
+      - GF_SMTP_USER=${SMTP_USER}
+      - GF_SMTP_PASSWORD=${SMTP_PASS}
+      - GF_SMTP_FROM_ADDRESS=noreply@performancewest.net
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_AUTH_ANONYMOUS_ENABLED=false
+    volumes:
+      - grafana-data:/var/lib/grafana
+      - ./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
+    depends_on:
+      - prometheus
+    restart: unless-stopped
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    ports:
+      - "127.0.0.1:9093:9093"
+    volumes:
+      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml.tmpl:ro
+    # Alertmanager never expands environment variables in its config file, so
+    # the TELEGRAM_* placeholders are substituted here before it starts.
+    # "$$" is the Compose escape for "$"; "[$]" matches a literal dollar sign.
+    entrypoint:
+      - /bin/sh
+      - -c
+      - |
+        sed -e "s|[$$]{TELEGRAM_BOT_TOKEN}|$${TELEGRAM_BOT_TOKEN}|g" \
+            -e "s|[$$]{TELEGRAM_CHAT_ID}|$${TELEGRAM_CHAT_ID}|g" \
+            /etc/alertmanager/alertmanager.yml.tmpl > /alertmanager/alertmanager.yml \
+          && exec /bin/alertmanager \
+            --config.file=/alertmanager/alertmanager.yml \
+            --storage.path=/alertmanager
+    environment:
+      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
+      - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
+    restart: unless-stopped
+
+  node-exporter:
+    image: prom/node-exporter:latest
+    command:
+      - --path.rootfs=/host
+      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
+    volumes:
+      - /:/host:ro,rslave
+    pid: host
+    restart: unless-stopped
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    devices:
+      - /dev/kmsg
+    privileged: true
+    restart: unless-stopped
+
+  postgres-exporter:
+    image: prometheuscommunity/postgres-exporter:latest
+    environment:
+      - DATA_SOURCE_NAME=postgresql://pw:${DB_PASSWORD:-pw_dev_2026}@api-postgres:5432/performancewest?sslmode=disable
+    depends_on:
+      - api-postgres
+    restart: unless-stopped
+
+  nginx-exporter:
+    image: nginx/nginx-prometheus-exporter:latest
+    command:
+      - -nginx.scrape-uri=http://host.docker.internal:80/nginx_status
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    restart: unless-stopped
+
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    volumes:
+      - ./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
+    restart: unless-stopped
+
 volumes:
   api-pgdata:
   worker-data:
@@ -297,3 +404,5 @@ volumes:
   erpnext-mariadb-data:
   listmonk-uploads:
   umami-pgdata:
+  prometheus-data:
+  grafana-data:
diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg
new file mode 100644
index 0000000..7f78352
--- /dev/null
+++ b/infra/ansible/ansible.cfg
@@ -0,0 +1,4 @@
+[defaults]
+roles_path = ./roles
+inventory = ./inventory/hosts.yml
+host_key_checking = False
diff --git a/infra/ansible/inventory/group_vars/all.yml b/infra/ansible/inventory/group_vars/all.yml
index 83497a0..1e78060
100644
--- a/infra/ansible/inventory/group_vars/all.yml
+++ b/infra/ansible/inventory/group_vars/all.yml
@@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
 shkeeper_admin_domain: crypto.performancewest.net
 minio_domain: minio.performancewest.net
 minio_console_domain: minio-console.performancewest.net
+monitoring_domain: monitoring.performancewest.net
 
 # Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
 docserver_ip: 108.181.102.34
diff --git a/infra/ansible/playbooks/site.yml b/infra/ansible/playbooks/site.yml
index c66031c..02500fc 100644
--- a/infra/ansible/playbooks/site.yml
+++ b/infra/ansible/playbooks/site.yml
@@ -32,4 +32,5 @@
     - worker-crons
     - shkeeper
     - nginx
+    - monitoring
     - security-updates
diff --git a/infra/ansible/roles/monitoring/defaults/main.yml b/infra/ansible/roles/monitoring/defaults/main.yml
new file mode 100644
index 0000000..9540dec
--- /dev/null
+++ b/infra/ansible/roles/monitoring/defaults/main.yml
@@ -0,0 +1,13 @@
+---
+monitoring_domain: monitoring.performancewest.net
+grafana_port: 3200
+prometheus_port: 9090
+alertmanager_port: 9093
+
+# Telegram bot for alerts (set in vault)
+telegram_bot_token: "{{ vault_telegram_bot_token | default('') }}"
+telegram_chat_id: "{{ vault_telegram_chat_id | default('') }}"
+
+# Grafana admin credentials (set in vault)
+grafana_admin_user: "{{ vault_grafana_admin_user | default('admin') }}"
+grafana_admin_password: "{{ vault_grafana_admin_password | default('pw_grafana_2026') }}"
diff --git a/infra/ansible/roles/monitoring/handlers/main.yml b/infra/ansible/roles/monitoring/handlers/main.yml
new file mode 100644
index 0000000..7419154
--- /dev/null
+++ b/infra/ansible/roles/monitoring/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Reload nginx
+  ansible.builtin.systemd:
+    name: nginx
+    state: reloaded
diff --git a/infra/ansible/roles/monitoring/tasks/main.yml b/infra/ansible/roles/monitoring/tasks/main.yml
new file mode 100644
index 0000000..f7a5aa6
--- /dev/null
+++
b/infra/ansible/roles/monitoring/tasks/main.yml
@@ -0,0 +1,91 @@
+---
+# ══════════════════════════════════════════════════════════════════════════════
+# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
+# ══════════════════════════════════════════════════════════════════════════════
+
+# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
+- name: Enable nginx stub_status endpoint
+  ansible.builtin.copy:
+    content: |
+      server {
+          listen 80;
+          server_name 127.0.0.1 host.docker.internal;  # exporter sends Host: host.docker.internal
+          location /nginx_status {
+              stub_status;
+              allow 127.0.0.1;
+              allow 172.16.0.0/12;
+              deny all;
+          }
+      }
+    dest: /etc/nginx/conf.d/stub-status.conf
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Reload nginx
+
+# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
+- name: Deploy Grafana nginx config
+  ansible.builtin.template:
+    src: ../../nginx/templates/pw-monitoring-tls.conf.j2
+    dest: /etc/nginx/sites-available/pw-monitoring.conf
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Reload nginx
+
+- name: Enable Grafana nginx config
+  ansible.builtin.file:
+    src: /etc/nginx/sites-available/pw-monitoring.conf
+    dest: /etc/nginx/sites-enabled/pw-monitoring.conf
+    state: link
+  notify: Reload nginx
+
+# ── 3. Obtain TLS certificate ────────────────────────────────────────
+- name: Check if monitoring cert exists
+  ansible.builtin.stat:
+    path: /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem
+  register: monitoring_cert
+
+- name: Obtain Let's Encrypt cert for monitoring domain
+  ansible.builtin.command:
+    cmd: >
+      certbot certonly --webroot -w {{ certbot_webroot }}
+      -d {{ monitoring_domain }}
+      --non-interactive --agree-tos
+      --email {{ certbot_email }}
+  when: not monitoring_cert.stat.exists
+  notify: Reload nginx
+
+# ── 4. Set env vars for Telegram in .env ─────────────────────────────
+- name: Ensure Telegram vars in .env
+  ansible.builtin.lineinfile:
+    path: "{{ project_dir }}/.env"
+    regexp: "^{{ item.key }}="
+    line: "{{ item.key }}={{ item.value }}"
+    state: present
+  loop:
+    - { key: "TELEGRAM_BOT_TOKEN", value: "{{ telegram_bot_token }}" }
+    - { key: "TELEGRAM_CHAT_ID", value: "{{ telegram_chat_id }}" }
+    - { key: "GRAFANA_ADMIN_USER", value: "{{ grafana_admin_user }}" }
+    - { key: "GRAFANA_ADMIN_PASSWORD", value: "{{ grafana_admin_password }}" }
+  when: item.value | string | length > 0  # evaluated per item: only non-empty values are written
+  no_log: true
+
+# ── 5. UFW rules ─────────────────────────────────────────────────────
+- name: Allow Grafana from localhost only
+  community.general.ufw:
+    rule: allow
+    port: "{{ grafana_port }}"
+    proto: tcp
+    from_ip: 127.0.0.1
+    comment: "Grafana (via nginx)"
+
+# ── 6. Start monitoring stack ────────────────────────────────────────
+- name: Start monitoring containers
+  ansible.builtin.command:  # no shell features needed; chdir selects the compose project
+    cmd: >
+      docker compose up -d
+      prometheus grafana alertmanager
+      node-exporter cadvisor postgres-exporter nginx-exporter blackbox-exporter
+    chdir: "{{ project_dir }}"
+  changed_when: true
diff --git a/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2 b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
new file mode 100644
index 0000000..c4078d5
--- /dev/null
+++ b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
@@ -0,0 +1,58 @@
+# {{ ansible_managed }}
+# HTTPS config for monitoring.performancewest.net (Grafana)
+
+# Redirect HTTP -> HTTPS
+server {
+    listen 80;
+    server_name {{ monitoring_domain }};
+
+    location /.well-known/acme-challenge/ {
+        root {{ certbot_webroot }};
+    }
+
+    location / {
+        return 301 https://{{ monitoring_domain }}$request_uri;
+    }
+}
+
+# Grafana dashboard
+server {
+    listen 443 ssl;
+    http2 on;
+    server_name {{ monitoring_domain }};
+
+    ssl_certificate
/etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;  # same variable the certbot task uses for -d
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers HIGH:!aNULL:!MD5;
+    ssl_prefer_server_ciphers on;
+    ssl_session_cache shared:SSL:10m;
+    ssl_session_timeout 10m;
+
+    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
+
+    include /etc/nginx/snippets/pw-security.conf;
+
+    client_max_body_size 10m;
+
+    location / {
+        proxy_pass http://127.0.0.1:3200;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+
+        # WebSocket for Grafana Live
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+
+        proxy_connect_timeout 10s;
+        proxy_send_timeout 60s;
+        proxy_read_timeout 60s;
+    }
+
+    location /.well-known/acme-challenge/ {
+        root {{ certbot_webroot }};
+    }
+}
diff --git a/monitoring/alert_rules.yml b/monitoring/alert_rules.yml
new file mode 100644
index 0000000..8f84920
--- /dev/null
+++ b/monitoring/alert_rules.yml
@@ -0,0 +1,162 @@
+groups:
+  # ══════════════════════════════════════════════════════════════════════
+  # Service Down Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: service_down
+    rules:
+      - alert: EndpointDown
+        expr: probe_success{job="blackbox_http"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "{{ $labels.instance }} is DOWN"
+          description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."
+
+      - alert: TCPPortDown
+        expr: probe_success{job="blackbox_tcp"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "TCP port {{ $labels.instance }} is DOWN"
+          description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."
+
+      - alert: ContainerDown
+        expr: |
+          absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
+          or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container {{ $labels.name }} is DOWN"
+          description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
+
+      - alert: ContainerRestarting
+        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2  # start time is a timestamp gauge: count its changes
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container {{ $labels.name }} is restart-looping"
+          description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Host Resource Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: host_resources
+    rules:
+      - alert: HighCPU
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
+          description: "CPU usage has been above 85% for 10 minutes."
+
+      - alert: HighMemory
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90  # node_exporter metric names
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
+          description: "Memory usage has been above 90% for 5 minutes."
+
+      - alert: DiskSpaceLow
+        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
+          description: "Root filesystem is more than 80% full."
+
+      - alert: DiskSpaceCritical
+        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
+          description: "Root filesystem is more than 92% full. Immediate action required."
+
+      - alert: HighLoadAverage
+        expr: node_load15 > 8
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High load average ({{ $value | printf \"%.1f\" }})"
+          description: "15-minute load average has been above 8 for 10 minutes."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Database Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: database
+    rules:
+      - alert: PostgresDown
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is DOWN"
+          description: "PostgreSQL exporter cannot connect to the database."
+
+      - alert: PostgresHighConnections
+        expr: sum by (instance) (pg_stat_activity_count) > 80  # metric is per database/state; alert on the server-wide total
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High PostgreSQL connections ({{ $value }})"
+          description: "PostgreSQL active connections exceeding 80."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # SSL Certificate Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: ssl
+    rules:
+      - alert: SSLCertExpiringSoon
+        expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}"
+          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
+
+      - alert: SSLCertExpiryCritical
+        expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}"
+          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Response Time Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: latency
+    rules:
+      - alert: SlowHTTPResponse
+        expr: probe_duration_seconds{job="blackbox_http"} > 5  # whole probe, not only the body-transfer phase
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
+          description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
+
+      - alert: HighNginx5xxRate
+        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5  # NOTE(review): stub_status exporter appears to expose no status label; confirm this can match
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High nginx 5xx error rate"
+          description: "More than 0.5 req/s returning 5xx errors."
diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml
new file mode 100644
index 0000000..82907c2
--- /dev/null
+++ b/monitoring/alertmanager.yml
@@ -0,0 +1,40 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: telegram
+  group_by: [alertname, instance]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    - matchers:  # replaces the deprecated "match" form
+        - severity="critical"
+      receiver: telegram
+      repeat_interval: 1h
+    - matchers:
+        - severity="warning"
+      receiver: telegram
+      repeat_interval: 6h
+
+receivers:
+  - name: telegram
+    telegram_configs:
+      - bot_token: "${TELEGRAM_BOT_TOKEN}"  # NOTE(review): Alertmanager does not expand env vars itself; both placeholders must be substituted before load
+        chat_id: ${TELEGRAM_CHAT_ID}
+        parse_mode: HTML
+        message: |
+          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} {{ .Status | toUpper }}
+          {{ range .Alerts }}
+          {{ .Labels.alertname }}
+          {{ .Annotations.summary }}
+          {{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}
+          {{ end }}
+          Server: pw-server | {{ .ExternalURL }}
+
+inhibit_rules:
+  - source_matchers:  # replaces the deprecated source_match / target_match
+      - severity="critical"
+    target_matchers:
+      - severity="warning"
+    equal: [alertname, instance]
diff --git a/monitoring/blackbox.yml b/monitoring/blackbox.yml
new file mode 100644
index 0000000..ab91b22
--- /dev/null
+++ b/monitoring/blackbox.yml
@@ -0,0 +1,15 @@
+modules:
+  http_2xx:
+    prober: http
+    timeout: 10s
+    http:
+      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+      valid_status_codes: [200, 301, 302]
+      follow_redirects: true
+      preferred_ip_protocol: ip4
+      tls_config:
+        insecure_skip_verify: false
+
+  tcp_connect:
+    prober: tcp
+    timeout: 5s
diff --git a/monitoring/grafana-datasources.yml b/monitoring/grafana-datasources.yml
new file mode 100644
index 0000000..bb009bb
--- /dev/null
+++ b/monitoring/grafana-datasources.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
new file mode 100644
index 0000000..88c2dcc
--- /dev/null
+++
b/monitoring/prometheus.yml
@@ -0,0 +1,83 @@
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+
+rule_files:
+  - /etc/prometheus/alert_rules.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
+scrape_configs:
+  # ── Prometheus self-monitoring ──────────────────────────────────────
+  - job_name: prometheus
+    static_configs:
+      - targets: ["localhost:9090"]
+
+  # ── Host OS metrics (node_exporter) ────────────────────────────────
+  - job_name: node
+    static_configs:
+      - targets: ["node-exporter:9100"]
+
+  # ── Docker container metrics (cAdvisor) ────────────────────────────
+  - job_name: cadvisor
+    static_configs:
+      - targets: ["cadvisor:8080"]
+
+  # ── PostgreSQL (prod) ──────────────────────────────────────────────
+  - job_name: postgres_prod
+    static_configs:
+      - targets: ["postgres-exporter:9187"]
+        labels:
+          instance: prod
+
+  # ── nginx ──────────────────────────────────────────────────────────
+  - job_name: nginx
+    static_configs:
+      - targets: ["nginx-exporter:9113"]
+
+  # ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
+  - job_name: blackbox_http
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - https://performancewest.net
+          - https://api.performancewest.net/api/v1/fcc/search?q=test
+          - https://dev.performancewest.net
+          - https://api.dev.performancewest.net/api/v1/fcc/search?q=test
+          - https://crm.performancewest.net
+          - https://lists.performancewest.net
+          - https://analytics.performancewest.net
+          - https://minio.performancewest.net/minio/health/live
+          - https://crypto.performancewest.net
+          - https://pay.performancewest.net
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115
+
+  # ── Blackbox TCP probes (port monitoring) ──────────────────────────
+  - job_name: blackbox_tcp
+    metrics_path: /probe
+    params:
+      module: [tcp_connect]
+    static_configs:
+      - targets:
+          - api-postgres:5432
+          - erpnext-mariadb:3306
+          - erpnext-redis:6379
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115