new-site/monitoring/prometheus.yml
justin f856434642 Fix service probes: correct endpoints and permissive HTTP module
- Workers: use http_internal module (HTTP/1.0 SimpleHTTPServer)
- ERPNext: use /api/method/ping, accept 401/403 (still means alive)
- Listmonk: use /health not /api/health (403 without auth)
- Forgejo: port 3000 not 3030
- Dev API: probe via HTTPS public URL (blackbox can't reach Docker)
- Added http_internal blackbox module accepting HTTP/1.0 + 401/403

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-01 03:33:48 -05:00

283 lines
10 KiB
YAML

# Server-wide defaults: scrape targets and evaluate rules every 30 seconds.
global:
  evaluation_interval: '30s'
  scrape_interval: '30s'

# Rule file(s) loaded by this Prometheus instance.
rule_files: ['/etc/prometheus/alert_rules.yml']

# Alertmanager instance(s) that receive fired alerts.
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['alertmanager:9093']
scrape_configs:
# ── Prometheus self-monitoring ──────────────────────────────────────
- job_name: prometheus
  # Prometheus scrapes its own metrics endpoint.
  static_configs:
  - targets:
    - 'localhost:9090'
# ── Host OS metrics (node_exporter) ────────────────────────────────
- job_name: node
  # Single node_exporter endpoint supplying host-level metrics.
  static_configs:
  - targets:
    - 'node-exporter:9100'
# ── Docker container metrics (cAdvisor) ────────────────────────────
- job_name: cadvisor
  # Single cAdvisor endpoint supplying per-container metrics.
  static_configs:
  - targets:
    - 'cadvisor:8080'
# ── PostgreSQL (prod) ──────────────────────────────────────────────
- job_name: postgres_prod
  static_configs:
  - targets:
    - 'postgres-exporter:9187'
    # The instance label is pinned to the fixed value "prod" for this target
    # group instead of being derived from the exporter address.
    labels:
      instance: 'prod'
# ── Alertmanager ────────────────────────────────────────────────────
- job_name: alertmanager
  # Scrapes Alertmanager's own metrics on alertmanager:9093.
  static_configs:
  - targets:
    - 'alertmanager:9093'
# ── nginx ──────────────────────────────────────────────────────────
- job_name: nginx
  # nginx metrics are collected through the nginx-exporter sidecar endpoint.
  static_configs:
  - targets:
    - 'nginx-exporter:9113'
# ══════════════════════════════════════════════════════════════════════
# Performance West Service Health Probes
# Each probe verifies the service is FUNCTIONAL, not just responding
# ══════════════════════════════════════════════════════════════════════
# ── Prod API + DB (returns 503 if DB unreachable) ──────────────────
- job_name: pw_api_prod
  # HTTP probe of the prod API status endpoint through the blackbox exporter,
  # using the http_2xx module (module definition lives in the blackbox config).
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'api'
    targets:
    - 'http://api:3001/api/v1/status'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Dev API + DB ───────────────────────────────────────────────────
- job_name: pw_api_dev
  # HTTP probe of the dev API status endpoint. Unlike the prod job this goes
  # through the public HTTPS hostname rather than a container name
  # (NOTE(review): reportedly because blackbox cannot reach the dev containers
  # directly — confirm).
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'dev'
      service: 'api'
    targets:
    - 'https://api.dev.performancewest.net/api/v1/status'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Prod Site (Astro static) ───────────────────────────────────────
- job_name: pw_site_prod
  # HTTP probe of the prod site root page through the blackbox exporter.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'site'
    targets:
    - 'http://site:80/'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Workers (Python job server — HTTP/1.0 SimpleHTTPServer) ────────
- job_name: pw_workers
  # HTTP probe of the workers /health endpoint. Uses the http_internal module
  # instead of http_2xx (NOTE(review): http_internal is expected to tolerate
  # HTTP/1.0 replies — confirm against the blackbox module config).
  metrics_path: '/probe'
  params:
    module:
    - http_internal
  static_configs:
  - labels:
      env: 'prod'
      service: 'workers'
    targets:
    - 'http://workers:8090/health'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── ERPNext CRM (ping endpoint; 401/403 still counts as alive) ─────
- job_name: pw_erpnext
  # HTTP probe of ERPNext's /api/method/ping endpoint via the http_internal
  # module (NOTE(review): that module reportedly accepts 401/403 as "alive" —
  # confirm against the blackbox module config).
  metrics_path: '/probe'
  params:
    module:
    - http_internal
  static_configs:
  - labels:
      env: 'prod'
      service: 'erpnext'
    targets:
    - 'http://erpnext:8000/api/method/ping'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── MinIO object storage ───────────────────────────────────────────
- job_name: pw_minio
  # HTTP probe of MinIO's /minio/health/live endpoint through blackbox.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'minio'
    targets:
    - 'http://minio:9000/minio/health/live'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Listmonk email marketing ───────────────────────────────────────
- job_name: pw_listmonk
  # HTTP probe of Listmonk's /health endpoint through blackbox.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'listmonk'
    targets:
    - 'http://listmonk:9000/health'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Ollama LLM ────────────────────────────────────────────────────
- job_name: pw_ollama
  # HTTP probe of the Ollama root URL through blackbox.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'ollama'
    targets:
    - 'http://ollama:11434/'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Umami analytics ────────────────────────────────────────────────
- job_name: pw_umami
  # HTTP probe of Umami's /api/heartbeat endpoint through blackbox.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'umami'
    targets:
    - 'http://umami:3000/api/heartbeat'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── Forgejo git server ─────────────────────────────────────────────
- job_name: pw_forgejo
  # HTTP probe of the Forgejo root URL. The target is addressed as
  # host.docker.internal:3000 rather than by a container name.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - labels:
      env: 'prod'
      service: 'forgejo'
    targets:
    - 'http://host.docker.internal:3000/'
  # Blackbox relabelling: the listed URL becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ══════════════════════════════════════════════════════════════════════
# External-facing HTTPS probes (SSL + reachability from outside)
# ══════════════════════════════════════════════════════════════════════
- job_name: blackbox_https
  # Probes every public HTTPS hostname with the http_2xx module. No extra
  # service/env labels are attached; each series is identified by the instance
  # label, which is set to the probed URL below.
  metrics_path: '/probe'
  params:
    module:
    - http_2xx
  static_configs:
  - targets:
    - 'https://performancewest.net'
    - 'https://api.performancewest.net/api/v1/status'
    - 'https://dev.performancewest.net'
    - 'https://crm.performancewest.net'
    - 'https://lists.performancewest.net'
    - 'https://analytics.performancewest.net'
    - 'https://monitoring.performancewest.net'
    - 'https://crypto.performancewest.net'
  # Blackbox relabelling: each listed URL becomes the ?target= parameter and
  # the instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__
# ── TCP port probes (databases, caches) ────────────────────────────
- job_name: blackbox_tcp
  # TCP probes of the listed host:port pairs using the tcp_connect module
  # (module definition lives in the blackbox config).
  metrics_path: '/probe'
  params:
    module:
    - tcp_connect
  static_configs:
  - targets:
    - 'api-postgres:5432'
    - 'erpnext-mariadb:3306'
    - 'erpnext-redis:6379'
  # Blackbox relabelling: each host:port becomes the ?target= parameter and the
  # instance label; the scrape itself is sent to the exporter.
  relabel_configs:
  - target_label: __param_target
    source_labels:
    - __address__
  - target_label: instance
    source_labels:
    - __param_target
  - replacement: 'blackbox-exporter:9115'
    target_label: __address__