Add deep service health monitoring for all PW dependencies
Each service gets its own Prometheus probe verifying actual functionality:

- API: /status endpoint (checks DB connectivity, returns 503 if down)
- Workers: /health endpoint (job server responsive)
- ERPNext: API method call (MariaDB + Redis + app all working)
- MinIO: /minio/health/live (storage accessible)
- Listmonk: /api/health (email service + DB)
- Ollama: root endpoint (LLM inference available)
- Umami: /api/heartbeat (analytics tracking)
- Forgejo: root page (git server accessible)
- PostgreSQL: pg_up metric from postgres-exporter
- All HTTPS endpoints: SSL + reachability from outside

Service-specific alerts with context:

- API down = DB may be unreachable
- Workers down = compliance orders not processing
- ERPNext down = CRM inaccessible
- MinIO down = document storage unavailable

Custom Grafana dashboard: "Performance West — Services Overview"

- Service status grid (UP/DOWN with colors)
- Response time charts (internal + HTTPS)
- SSL certificate expiry gauges
- Container CPU/memory per service
- PostgreSQL connections, nginx req/s, active alerts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
cc463a662f
commit
2f9005693e
3 changed files with 547 additions and 87 deletions
|
|
@ -1,38 +1,137 @@
|
|||
groups:
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Service Down Alerts
|
||||
# Performance West Core Services
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: service_down
|
||||
- name: pw_services
|
||||
rules:
|
||||
- alert: EndpointDown
|
||||
expr: probe_success{job="blackbox_http"} == 0
|
||||
- alert: PW_API_Down
|
||||
expr: probe_success{job="pw_api_prod"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: api
|
||||
annotations:
|
||||
summary: "Prod API is DOWN"
|
||||
description: "API /status endpoint failed — database may be unreachable. Check api + api-postgres containers."
|
||||
|
||||
- alert: PW_API_Dev_Down
|
||||
expr: probe_success{job="pw_api_dev"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
service: api-dev
|
||||
annotations:
|
||||
summary: "Dev API is DOWN"
|
||||
description: "Dev API /status endpoint unreachable."
|
||||
|
||||
- alert: PW_Site_Down
|
||||
expr: probe_success{job="pw_site_prod"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: site
|
||||
annotations:
|
||||
summary: "Prod website is DOWN"
|
||||
description: "performancewest.net static site is not responding."
|
||||
|
||||
- alert: PW_Workers_Down
|
||||
expr: probe_success{job="pw_workers"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: workers
|
||||
annotations:
|
||||
summary: "Workers job server is DOWN"
|
||||
description: "Python workers /health endpoint failed. Compliance orders, formation filings, and cron jobs are not being processed."
|
||||
|
||||
- alert: PW_ERPNext_Down
|
||||
expr: probe_success{job="pw_erpnext"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: erpnext
|
||||
annotations:
|
||||
summary: "ERPNext CRM is DOWN"
|
||||
description: "ERPNext API is unreachable. Sales orders, invoices, and customer records are inaccessible."
|
||||
|
||||
- alert: PW_MinIO_Down
|
||||
expr: probe_success{job="pw_minio"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: minio
|
||||
annotations:
|
||||
summary: "MinIO object storage is DOWN"
|
||||
description: "MinIO health check failed. Document uploads, RMD packets, and file storage are unavailable."
|
||||
|
||||
- alert: PW_Listmonk_Down
|
||||
expr: probe_success{job="pw_listmonk"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: listmonk
|
||||
annotations:
|
||||
summary: "Listmonk email service is DOWN"
|
||||
description: "Listmonk health endpoint failed. Email campaigns and subscriber management are unavailable."
|
||||
|
||||
- alert: PW_Ollama_Down
|
||||
expr: probe_success{job="pw_ollama"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: ollama
|
||||
annotations:
|
||||
summary: "Ollama LLM is DOWN"
|
||||
description: "Ollama not responding. AI-powered document analysis in workers will fall back to regex."
|
||||
|
||||
- alert: PW_Umami_Down
|
||||
expr: probe_success{job="pw_umami"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: umami
|
||||
annotations:
|
||||
summary: "Umami analytics is DOWN"
|
||||
description: "Analytics tracking endpoint unreachable. Site analytics not being recorded."
|
||||
|
||||
- alert: PW_Forgejo_Down
|
||||
expr: probe_success{job="pw_forgejo"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: forgejo
|
||||
annotations:
|
||||
summary: "Forgejo git server is DOWN"
|
||||
description: "Git server unreachable. Code deployments will fail."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# External HTTPS Endpoints (SSL + reachability)
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: external_endpoints
|
||||
rules:
|
||||
- alert: HTTPS_Endpoint_Down
|
||||
expr: probe_success{job="blackbox_https"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ $labels.instance }} is DOWN"
|
||||
description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes."
|
||||
description: "HTTPS probe failed for {{ $labels.instance }}. Check nginx, DNS, and SSL cert."
|
||||
|
||||
- alert: TCPPortDown
|
||||
- alert: TCP_Port_Down
|
||||
expr: probe_success{job="blackbox_tcp"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "TCP port {{ $labels.instance }} is DOWN"
|
||||
description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute."
|
||||
|
||||
- alert: ContainerDown
|
||||
expr: |
|
||||
absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
|
||||
or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} is DOWN"
|
||||
description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
|
||||
description: "Database or cache port unreachable."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Container Health
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: ContainerRestarting
|
||||
expr: increase(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
|
||||
for: 5m
|
||||
|
|
@ -40,10 +139,28 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} is restart-looping"
|
||||
description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
|
||||
description: "{{ $labels.name }} has restarted more than 2 times in 15 minutes."
|
||||
|
||||
- alert: ContainerHighCPU
|
||||
expr: rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m]) * 100 > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} high CPU ({{ $value | printf \"%.0f\" }}%)"
|
||||
description: "Container CPU usage above 80% for 10 minutes."
|
||||
|
||||
- alert: ContainerHighMemory
|
||||
expr: container_memory_usage_bytes{name=~"performancewest-.*"} / container_spec_memory_limit_bytes{name=~"performancewest-.*"} * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} high memory ({{ $value | printf \"%.0f\" }}%)"
|
||||
description: "Container using more than 85% of its memory limit."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Host Resource Alerts
|
||||
# Host Resources
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: host_resources
|
||||
rules:
|
||||
|
|
@ -53,8 +170,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)"
|
||||
description: "CPU usage has been above 85% for 10 minutes."
|
||||
summary: "High CPU ({{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: HighMemory
|
||||
expr: (1 - node_memory_AvailableBytes / node_memory_MemTotalBytes) * 100 > 90
|
||||
|
|
@ -62,8 +178,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
|
||||
description: "Memory usage has been above 90% for 5 minutes."
|
||||
summary: "High memory ({{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: DiskSpaceLow
|
||||
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
|
||||
|
|
@ -71,8 +186,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)"
|
||||
description: "Root filesystem is more than 80% full."
|
||||
summary: "Disk {{ $value | printf \"%.1f\" }}% full"
|
||||
|
||||
- alert: DiskSpaceCritical
|
||||
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
|
||||
|
|
@ -80,20 +194,10 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)"
|
||||
description: "Root filesystem is more than 92% full. Immediate action required."
|
||||
|
||||
- alert: HighLoadAverage
|
||||
expr: node_load15 > 8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High load average ({{ $value | printf \"%.1f\" }})"
|
||||
description: "15-minute load average has been above 8 for 10 minutes."
|
||||
summary: "DISK CRITICAL {{ $value | printf \"%.1f\" }}% full"
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Database Alerts
|
||||
# Database
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: database
|
||||
rules:
|
||||
|
|
@ -104,7 +208,6 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL is DOWN"
|
||||
description: "PostgreSQL exporter cannot connect to the database."
|
||||
|
||||
- alert: PostgresHighConnections
|
||||
expr: pg_stat_activity_count > 80
|
||||
|
|
@ -112,45 +215,57 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High PostgreSQL connections ({{ $value }})"
|
||||
description: "PostgreSQL active connections exceeding 80."
|
||||
summary: "PostgreSQL {{ $value }} active connections"
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# SSL Certificate Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: ssl
|
||||
rules:
|
||||
- alert: SSLCertExpiringSoon
|
||||
expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}"
|
||||
description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
|
||||
|
||||
- alert: SSLCertExpiryCritical
|
||||
expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}"
|
||||
description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Response Time Alerts
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: latency
|
||||
rules:
|
||||
- alert: SlowHTTPResponse
|
||||
expr: probe_http_duration_seconds{phase="transfer", job="blackbox_http"} > 5
|
||||
- alert: PostgresSlowQueries
|
||||
expr: pg_stat_activity_max_tx_duration > 300
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
|
||||
description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
|
||||
summary: "PostgreSQL query running > 5 minutes"
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# SSL Certificates
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: ssl
|
||||
rules:
|
||||
- alert: SSLCertExpiringSoon
|
||||
expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SSL cert for {{ $labels.instance }} expires in < 14 days"
|
||||
|
||||
- alert: SSLCertExpiryCritical
|
||||
expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "SSL cert for {{ $labels.instance }} expires in < 3 DAYS"
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Response Time
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- name: latency
|
||||
rules:
|
||||
- alert: APISlowResponse
|
||||
expr: probe_duration_seconds{job="pw_api_prod"} > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "API responding slowly ({{ $value | printf \"%.1f\" }}s)"
|
||||
|
||||
- alert: SiteSlowResponse
|
||||
expr: probe_duration_seconds{job="pw_site_prod"} > 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Website responding slowly ({{ $value | printf \"%.1f\" }}s)"
|
||||
|
||||
- alert: HighNginx5xxRate
|
||||
expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
|
||||
|
|
@ -159,4 +274,3 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "High nginx 5xx error rate"
|
||||
description: "More than 0.5 req/s returning 5xx errors."
|
||||
|
|
|
|||
|
|
@ -44,23 +44,22 @@ scrape_configs:
|
|||
static_configs:
|
||||
- targets: ["nginx-exporter:9113"]
|
||||
|
||||
# ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
|
||||
- job_name: blackbox_http
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# Performance West Service Health Probes
|
||||
# Each probe verifies the service is FUNCTIONAL, not just responding
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ── Prod API + DB (returns 503 if DB unreachable) ──────────────────
|
||||
- job_name: pw_api_prod
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://performancewest.net
|
||||
- https://api.performancewest.net/api/v1/fcc/search?q=test
|
||||
- https://dev.performancewest.net
|
||||
- https://api.dev.performancewest.net/api/v1/fcc/search?q=test
|
||||
- https://crm.performancewest.net
|
||||
- https://lists.performancewest.net
|
||||
- https://analytics.performancewest.net
|
||||
- http://minio:9000/minio/health/live
|
||||
- https://crypto.performancewest.net
|
||||
- https://pay.performancewest.net
|
||||
- http://api:3001/api/v1/status
|
||||
labels:
|
||||
service: api
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
|
|
@ -69,7 +68,203 @@ scrape_configs:
|
|||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Blackbox TCP probes (port monitoring) ──────────────────────────
|
||||
# ── Dev API + DB ───────────────────────────────────────────────────
|
||||
- job_name: pw_api_dev
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://host.docker.internal:3002/api/v1/status
|
||||
labels:
|
||||
service: api
|
||||
env: dev
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Prod Site (Astro static) ───────────────────────────────────────
|
||||
- job_name: pw_site_prod
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://site:80/
|
||||
labels:
|
||||
service: site
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Workers (Python job server) ────────────────────────────────────
|
||||
- job_name: pw_workers
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://workers:8090/health
|
||||
labels:
|
||||
service: workers
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── ERPNext CRM ────────────────────────────────────────────────────
|
||||
- job_name: pw_erpnext
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://erpnext:8000/api/method/frappe.client.get_count?doctype=Customer
|
||||
labels:
|
||||
service: erpnext
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── MinIO object storage ───────────────────────────────────────────
|
||||
- job_name: pw_minio
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://minio:9000/minio/health/live
|
||||
labels:
|
||||
service: minio
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Listmonk email marketing ───────────────────────────────────────
|
||||
- job_name: pw_listmonk
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://listmonk:9000/api/health
|
||||
labels:
|
||||
service: listmonk
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Ollama LLM ────────────────────────────────────────────────────
|
||||
- job_name: pw_ollama
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://ollama:11434/
|
||||
labels:
|
||||
service: ollama
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Umami analytics ────────────────────────────────────────────────
|
||||
- job_name: pw_umami
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://umami:3000/api/heartbeat
|
||||
labels:
|
||||
service: umami
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── Forgejo git server ─────────────────────────────────────────────
|
||||
- job_name: pw_forgejo
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://host.docker.internal:3030/
|
||||
labels:
|
||||
service: forgejo
|
||||
env: prod
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# External-facing HTTPS probes (SSL + reachability from outside)
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
- job_name: blackbox_https
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://performancewest.net
|
||||
- https://api.performancewest.net/api/v1/status
|
||||
- https://dev.performancewest.net
|
||||
- https://crm.performancewest.net
|
||||
- https://lists.performancewest.net
|
||||
- https://analytics.performancewest.net
|
||||
- https://monitoring.performancewest.net
|
||||
- https://crypto.performancewest.net
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ── TCP port probes (databases, caches) ────────────────────────────
|
||||
- job_name: blackbox_tcp
|
||||
metrics_path: /probe
|
||||
params:
|
||||
|
|
|
|||
151
monitoring/pw-services-dashboard.json
Normal file
151
monitoring/pw-services-dashboard.json
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"uid": null,
|
||||
"title": "Performance West — Services Overview",
|
||||
"tags": ["performancewest", "services"],
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Service Status",
|
||||
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
|
||||
{ "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background", "textMode": "auto" },
|
||||
"targets": [
|
||||
{ "expr": "probe_success{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_listmonk'}", "legendFormat": "Listmonk", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_ollama'}", "legendFormat": "Ollama", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_umami'}", "legendFormat": "Umami", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_success{job='pw_forgejo'}", "legendFormat": "Forgejo", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "pg_up", "legendFormat": "PostgreSQL", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Service Response Time",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "yellow" }, { "value": 3, "color": "red" }] } }
|
||||
},
|
||||
"targets": [
|
||||
{ "expr": "probe_duration_seconds{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_duration_seconds{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_duration_seconds{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_duration_seconds{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
|
||||
{ "expr": "probe_duration_seconds{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "HTTPS Endpoint Response Time",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 2, "color": "yellow" }, { "value": 5, "color": "red" }] } }
|
||||
},
|
||||
"targets": [
|
||||
{ "expr": "probe_duration_seconds{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "HTTPS Endpoints",
|
||||
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 12 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
|
||||
{ "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
|
||||
}
|
||||
},
|
||||
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background" },
|
||||
"targets": [
|
||||
{ "expr": "probe_success{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "gauge",
|
||||
"title": "SSL Certificate Days Remaining",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "d",
|
||||
"min": 0, "max": 90,
|
||||
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 7, "color": "orange" }, { "value": 14, "color": "yellow" }, { "value": 30, "color": "green" }] }
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{ "expr": "(probe_ssl_earliest_cert_expiry{job='blackbox_https'} - time()) / 86400", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Container CPU Usage",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
|
||||
"fieldConfig": { "defaults": { "unit": "percentunit" } },
|
||||
"targets": [
|
||||
{ "expr": "rate(container_cpu_usage_seconds_total{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama)-1'}[5m])", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Container Memory Usage",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
|
||||
"fieldConfig": { "defaults": { "unit": "bytes" } },
|
||||
"targets": [
|
||||
{ "expr": "container_memory_usage_bytes{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama|api-postgres)-1'}", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "PostgreSQL",
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 30 },
|
||||
"targets": [
|
||||
{ "expr": "pg_stat_activity_count", "legendFormat": "Active Connections", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "nginx Requests/sec",
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 30 },
|
||||
"fieldConfig": { "defaults": { "unit": "reqps" } },
|
||||
"targets": [
|
||||
{ "expr": "rate(nginx_http_requests_total[5m])", "legendFormat": "req/s", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Active Alerts",
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 30 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "orange" }, { "value": 3, "color": "red" }] }
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{ "expr": "ALERTS{alertstate='firing'}", "legendFormat": "{{ alertname }}", "datasource": { "type": "prometheus" } }
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39
|
||||
},
|
||||
"overwrite": true,
|
||||
"folderId": 0
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue