# Each service gets its own Prometheus probe verifying actual functionality:
#   - API: /status endpoint (checks DB connectivity, returns 503 if down)
#   - Workers: /health endpoint (job server responsive)
#   - ERPNext: API method call (MariaDB + Redis + app all working)
#   - MinIO: /minio/health/live (storage accessible)
#   - Listmonk: /api/health (email service + DB)
#   - Ollama: root endpoint (LLM inference available)
#   - Umami: /api/heartbeat (analytics tracking)
#   - Forgejo: root page (git server accessible)
#   - PostgreSQL: pg_up metric from postgres-exporter
#   - All HTTPS endpoints: SSL + reachability from outside
#
# Service-specific alerts with context:
#   - API down = DB may be unreachable
#   - Workers down = compliance orders not processing
#   - ERPNext down = CRM inaccessible
#   - MinIO down = document storage unavailable
#
# Custom Grafana dashboard: "Performance West — Services Overview"
#   - Service status grid (UP/DOWN with colors)
#   - Response time charts (internal + HTTPS)
#   - SSL certificate expiry gauges
#   - Container CPU/memory per service
#   - PostgreSQL connections, nginx req/s, active alerts
#
# Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
276 lines
11 KiB
YAML
276 lines
11 KiB
YAML
groups:
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# Performance West Core Services
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: pw_services
  # One availability rule per internal service. Each rule fires once the
  # probe for its job has reported probe_success == 0 for the whole `for`
  # window; `severity` separates critical services from warnings.
  rules:
  # --- critical: customer-facing / order-processing services ---------------
  - alert: PW_API_Down
    expr: 'probe_success{job="pw_api_prod"} == 0'
    for: 1m
    labels:
      severity: critical
      service: api
    annotations:
      summary: 'Prod API is DOWN'
      description: >-
        API /status endpoint failed — database may be unreachable.
        Check api + api-postgres containers.

  - alert: PW_API_Dev_Down
    expr: 'probe_success{job="pw_api_dev"} == 0'
    for: 3m
    labels:
      severity: warning
      service: api-dev
    annotations:
      summary: 'Dev API is DOWN'
      description: 'Dev API /status endpoint unreachable.'

  - alert: PW_Site_Down
    expr: 'probe_success{job="pw_site_prod"} == 0'
    for: 1m
    labels:
      severity: critical
      service: site
    annotations:
      summary: 'Prod website is DOWN'
      description: 'performancewest.net static site is not responding.'

  - alert: PW_Workers_Down
    expr: 'probe_success{job="pw_workers"} == 0'
    for: 2m
    labels:
      severity: critical
      service: workers
    annotations:
      summary: 'Workers job server is DOWN'
      description: >-
        Python workers /health endpoint failed.
        Compliance orders, formation filings, and cron jobs
        are not being processed.

  - alert: PW_ERPNext_Down
    expr: 'probe_success{job="pw_erpnext"} == 0'
    for: 2m
    labels:
      severity: critical
      service: erpnext
    annotations:
      summary: 'ERPNext CRM is DOWN'
      description: >-
        ERPNext API is unreachable.
        Sales orders, invoices, and customer records are inaccessible.

  - alert: PW_MinIO_Down
    expr: 'probe_success{job="pw_minio"} == 0'
    for: 2m
    labels:
      severity: critical
      service: minio
    annotations:
      summary: 'MinIO object storage is DOWN'
      description: >-
        MinIO health check failed.
        Document uploads, RMD packets, and file storage are unavailable.

  # --- warning: supporting services ----------------------------------------
  - alert: PW_Listmonk_Down
    expr: 'probe_success{job="pw_listmonk"} == 0'
    for: 5m
    labels:
      severity: warning
      service: listmonk
    annotations:
      summary: 'Listmonk email service is DOWN'
      description: >-
        Listmonk health endpoint failed.
        Email campaigns and subscriber management are unavailable.

  - alert: PW_Ollama_Down
    expr: 'probe_success{job="pw_ollama"} == 0'
    for: 5m
    labels:
      severity: warning
      service: ollama
    annotations:
      summary: 'Ollama LLM is DOWN'
      description: >-
        Ollama not responding.
        AI-powered document analysis in workers will fall back to regex.

  - alert: PW_Umami_Down
    expr: 'probe_success{job="pw_umami"} == 0'
    for: 5m
    labels:
      severity: warning
      service: umami
    annotations:
      summary: 'Umami analytics is DOWN'
      description: >-
        Analytics tracking endpoint unreachable.
        Site analytics not being recorded.

  - alert: PW_Forgejo_Down
    expr: 'probe_success{job="pw_forgejo"} == 0'
    for: 5m
    labels:
      severity: warning
      service: forgejo
    annotations:
      summary: 'Forgejo git server is DOWN'
      description: 'Git server unreachable. Code deployments will fail.'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# External HTTPS Endpoints (SSL + reachability)
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: external_endpoints
  # Generic probe-failure rules for the blackbox_https and blackbox_tcp jobs.
  # The failing target is reported through the `instance` label.
  rules:
  - alert: HTTPS_Endpoint_Down
    expr: 'probe_success{job="blackbox_https"} == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{{ $labels.instance }} is DOWN'
      description: >-
        HTTPS probe failed for {{ $labels.instance }}.
        Check nginx, DNS, and SSL cert.

  - alert: TCP_Port_Down
    expr: 'probe_success{job="blackbox_tcp"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'TCP port {{ $labels.instance }} is DOWN'
      description: 'Database or cache port unreachable.'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# Container Health
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: containers
  # Per-container alerts for every container whose `name` label matches
  # "performancewest-.*".
  rules:
  # changes() counts how many times the start-time gauge took a new value in
  # the window, i.e. the number of restarts. increase() is only meaningful
  # for counters: applied to this timestamp gauge it returns the number of
  # seconds between two start times, so a single restart already exceeded
  # the "> 2" threshold.
  - alert: ContainerRestarting
    expr: 'changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Container {{ $labels.name }} is restart-looping'
      description: '{{ $labels.name }} has restarted more than 2 times in 15 minutes.'

  # rate() of CPU-seconds is "cores in use"; * 100 expresses it as a
  # percentage of one core, evaluated per returned series.
  - alert: ContainerHighCPU
    expr: 'rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m]) * 100 > 80'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Container {{ $labels.name }} high CPU ({{ $value | printf "%.0f" }}%)'
      description: 'Container CPU usage above 80% for 10 minutes.'

  # The "> 0" filter drops series whose reported limit is 0. Dividing by 0
  # yields +Inf, which would otherwise satisfy "> 85" permanently.
  # NOTE(review): a limit of 0 presumably means "no memory limit configured"
  # for that container — confirm against the deployed cAdvisor version.
  - alert: ContainerHighMemory
    expr: >-
      container_memory_usage_bytes{name=~"performancewest-.*"}
      / (container_spec_memory_limit_bytes{name=~"performancewest-.*"} > 0)
      * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Container {{ $labels.name }} high memory ({{ $value | printf "%.0f" }}%)'
      description: 'Container using more than 85% of its memory limit.'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# Host Resources
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: host_resources
  # Host-level CPU, memory and root-filesystem alerts built on node_* metrics.
  rules:
  # 100 minus the average idle percentage across all CPUs of an instance.
  - alert: HighCPU
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU ({{ $value | printf "%.1f" }}%)'

  # Uses the node_exporter >= 0.16 metric names, matching the *_seconds_total
  # and *_bytes names used by the other rules in this group. The previous
  # names (node_memory_AvailableBytes / node_memory_MemTotalBytes) are not
  # exported by node_exporter, so the expression returned no series and the
  # alert could never fire.
  - alert: HighMemory
    expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory ({{ $value | printf "%.1f" }}%)'

  # Percentage of the "/" filesystem in use; warning tier.
  - alert: DiskSpaceLow
    expr: >-
      (1 - node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Disk {{ $value | printf "%.1f" }}% full'

  # Same expression as DiskSpaceLow with a higher threshold and shorter wait.
  - alert: DiskSpaceCritical
    expr: >-
      (1 - node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'DISK CRITICAL {{ $value | printf "%.1f" }}% full'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# Database
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: database
  # PostgreSQL availability, connection-count and long-transaction alerts
  # based on pg_* metrics.
  rules:
  - alert: PostgresDown
    expr: 'pg_up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'PostgreSQL is DOWN'

  # NOTE(review): with the stock postgres_exporter this metric is presumably
  # split by datname/state, so the threshold applies to each series rather
  # than to the server-wide total — confirm whether
  # `sum by (instance) (...)` was intended.
  - alert: PostgresHighConnections
    expr: 'pg_stat_activity_count > 80'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'PostgreSQL {{ $value }} active connections'

  # 300 seconds = the 5 minutes quoted in the summary.
  - alert: PostgresSlowQueries
    expr: 'pg_stat_activity_max_tx_duration > 300'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'PostgreSQL query running > 5 minutes'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# SSL Certificates
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: ssl
  # Certificate-expiry alerts for the blackbox_https job. Each expression is
  # "seconds until the earliest certificate expires" compared with a number
  # of days written out as days * 24 * 3600.
  rules:
  # Warning tier: fewer than 14 days left.
  - alert: SSLCertExpiringSoon
    expr: 'probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600'
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'SSL cert for {{ $labels.instance }} expires in < 14 days'

  # Critical tier: fewer than 3 days left.
  - alert: SSLCertExpiryCritical
    expr: 'probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600'
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: 'SSL cert for {{ $labels.instance }} expires in < 3 DAYS'
|
|
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
# Response Time
|
|
# ══════════════════════════════════════════════════════════════════════
|
|
- name: latency
  # Slow-response alerts for the prod API and site probes, plus an nginx
  # 5xx-rate alert.
  rules:
  # Probe took longer than 5 seconds.
  - alert: APISlowResponse
    expr: 'probe_duration_seconds{job="pw_api_prod"} > 5'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'API responding slowly ({{ $value | printf "%.1f" }}s)'

  # Probe took longer than 3 seconds.
  - alert: SiteSlowResponse
    expr: 'probe_duration_seconds{job="pw_site_prod"} > 3'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Website responding slowly ({{ $value | printf "%.1f" }}s)'

  # More than 0.5 requests/second with a 5xx status, per series.
  # NOTE(review): this needs an exporter that attaches a `status` label to
  # nginx_http_requests_total — verify the deployed exporter does, otherwise
  # the selector matches nothing.
  - alert: HighNginx5xxRate
    expr: 'rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High nginx 5xx error rate'
|