groups:
# ══════════════════════════════════════════════════════════════════════
# Performance West Core Services
# ══════════════════════════════════════════════════════════════════════
# One rule per service job; each fires when that job's probe_success is 0
# for the whole `for` window. Production API/site/workers/ERPNext/MinIO are
# severity: critical with a 1-2m window; the dev API and auxiliary tools
# (Listmonk, Ollama, Umami, Forgejo) are severity: warning with 3-5m.
- name: pw_services
  rules:
  - alert: PW_API_Down
    expr: probe_success{job="pw_api_prod"} == 0
    for: 1m
    labels:
      severity: critical
      service: api
    annotations:
      summary: "Prod API is DOWN"
      description: >-
        API /status endpoint failed — database may be unreachable.
        Check api + api-postgres containers.

  - alert: PW_API_Dev_Down
    expr: probe_success{job="pw_api_dev"} == 0
    for: 3m
    labels:
      severity: warning
      service: api-dev
    annotations:
      summary: "Dev API is DOWN"
      description: "Dev API /status endpoint unreachable."

  - alert: PW_Site_Down
    expr: probe_success{job="pw_site_prod"} == 0
    for: 1m
    labels:
      severity: critical
      service: site
    annotations:
      summary: "Prod website is DOWN"
      description: "performancewest.net static site is not responding."

  - alert: PW_Workers_Down
    expr: probe_success{job="pw_workers"} == 0
    for: 2m
    labels:
      severity: critical
      service: workers
    annotations:
      summary: "Workers job server is DOWN"
      description: >-
        Python workers /health endpoint failed. Compliance orders, formation
        filings, and cron jobs are not being processed.

  - alert: PW_ERPNext_Down
    expr: probe_success{job="pw_erpnext"} == 0
    for: 2m
    labels:
      severity: critical
      service: erpnext
    annotations:
      summary: "ERPNext CRM is DOWN"
      description: >-
        ERPNext API is unreachable. Sales orders, invoices, and customer
        records are inaccessible.

  - alert: PW_MinIO_Down
    expr: probe_success{job="pw_minio"} == 0
    for: 2m
    labels:
      severity: critical
      service: minio
    annotations:
      summary: "MinIO object storage is DOWN"
      description: >-
        MinIO health check failed. Document uploads, RMD packets, and file
        storage are unavailable.

  - alert: PW_Listmonk_Down
    expr: probe_success{job="pw_listmonk"} == 0
    for: 5m
    labels:
      severity: warning
      service: listmonk
    annotations:
      summary: "Listmonk email service is DOWN"
      description: >-
        Listmonk health endpoint failed. Email campaigns and subscriber
        management are unavailable.

  - alert: PW_Ollama_Down
    expr: probe_success{job="pw_ollama"} == 0
    for: 5m
    labels:
      severity: warning
      service: ollama
    annotations:
      summary: "Ollama LLM is DOWN"
      description: >-
        Ollama not responding. AI-powered document analysis in workers will
        fall back to regex.

  - alert: PW_Umami_Down
    expr: probe_success{job="pw_umami"} == 0
    for: 5m
    labels:
      severity: warning
      service: umami
    annotations:
      summary: "Umami analytics is DOWN"
      description: >-
        Analytics tracking endpoint unreachable. Site analytics not being
        recorded.

  - alert: PW_Forgejo_Down
    expr: probe_success{job="pw_forgejo"} == 0
    for: 5m
    labels:
      severity: warning
      service: forgejo
    annotations:
      summary: "Forgejo git server is DOWN"
      description: "Git server unreachable. Code deployments will fail."

# ══════════════════════════════════════════════════════════════════════
# External HTTPS Endpoints (SSL + reachability)
# ══════════════════════════════════════════════════════════════════════
# Generic per-target rules: the failing target is taken from the `instance`
# label, so one rule covers every target scraped by the job.
- name: external_endpoints
  rules:
  - alert: HTTPS_Endpoint_Down
    expr: probe_success{job="blackbox_https"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is DOWN"
      description: >-
        HTTPS probe failed for {{ $labels.instance }}. Check nginx, DNS,
        and SSL cert.

  - alert: TCP_Port_Down
    expr: probe_success{job="blackbox_tcp"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "TCP port {{ $labels.instance }} is DOWN"
      description: "Database or cache port unreachable."
# ══════════════════════════════════════════════════════════════════════
# Container Health
# ══════════════════════════════════════════════════════════════════════
# Restart, CPU and memory alerts for containers whose `name` label matches
# performancewest-.*; all are severity: warning.
- name: containers
  rules:
  # container_start_time_seconds is a start timestamp (a gauge), so restarts
  # are counted with changes(). increase() would return the number of seconds
  # between starts, which exceeds 2 after a single restart.
  - alert: ContainerRestarting
    expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} is restart-looping"
      description: "{{ $labels.name }} has restarted more than 2 times in 15 minutes."

  # rate() of CPU-seconds is the number of cores in use, so after * 100 a
  # value of 100 corresponds to one fully busy core.
  - alert: ContainerHighCPU
    expr: rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m]) * 100 > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high CPU ({{ $value | printf \"%.0f\" }}%)"
      description: "Container CPU usage above 80% for 10 minutes."

  # The trailing `and ... > 0` keeps only containers with a non-zero memory
  # limit; a limit of 0 (presumably "no limit set") would otherwise make the
  # ratio a division by zero.
  - alert: ContainerHighMemory
    expr: |
      (
        container_memory_usage_bytes{name=~"performancewest-.*"}
          / container_spec_memory_limit_bytes{name=~"performancewest-.*"}
          * 100 > 85
      )
      and container_spec_memory_limit_bytes{name=~"performancewest-.*"} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high memory ({{ $value | printf \"%.0f\" }}%)"
      description: "Container using more than 85% of its memory limit."
# ══════════════════════════════════════════════════════════════════════
# Host Resources
# ══════════════════════════════════════════════════════════════════════
# Host-level CPU, memory and root-filesystem usage, each expressed as a
# percentage so `$value` can be printed directly in the summary.
- name: host_resources
  rules:
  # 100 minus the average idle share across CPUs, per instance.
  - alert: HighCPU
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU ({{ $value | printf \"%.1f\" }}%)"

  # Metric names follow node_exporter's meminfo collector
  # (node_memory_<Field>_bytes), matching the other node_* metrics used here.
  - alert: HighMemory
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory ({{ $value | printf \"%.1f\" }}%)"

  # Two tiers on the root filesystem: warning above 80%, critical above 92%.
  - alert: DiskSpaceLow
    expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk {{ $value | printf \"%.1f\" }}% full"

  - alert: DiskSpaceCritical
    expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "DISK CRITICAL {{ $value | printf \"%.1f\" }}% full"

# ══════════════════════════════════════════════════════════════════════
# Database
# ══════════════════════════════════════════════════════════════════════
- name: database
  rules:
  - alert: PostgresDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is DOWN"

  # NOTE(review): if the exporter emits pg_stat_activity_count per
  # database/state, the threshold applies to each series separately rather
  # than to the server-wide total — confirm this is intended.
  - alert: PostgresHighConnections
    expr: pg_stat_activity_count > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL {{ $value }} active connections"

  # 300 seconds = the "5 minutes" quoted in the summary.
  - alert: PostgresSlowQueries
    expr: pg_stat_activity_max_tx_duration > 300
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL query running > 5 minutes"

# ══════════════════════════════════════════════════════════════════════
# SSL Certificates
# ══════════════════════════════════════════════════════════════════════
# Remaining certificate lifetime is the expiry timestamp minus time(), in
# seconds; thresholds are written as days * 24 * 3600.
- name: ssl
  rules:
  - alert: SSLCertExpiringSoon
    expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "SSL cert for {{ $labels.instance }} expires in < 14 days"

  - alert: SSLCertExpiryCritical
    expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "SSL cert for {{ $labels.instance }} expires in < 3 DAYS"

# ══════════════════════════════════════════════════════════════════════
# Response Time
# ══════════════════════════════════════════════════════════════════════
- name: latency
  rules:
  - alert: APISlowResponse
    expr: probe_duration_seconds{job="pw_api_prod"} > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "API responding slowly ({{ $value | printf \"%.1f\" }}s)"

  - alert: SiteSlowResponse
    expr: probe_duration_seconds{job="pw_site_prod"} > 3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Website responding slowly ({{ $value | printf \"%.1f\" }}s)"

  # Threshold is 0.5 matching requests per second, per series.
  # NOTE(review): relies on nginx_http_requests_total carrying a `status`
  # label — confirm the nginx exporter in use provides it.
  - alert: HighNginx5xxRate
    expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High nginx 5xx error rate"