new-site/monitoring/alert_rules.yml

groups:
  # ══════════════════════════════════════════════════════════════════════
  # Performance West Core Services
  # ══════════════════════════════════════════════════════════════════════
  - name: pw_services
    # One availability rule per probed service. Each rule fires once
    # probe_success for its job has stayed at 0 for the whole `for` window.
    rules:
      # Production API: critical after 1 minute of failed probes.
      - alert: PW_API_Down
        expr: 'probe_success{job="pw_api_prod"} == 0'
        for: 1m
        labels:
          service: api
          severity: critical
        annotations:
          summary: 'Prod API is DOWN'
          description: 'API /status endpoint failed — database may be unreachable. Check api + api-postgres containers.'

      # Development API: warning only, with a longer 3 minute grace period.
      - alert: PW_API_Dev_Down
        expr: 'probe_success{job="pw_api_dev"} == 0'
        for: 3m
        labels:
          service: api-dev
          severity: warning
        annotations:
          summary: 'Dev API is DOWN'
          description: 'Dev API /status endpoint unreachable.'

      # Production website: critical after 1 minute.
      - alert: PW_Site_Down
        expr: 'probe_success{job="pw_site_prod"} == 0'
        for: 1m
        labels:
          service: site
          severity: critical
        annotations:
          summary: 'Prod website is DOWN'
          description: 'performancewest.net static site is not responding.'

      # Workers job server: critical after 2 minutes.
      - alert: PW_Workers_Down
        expr: 'probe_success{job="pw_workers"} == 0'
        for: 2m
        labels:
          service: workers
          severity: critical
        annotations:
          summary: 'Workers job server is DOWN'
          description: 'Python workers /health endpoint failed. Compliance orders, formation filings, and cron jobs are not being processed.'

      # ERPNext: critical after 2 minutes.
      - alert: PW_ERPNext_Down
        expr: 'probe_success{job="pw_erpnext"} == 0'
        for: 2m
        labels:
          service: erpnext
          severity: critical
        annotations:
          summary: 'ERPNext CRM is DOWN'
          description: 'ERPNext API is unreachable. Sales orders, invoices, and customer records are inaccessible.'

      # MinIO object storage: critical after 2 minutes.
      - alert: PW_MinIO_Down
        expr: 'probe_success{job="pw_minio"} == 0'
        for: 2m
        labels:
          service: minio
          severity: critical
        annotations:
          summary: 'MinIO object storage is DOWN'
          description: 'MinIO health check failed. Document uploads, RMD packets, and file storage are unavailable.'

      # Listmonk: warning after 5 minutes.
      - alert: PW_Listmonk_Down
        expr: 'probe_success{job="pw_listmonk"} == 0'
        for: 5m
        labels:
          service: listmonk
          severity: warning
        annotations:
          summary: 'Listmonk email service is DOWN'
          description: 'Listmonk health endpoint failed. Email campaigns and subscriber management are unavailable.'

      # Ollama: warning after 5 minutes.
      - alert: PW_Ollama_Down
        expr: 'probe_success{job="pw_ollama"} == 0'
        for: 5m
        labels:
          service: ollama
          severity: warning
        annotations:
          summary: 'Ollama LLM is DOWN'
          description: 'Ollama not responding. AI-powered document analysis in workers will fall back to regex.'

      # Umami analytics: warning after 5 minutes.
      - alert: PW_Umami_Down
        expr: 'probe_success{job="pw_umami"} == 0'
        for: 5m
        labels:
          service: umami
          severity: warning
        annotations:
          summary: 'Umami analytics is DOWN'
          description: 'Analytics tracking endpoint unreachable. Site analytics not being recorded.'

      # Forgejo git server: warning after 5 minutes.
      - alert: PW_Forgejo_Down
        expr: 'probe_success{job="pw_forgejo"} == 0'
        for: 5m
        labels:
          service: forgejo
          severity: warning
        annotations:
          summary: 'Forgejo git server is DOWN'
          description: 'Git server unreachable. Code deployments will fail.'

  # ══════════════════════════════════════════════════════════════════════
  # External HTTPS Endpoints (SSL + reachability)
  # ══════════════════════════════════════════════════════════════════════
  - name: external_endpoints
    # Generic probe rules: one alert instance per probed target, with the
    # target taken from the `instance` label of the failing series.
    rules:
      # Any target of the blackbox_https job failing for 2 minutes.
      - alert: HTTPS_Endpoint_Down
        expr: 'probe_success{job="blackbox_https"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.instance }} is DOWN'
          description: 'HTTPS probe failed for {{ $labels.instance }}. Check nginx, DNS, and SSL cert.'

      # Any target of the blackbox_tcp job failing for 1 minute.
      - alert: TCP_Port_Down
        expr: 'probe_success{job="blackbox_tcp"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'TCP port {{ $labels.instance }} is DOWN'
          description: 'Database or cache port unreachable.'

  # ══════════════════════════════════════════════════════════════════════
  # Container Health
  # ══════════════════════════════════════════════════════════════════════
  - name: containers
    # Per-container health rules, restricted to containers whose `name`
    # label matches performancewest-.*.
    rules:
      # container_start_time_seconds is a gauge holding a Unix timestamp, so
      # restarts are counted with changes() (number of value changes in the
      # window). increase() is only meaningful for counters; applied to a
      # timestamp gauge it yields a seconds delta rather than a restart count.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is restart-looping"
          description: "{{ $labels.name }} has restarted more than 2 times in 15 minutes."

      # rate() of CPU-seconds is "cores in use"; * 100 expresses it as a
      # percentage of a single core.
      - alert: ContainerHighCPU
        expr: rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m]) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU ({{ $value | printf \"%.0f\" }}%)"
          description: "Container CPU usage above 80% for 10 minutes."

      # The `> 0` filter drops containers with no memory limit configured
      # (limit reported as 0). Without it the division yields +Inf and the
      # alert fires permanently for every unlimited container.
      - alert: ContainerHighMemory
        expr: >-
          container_memory_usage_bytes{name=~"performancewest-.*"}
          / (container_spec_memory_limit_bytes{name=~"performancewest-.*"} > 0)
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory ({{ $value | printf \"%.0f\" }}%)"
          description: "Container using more than 85% of its memory limit."

  # ══════════════════════════════════════════════════════════════════════
  # Host Resources
  # ══════════════════════════════════════════════════════════════════════
  - name: host_resources
    # Host-level CPU, memory and root-filesystem thresholds.
    rules:
      # 100 minus the per-instance average idle rate (as a percentage),
      # i.e. the non-idle CPU share over the last 5 minutes.
      - alert: HighCPU
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU ({{ $value | printf \"%.1f\" }}%)"

      # Uses the node_exporter (>= 0.16) metric names, matching the *_bytes
      # naming already used by the filesystem rules in this group. The former
      # node_memory_AvailableBytes / node_memory_MemTotalBytes names do not
      # exist, so the expression returned no series and could never fire.
      - alert: HighMemory
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory ({{ $value | printf \"%.1f\" }}%)"

      # Root filesystem (mountpoint="/") more than 80% used.
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk {{ $value | printf \"%.1f\" }}% full"

      # Same expression as DiskSpaceLow with a 92% threshold, a shorter
      # hold time and critical severity.
      - alert: DiskSpaceCritical
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "DISK CRITICAL {{ $value | printf \"%.1f\" }}% full"

  # ══════════════════════════════════════════════════════════════════════
  # Database
  # ══════════════════════════════════════════════════════════════════════
  - name: database
    # PostgreSQL availability, connection-count and long-transaction rules.
    rules:
      # pg_up == 0 held for 1 minute.
      - alert: PostgresDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is DOWN"

      # pg_stat_activity_count is exported as one series per database/state
      # combination, so a bare `> 80` compares each slice separately and
      # misses load spread across several databases or states. Summing per
      # server compares the total connection count against the threshold.
      # NOTE(review): the sum covers every state, not only state="active";
      # add a state matcher if only active sessions should count.
      - alert: PostgresHighConnections
        expr: sum by (instance, job, server) (pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL {{ $value }} active connections"

      # Longest transaction duration above 300, held for 5 minutes
      # (presumably seconds, per the summary text).
      - alert: PostgresSlowQueries
        expr: pg_stat_activity_max_tx_duration > 300
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL query running > 5 minutes"

  # ══════════════════════════════════════════════════════════════════════
  # SSL Certificates
  # ══════════════════════════════════════════════════════════════════════
  - name: ssl
    # Certificate-expiry rules for targets of the blackbox_https job. Both
    # compare (earliest expiry timestamp - now) against a number of seconds.
    rules:
      # Less than 14 days (14 * 24 * 3600 s) remaining, held for 1 hour.
      - alert: SSLCertExpiringSoon
        expr: 'probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'SSL cert for {{ $labels.instance }} expires in < 14 days'

      # Less than 3 days remaining, held for 10 minutes.
      - alert: SSLCertExpiryCritical
        expr: 'probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'SSL cert for {{ $labels.instance }} expires in < 3 DAYS'

  # ══════════════════════════════════════════════════════════════════════
  # Response Time
  # ══════════════════════════════════════════════════════════════════════
  - name: latency
    # Probe-duration and nginx error-rate warnings; every rule holds for 5m.
    rules:
      # Production API probe taking longer than 5 seconds.
      - alert: APISlowResponse
        expr: 'probe_duration_seconds{job="pw_api_prod"} > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'API responding slowly ({{ $value | printf "%.1f" }}s)'

      # Production website probe taking longer than 3 seconds.
      - alert: SiteSlowResponse
        expr: 'probe_duration_seconds{job="pw_site_prod"} > 3'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Website responding slowly ({{ $value | printf "%.1f" }}s)'

      # Per-series rate of 5xx responses above 0.5 requests per second.
      # NOTE(review): relies on nginx_http_requests_total carrying a `status`
      # label — confirm the nginx exporter in use exposes it.
      - alert: HighNginx5xxRate
        expr: 'rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High nginx 5xx error rate'