# new-site/monitoring/alert_rules.yml

groups:
  # ══════════════════════════════════════════════════════════════════════
  # Service Down Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: service_down
    rules:
      # Critical: the blackbox_http job has reported probe_success == 0 for a
      # target continuously for 2 minutes.
      - alert: EndpointDown
        expr: 'probe_success{job="blackbox_http"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.instance }} is DOWN'
          description: >-
            HTTP probe failed for {{ $labels.instance }} for more than 2 minutes.

      # Critical: the blackbox_tcp job has reported probe_success == 0 for a
      # target continuously for 1 minute.
      - alert: TCPPortDown
        expr: 'probe_success{job="blackbox_tcp"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'TCP port {{ $labels.instance }} is DOWN'
          description: >-
            TCP connection failed to {{ $labels.instance }} for more than 1 minute.

      # Critical: one of the expected performancewest containers either has no
      # container_last_seen series at all, or its last-seen timestamp is more
      # than 60 seconds old, for at least 1 minute.
      #
      # absent() is written once per container with an equality matcher on
      # purpose. With a single regex matcher, absent() only returns a result
      # when *no* series matches the regex, and that result carries no `name`
      # label -- so one missing container would go undetected and
      # {{ $labels.name }} would render empty. With an equality matcher the
      # result is labelled with the missing container's name.
      #
      # Keep the list below in sync with the regex on the last line.
      - alert: ContainerDown
        expr: |
          absent(container_last_seen{name="performancewest-api-1"})
          or absent(container_last_seen{name="performancewest-site-1"})
          or absent(container_last_seen{name="performancewest-workers-1"})
          or absent(container_last_seen{name="performancewest-erpnext-1"})
          or absent(container_last_seen{name="performancewest-listmonk-1"})
          or absent(container_last_seen{name="performancewest-minio-1"})
          or absent(container_last_seen{name="performancewest-umami-1"})
          or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is DOWN"
          description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."

      # Warning: a performancewest container's start time has changed more than
      # twice within the last 15 minutes, sustained for 5 minutes.
      #
      # changes() counts how often the sampled value changed inside the window,
      # which is the number of restarts the annotations describe. increase() is
      # meant for counters; applied to a start timestamp it yields a difference
      # in seconds, not a restart count, so "> 2" did not mean "more than 2
      # restarts".
      # NOTE(review): assumes container_start_time_seconds is a Unix start
      # timestamp that keeps the same label set across restarts -- confirm
      # against the container exporter in use.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is restart-looping"
          description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."

  # ══════════════════════════════════════════════════════════════════════
  # Host Resource Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: host_resources
    rules:
      # Warning: per-instance CPU usage, computed as 100 minus the average idle
      # rate over 5 minutes (in percent), has stayed above 85 for 10 minutes.
      - alert: HighCPU
        expr: >-
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          # $value is the computed usage percentage, printed with one decimal.
          summary: 'High CPU usage ({{ $value | printf "%.1f" }}%)'
          description: 'CPU usage has been above 85% for 10 minutes.'

      # Warning: used memory, computed as (1 - available / total) * 100, has
      # stayed above 90 percent for 5 minutes.
      #
      # Uses the node_exporter meminfo metric names with the `_bytes` unit
      # suffix (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes),
      # matching the `_bytes` naming of the filesystem metrics used by the disk
      # rules in this group. The previous names (node_memory_AvailableBytes /
      # node_memory_MemTotalBytes) are not exported, so the expression returned
      # no data and the alert could never fire.
      - alert: HighMemory
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
          description: "Memory usage has been above 90% for 5 minutes."

      # Warning: the filesystem mounted at "/" is more than 80 percent used
      # (1 - available / size), sustained for 5 minutes.
      - alert: DiskSpaceLow
        expr: >-
          (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          # $value is the computed usage percentage, printed with one decimal.
          summary: 'Disk usage high ({{ $value | printf "%.1f" }}%)'
          description: 'Root filesystem is more than 80% full.'

      # Critical: the filesystem mounted at "/" is more than 92 percent used
      # (1 - available / size), sustained for 2 minutes.
      - alert: DiskSpaceCritical
        expr: >-
          (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
        for: 2m
        labels:
          severity: critical
        annotations:
          # $value is the computed usage percentage, printed with one decimal.
          summary: 'Disk usage CRITICAL ({{ $value | printf "%.1f" }}%)'
          description: >-
            Root filesystem is more than 92% full. Immediate action required.

      # Warning: node_load15 has stayed above the fixed threshold of 8 for
      # 10 minutes. The threshold is absolute, not scaled by CPU count.
      - alert: HighLoadAverage
        expr: 'node_load15 > 8'
        for: 10m
        labels:
          severity: warning
        annotations:
          # $value is the load average, printed with one decimal.
          summary: 'High load average ({{ $value | printf "%.1f" }})'
          description: >-
            15-minute load average has been above 8 for 10 minutes.

  # ══════════════════════════════════════════════════════════════════════
  # Database Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: database
    rules:
      # Critical: pg_up has been 0 for 1 minute.
      # NOTE(review): this only fires while the pg_up series exists with value
      # 0; if the series disappears entirely the expression returns nothing --
      # confirm that case is covered elsewhere.
      - alert: PostgresDown
        expr: 'pg_up == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'PostgreSQL is DOWN'
          description: >-
            PostgreSQL exporter cannot connect to the database.

      # Warning: the total number of PostgreSQL connections reported per
      # exporter instance has stayed above 80 for 5 minutes.
      #
      # pg_stat_activity_count is summed before comparing so the threshold
      # applies to the total; without the sum, each individual series is
      # compared against 80 on its own and the total can exceed the limit
      # without any single series doing so.
      # NOTE(review): assumes pg_stat_activity_count is exported as one series
      # per database/state (as postgres_exporter does) and that "active
      # connections" means all connection states -- add a state="active"
      # matcher if only running queries should count.
      - alert: PostgresHighConnections
        expr: sum by (instance, job) (pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High PostgreSQL connections ({{ $value }})"
          description: "PostgreSQL active connections exceeding 80."

  # ══════════════════════════════════════════════════════════════════════
  # SSL Certificate Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: ssl
    rules:
      # Warning: the earliest certificate expiry reported by the probe is less
      # than 14 days (14 * 24 * 3600 seconds) away, sustained for 1 hour.
      - alert: SSLCertExpiringSoon
        expr: 'probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'SSL cert expiring in < 14 days for {{ $labels.instance }}'
          # $value is the remaining lifetime in seconds, rendered as a duration.
          description: >-
            Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.

      # Critical: the earliest certificate expiry reported by the probe is less
      # than 3 days (3 * 24 * 3600 seconds) away, sustained for 10 minutes.
      - alert: SSLCertExpiryCritical
        expr: 'probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'SSL cert expiring in < 3 days for {{ $labels.instance }}'
          # $value is the remaining lifetime in seconds, rendered as a duration.
          description: >-
            Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken.

  # ══════════════════════════════════════════════════════════════════════
  # Response Time Alerts
  # ══════════════════════════════════════════════════════════════════════
  - name: latency
    rules:
      # Warning: the total HTTP probe time for a blackbox_http target has
      # stayed above 5 seconds for 5 minutes.
      #
      # probe_http_duration_seconds is reported per request phase, so the
      # phases are summed per target to obtain the overall response time the
      # annotations describe. Filtering on phase="transfer" alone measured only
      # the body transfer and ignored time spent in the other phases, so a slow
      # backend would not trip the alert.
      # `instance` and `job` are kept in the aggregation because the
      # annotations reference {{ $labels.instance }}.
      - alert: SlowHTTPResponse
        expr: sum by (instance, job) (probe_http_duration_seconds{job="blackbox_http"}) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
          description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."

      # Warning: the 5-minute per-second rate of nginx_http_requests_total
      # series whose `status` label matches 5.. has stayed above 0.5 for
      # 5 minutes.
      # NOTE(review): the threshold is evaluated per series, not on the sum of
      # all 5xx series; also confirm the nginx exporter in use actually exposes
      # a `status` label on this metric, otherwise the selector matches nothing.
      - alert: HighNginx5xxRate
        expr: 'rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High nginx 5xx error rate'
          description: >-
            More than 0.5 req/s returning 5xx errors.