diff --git a/monitoring/alert_rules.yml b/monitoring/alert_rules.yml index 8f84920..5a637d9 100644 --- a/monitoring/alert_rules.yml +++ b/monitoring/alert_rules.yml @@ -1,38 +1,137 @@ groups: # ══════════════════════════════════════════════════════════════════════ - # Service Down Alerts + # Performance West Core Services # ══════════════════════════════════════════════════════════════════════ - - name: service_down + - name: pw_services rules: - - alert: EndpointDown - expr: probe_success{job="blackbox_http"} == 0 + - alert: PW_API_Down + expr: probe_success{job="pw_api_prod"} == 0 + for: 1m + labels: + severity: critical + service: api + annotations: + summary: "Prod API is DOWN" + description: "API /status endpoint failed — database may be unreachable. Check api + api-postgres containers." + + - alert: PW_API_Dev_Down + expr: probe_success{job="pw_api_dev"} == 0 + for: 3m + labels: + severity: warning + service: api-dev + annotations: + summary: "Dev API is DOWN" + description: "Dev API /status endpoint unreachable." + + - alert: PW_Site_Down + expr: probe_success{job="pw_site_prod"} == 0 + for: 1m + labels: + severity: critical + service: site + annotations: + summary: "Prod website is DOWN" + description: "performancewest.net static site is not responding." + + - alert: PW_Workers_Down + expr: probe_success{job="pw_workers"} == 0 + for: 2m + labels: + severity: critical + service: workers + annotations: + summary: "Workers job server is DOWN" + description: "Python workers /health endpoint failed. Compliance orders, formation filings, and cron jobs are not being processed." + + - alert: PW_ERPNext_Down + expr: probe_success{job="pw_erpnext"} == 0 + for: 2m + labels: + severity: critical + service: erpnext + annotations: + summary: "ERPNext CRM is DOWN" + description: "ERPNext API is unreachable. Sales orders, invoices, and customer records are inaccessible." 
+ + - alert: PW_MinIO_Down + expr: probe_success{job="pw_minio"} == 0 + for: 2m + labels: + severity: critical + service: minio + annotations: + summary: "MinIO object storage is DOWN" + description: "MinIO health check failed. Document uploads, RMD packets, and file storage are unavailable." + + - alert: PW_Listmonk_Down + expr: probe_success{job="pw_listmonk"} == 0 + for: 5m + labels: + severity: warning + service: listmonk + annotations: + summary: "Listmonk email service is DOWN" + description: "Listmonk health endpoint failed. Email campaigns and subscriber management are unavailable." + + - alert: PW_Ollama_Down + expr: probe_success{job="pw_ollama"} == 0 + for: 5m + labels: + severity: warning + service: ollama + annotations: + summary: "Ollama LLM is DOWN" + description: "Ollama not responding. AI-powered document analysis in workers will fall back to regex." + + - alert: PW_Umami_Down + expr: probe_success{job="pw_umami"} == 0 + for: 5m + labels: + severity: warning + service: umami + annotations: + summary: "Umami analytics is DOWN" + description: "Analytics tracking endpoint unreachable. Site analytics not being recorded." + + - alert: PW_Forgejo_Down + expr: probe_success{job="pw_forgejo"} == 0 + for: 5m + labels: + severity: warning + service: forgejo + annotations: + summary: "Forgejo git server is DOWN" + description: "Git server unreachable. Code deployments will fail." + + # ══════════════════════════════════════════════════════════════════════ + # External HTTPS Endpoints (SSL + reachability) + # ══════════════════════════════════════════════════════════════════════ + - name: external_endpoints + rules: + - alert: HTTPS_Endpoint_Down + expr: probe_success{job="blackbox_https"} == 0 for: 2m labels: severity: critical annotations: summary: "{{ $labels.instance }} is DOWN" - description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes." + description: "HTTPS probe failed for {{ $labels.instance }}. 
Check nginx, DNS, and SSL cert." - - alert: TCPPortDown + - alert: TCP_Port_Down expr: probe_success{job="blackbox_tcp"} == 0 for: 1m labels: severity: critical annotations: summary: "TCP port {{ $labels.instance }} is DOWN" - description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute." - - - alert: ContainerDown - expr: | - absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"}) - or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60 - for: 1m - labels: - severity: critical - annotations: - summary: "Container {{ $labels.name }} is DOWN" - description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute." + description: "Database or cache port unreachable." + # ══════════════════════════════════════════════════════════════════════ + # Container Health + # ══════════════════════════════════════════════════════════════════════ + - name: containers + rules: - alert: ContainerRestarting expr: increase(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2 for: 5m @@ -40,10 +139,28 @@ groups: severity: warning annotations: summary: "Container {{ $labels.name }} is restart-looping" - description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes." + description: "{{ $labels.name }} has restarted more than 2 times in 15 minutes." + + - alert: ContainerHighCPU + expr: rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m]) * 100 > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "Container {{ $labels.name }} high CPU ({{ $value | printf \"%.0f\" }}%)" + description: "Container CPU usage above 80% for 10 minutes." 
+ + - alert: ContainerHighMemory + expr: container_memory_usage_bytes{name=~"performancewest-.*"} / (container_spec_memory_limit_bytes{name=~"performancewest-.*"} > 0) * 100 > 85  # "> 0" drops containers with no memory limit (limit reported as 0); otherwise x/0 = +Inf and the alert fires permanently + for: 5m + labels: + severity: warning + annotations: + summary: "Container {{ $labels.name }} high memory ({{ $value | printf \"%.0f\" }}%)" + description: "Container using more than 85% of its memory limit." # ══════════════════════════════════════════════════════════════════════ - # Host Resource Alerts + # Host Resources # ══════════════════════════════════════════════════════════════════════ - name: host_resources rules: @@ -53,8 +170,7 @@ groups: labels: severity: warning annotations: - summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)" - description: "CPU usage has been above 85% for 10 minutes." + summary: "High CPU ({{ $value | printf \"%.1f\" }}%)" - alert: HighMemory expr: (1 - node_memory_AvailableBytes / node_memory_MemTotalBytes) * 100 > 90 @@ -62,8 +178,7 @@ groups: labels: severity: warning annotations: - summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)" - description: "Memory usage has been above 90% for 5 minutes." + summary: "High memory ({{ $value | printf \"%.1f\" }}%)" - alert: DiskSpaceLow expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80 @@ -71,8 +186,7 @@ groups: labels: severity: warning annotations: - summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)" - description: "Root filesystem is more than 80% full." + summary: "Disk {{ $value | printf \"%.1f\" }}% full" - alert: DiskSpaceCritical expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92 @@ -80,20 +194,10 @@ groups: labels: severity: critical annotations: - summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)" - description: "Root filesystem is more than 92% full. Immediate action required." 
- - - alert: HighLoadAverage - expr: node_load15 > 8 - for: 10m - labels: - severity: warning - annotations: - summary: "High load average ({{ $value | printf \"%.1f\" }})" - description: "15-minute load average has been above 8 for 10 minutes." + summary: "DISK CRITICAL {{ $value | printf \"%.1f\" }}% full" # ══════════════════════════════════════════════════════════════════════ - # Database Alerts + # Database # ══════════════════════════════════════════════════════════════════════ - name: database rules: @@ -104,7 +208,6 @@ groups: severity: critical annotations: summary: "PostgreSQL is DOWN" - description: "PostgreSQL exporter cannot connect to the database." - alert: PostgresHighConnections expr: pg_stat_activity_count > 80 @@ -112,45 +215,57 @@ groups: labels: severity: warning annotations: - summary: "High PostgreSQL connections ({{ $value }})" - description: "PostgreSQL active connections exceeding 80." + summary: "PostgreSQL {{ $value }} active connections" - # ══════════════════════════════════════════════════════════════════════ - # SSL Certificate Alerts - # ══════════════════════════════════════════════════════════════════════ - - name: ssl - rules: - - alert: SSLCertExpiringSoon - expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600 - for: 1h - labels: - severity: warning - annotations: - summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}" - description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}." - - - alert: SSLCertExpiryCritical - expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600 - for: 10m - labels: - severity: critical - annotations: - summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}" - description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken." 
- - # ══════════════════════════════════════════════════════════════════════ - # Response Time Alerts - # ══════════════════════════════════════════════════════════════════════ - - name: latency - rules: - - alert: SlowHTTPResponse - expr: probe_http_duration_seconds{phase="transfer", job="blackbox_http"} > 5 + - alert: PostgresSlowQueries + expr: pg_stat_activity_max_tx_duration > 300  # 300 s = 5 min; this metric is transaction age, not single-query time for: 5m labels: severity: warning annotations: - summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)" - description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}." + summary: "PostgreSQL transaction open > 5 minutes" + + # ══════════════════════════════════════════════════════════════════════ + # SSL Certificates + # ══════════════════════════════════════════════════════════════════════ + - name: ssl + rules: + - alert: SSLCertExpiringSoon + expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600 + for: 1h + labels: + severity: warning + annotations: + summary: "SSL cert for {{ $labels.instance }} expires in < 14 days" + + - alert: SSLCertExpiryCritical + expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600 + for: 10m + labels: + severity: critical + annotations: + summary: "SSL cert for {{ $labels.instance }} expires in < 3 DAYS" + + # ══════════════════════════════════════════════════════════════════════ + # Response Time + # ══════════════════════════════════════════════════════════════════════ + - name: latency + rules: + - alert: APISlowResponse + expr: probe_duration_seconds{job="pw_api_prod"} > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "API responding slowly ({{ $value | printf \"%.1f\" }}s)" + + - alert: SiteSlowResponse + expr: probe_duration_seconds{job="pw_site_prod"} > 3 + for: 5m + labels: + severity: warning + annotations: + summary: "Website responding slowly ({{ $value | printf \"%.1f\" }}s)" - alert: HighNginx5xxRate expr: 
rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5 @@ -159,4 +274,3 @@ groups: severity: warning annotations: summary: "High nginx 5xx error rate" - description: "More than 0.5 req/s returning 5xx errors." diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index 44e1b0c..5cf9069 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -44,23 +44,22 @@ scrape_configs: static_configs: - targets: ["nginx-exporter:9113"] - # ── Blackbox probes (HTTP endpoint monitoring) ───────────────────── - - job_name: blackbox_http + # ══════════════════════════════════════════════════════════════════════ + # Performance West Service Health Probes + # Each probe verifies the service is FUNCTIONAL, not just responding + # ══════════════════════════════════════════════════════════════════════ + + # ── Prod API + DB (returns 503 if DB unreachable) ────────────────── + - job_name: pw_api_prod metrics_path: /probe params: module: [http_2xx] static_configs: - targets: - - https://performancewest.net - - https://api.performancewest.net/api/v1/fcc/search?q=test - - https://dev.performancewest.net - - https://api.dev.performancewest.net/api/v1/fcc/search?q=test - - https://crm.performancewest.net - - https://lists.performancewest.net - - https://analytics.performancewest.net - - http://minio:9000/minio/health/live - - https://crypto.performancewest.net - - https://pay.performancewest.net + - http://api:3001/api/v1/status + labels: + service: api + env: prod relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -69,7 +68,203 @@ scrape_configs: - target_label: __address__ replacement: blackbox-exporter:9115 - # ── Blackbox TCP probes (port monitoring) ────────────────────────── + # ── Dev API + DB ─────────────────────────────────────────────────── + - job_name: pw_api_dev + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://host.docker.internal:3002/api/v1/status + labels: + service: 
api + env: dev + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Prod Site (Astro static) ─────────────────────────────────────── + - job_name: pw_site_prod + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://site:80/ + labels: + service: site + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Workers (Python job server) ──────────────────────────────────── + - job_name: pw_workers + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://workers:8090/health + labels: + service: workers + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── ERPNext CRM ──────────────────────────────────────────────────── + - job_name: pw_erpnext + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://erpnext:8000/api/method/frappe.client.get_count?doctype=Customer + labels: + service: erpnext + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── MinIO object storage ─────────────────────────────────────────── + - job_name: pw_minio + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://minio:9000/minio/health/live + labels: + service: minio + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: 
__param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Listmonk email marketing ─────────────────────────────────────── + - job_name: pw_listmonk + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://listmonk:9000/api/health + labels: + service: listmonk + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Ollama LLM ──────────────────────────────────────────────────── + - job_name: pw_ollama + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://ollama:11434/ + labels: + service: ollama + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Umami analytics ──────────────────────────────────────────────── + - job_name: pw_umami + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://umami:3000/api/heartbeat + labels: + service: umami + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── Forgejo git server ───────────────────────────────────────────── + - job_name: pw_forgejo + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://host.docker.internal:3030/ + labels: + service: forgejo + env: prod + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 
blackbox-exporter:9115 + + # ══════════════════════════════════════════════════════════════════════ + # External-facing HTTPS probes (SSL + reachability from outside) + # ══════════════════════════════════════════════════════════════════════ + - job_name: blackbox_https + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - https://performancewest.net + - https://api.performancewest.net/api/v1/status + - https://dev.performancewest.net + - https://crm.performancewest.net + - https://lists.performancewest.net + - https://analytics.performancewest.net + - https://monitoring.performancewest.net + - https://crypto.performancewest.net + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ── TCP port probes (databases, caches) ──────────────────────────── - job_name: blackbox_tcp metrics_path: /probe params: diff --git a/monitoring/pw-services-dashboard.json b/monitoring/pw-services-dashboard.json new file mode 100644 index 0000000..c30b62a --- /dev/null +++ b/monitoring/pw-services-dashboard.json @@ -0,0 +1,151 @@ +{ + "dashboard": { + "id": null, + "uid": null, + "title": "Performance West — Services Overview", + "tags": ["performancewest", "services"], + "timezone": "browser", + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "panels": [ + { + "type": "stat", + "title": "Service Status", + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" }, + { "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" } + ], + "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] } + }, + "overrides": [] + }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", 
"colorMode": "background", "textMode": "auto" }, + "targets": [ + { "expr": "probe_success{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_listmonk'}", "legendFormat": "Listmonk", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_ollama'}", "legendFormat": "Ollama", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_umami'}", "legendFormat": "Umami", "datasource": { "type": "prometheus" } }, + { "expr": "probe_success{job='pw_forgejo'}", "legendFormat": "Forgejo", "datasource": { "type": "prometheus" } }, + { "expr": "pg_up", "legendFormat": "PostgreSQL", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "timeseries", + "title": "Service Response Time", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "fieldConfig": { + "defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "yellow" }, { "value": 3, "color": "red" }] } } + }, + "targets": [ + { "expr": "probe_duration_seconds{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } }, + { "expr": "probe_duration_seconds{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } }, + { "expr": "probe_duration_seconds{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } }, + { "expr": "probe_duration_seconds{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": 
"prometheus" } }, + { "expr": "probe_duration_seconds{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "timeseries", + "title": "HTTPS Endpoint Response Time", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "fieldConfig": { + "defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 2, "color": "yellow" }, { "value": 5, "color": "red" }] } } + }, + "targets": [ + { "expr": "probe_duration_seconds{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "stat", + "title": "HTTPS Endpoints", + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 12 }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" }, + { "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" } + ], + "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] } + } + }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background" }, + "targets": [ + { "expr": "probe_success{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "gauge", + "title": "SSL Certificate Days Remaining", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 }, + "fieldConfig": { + "defaults": { + "unit": "d", + "min": 0, "max": 90, + "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 7, "color": "orange" }, { "value": 14, "color": "yellow" }, { "value": 30, "color": "green" }] } + } + }, + "targets": [ + { "expr": "(probe_ssl_earliest_cert_expiry{job='blackbox_https'} - time()) / 86400", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "timeseries", + "title": "Container CPU Usage", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, + "fieldConfig": { 
"defaults": { "unit": "percentunit" } }, + "targets": [ + { "expr": "rate(container_cpu_usage_seconds_total{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama)-1'}[5m])", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "timeseries", + "title": "Container Memory Usage", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, + "fieldConfig": { "defaults": { "unit": "bytes" } }, + "targets": [ + { "expr": "container_memory_usage_bytes{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama|api-postgres)-1'}", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "stat", + "title": "PostgreSQL", + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 30 }, + "targets": [ + { "expr": "pg_stat_activity_count", "legendFormat": "Active Connections", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "stat", + "title": "nginx Requests/sec", + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 30 }, + "fieldConfig": { "defaults": { "unit": "reqps" } }, + "targets": [ + { "expr": "rate(nginx_http_requests_total[5m])", "legendFormat": "req/s", "datasource": { "type": "prometheus" } } + ] + }, + { + "type": "stat", + "title": "Active Alerts", + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 30 }, + "fieldConfig": { + "defaults": { + "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "orange" }, { "value": 3, "color": "red" }] } + } + }, + "targets": [ + { "expr": "ALERTS{alertstate='firing'}", "legendFormat": "{{ alertname }}", "datasource": { "type": "prometheus" } } + ] + } + ], + "schemaVersion": 39 + }, + "overwrite": true, + "folderId": 0 +}