Add Playwright failure monitoring: Telegram alerts + screenshots + health check

When any Playwright submission fails (selector not found, timeout, etc.):
1. Full-page screenshot captured and uploaded to MinIO
2. Telegram alert sent immediately with error details + screenshot link
3. Email alert to ops with same info
4. Admin todo includes screenshot MinIO path for debugging
5. Client order stays pending for manual completion

Proactive selector health check (daily cron at 12:00 UTC — 7am CDT / 6am CST):
- Navigates to each portal (FCC RMD, USAC E-File, FCC CPNI/ECFS)
- Verifies all critical selectors are still present in the DOM
- If selectors are missing (UI changed): alerts via Telegram + email
  BEFORE any real client order fails
- Reports which service slugs are affected

Integrated into:
- RMD filing handler (fccprod.servicenowservices.com)
- Form 499-A handler (forms.universalservice.org)
- Form 499-Q handler (already had error handling)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-04 02:44:02 -05:00
parent 3e04edd384
commit 78c04b8bc3
4 changed files with 409 additions and 2 deletions

View file

@ -158,6 +158,15 @@ worker_crons:
on_calendar: "Sat *-*-* 10:00:00 UTC"
persistent: true
# Playwright selector health check — daily 12:00 UTC (7am CDT / 6am CST).
# Proactively detects when FCC/USAC portal UI changes would break automation.
# Alerts via Telegram + email if selectors are missing.
- name: pw-playwright-health
description: Check FCC/USAC portal selectors are still valid
module: scripts.workers.services.telecom.playwright_monitor
on_calendar: "*-*-* 12:00:00 UTC"
persistent: true
# 499-Q quarterly filing reminders — daily 13:00 UTC (8am CDT / 7am CST).
# Sends reminder emails at 30/14/7 days before each quarterly due date.
# Creates compliance_orders for each quarter when the 499-A+Q bundle is filed.

View file

@ -574,10 +574,33 @@ class Form499AHandler(BaseServiceHandler):
except Exception as exc:
logger.exception("Form499AHandler: USAC submission failed: %s", exc)
# Upload screenshot + alert ops via Telegram
screenshot_key = None
try:
from scripts.workers.services.telecom.playwright_monitor import (
upload_failure_screenshot_async, alert_playwright_failure,
)
screenshot_key = await upload_failure_screenshot_async(
page, order_number, "fcc-499a", work_dir,
)
alert_playwright_failure(
order_number=order_number,
service_slug=self.SERVICE_SLUG,
service_name=self.SERVICE_NAME,
entity_name=entity.get("legal_name", ""),
error=exc,
screenshot_key=screenshot_key,
portal_url="https://forms.universalservice.org/",
)
except Exception as alert_exc:
logger.warning("Playwright failure alert failed: %s", alert_exc)
self._create_admin_todo(
order_number,
f"USAC 499-A submission failed for Filer ID {filer_id}: {exc}. "
"Prep packet is in MinIO; file manually at "
f"{'Screenshot: MinIO ' + screenshot_key if screenshot_key else ''}"
f"\nPrep packet is in MinIO; file manually at "
"https://forms.universalservice.org/.",
)
return None, ""

View file

@ -477,10 +477,33 @@ class RMDFilingHandler(BaseServiceHandler):
except Exception as exc:
logger.exception("RMDFilingHandler: portal submission failed: %s", exc)
# Upload screenshot + alert ops via Telegram
screenshot_key = None
try:
from scripts.workers.services.telecom.playwright_monitor import (
upload_failure_screenshot_async, alert_playwright_failure,
)
screenshot_key = await upload_failure_screenshot_async(
page, order_number, "rmd-filing", work_dir,
)
alert_playwright_failure(
order_number=order_number,
service_slug="rmd-filing",
service_name="RMD Registration",
entity_name=entity.get("legal_name", ""),
error=exc,
screenshot_key=screenshot_key,
portal_url="https://fccprod.servicenowservices.com/rmd",
)
except Exception as alert_exc:
logger.warning("Playwright failure alert failed: %s", alert_exc)
self._create_admin_todo(
order_number,
f"Automated RMD submission failed for FRN {frn}: {exc}. "
"Inspect worker logs for the screenshot/video, then file manually.",
f"{'Screenshot: MinIO ' + screenshot_key if screenshot_key else 'No screenshot captured.'}"
f"\nFile manually at https://fccprod.servicenowservices.com/rmd",
)
return None, ""

View file

@ -0,0 +1,352 @@
"""Playwright failure monitoring and alerting.
Provides:
1. alert_playwright_failure() — sends Telegram + email notification when
   a Playwright submission fails, with screenshot link
2. upload_failure_screenshot() / upload_failure_screenshot_async() — save a
   screenshot to MinIO for debugging
3. run_selector_health_check() — scheduled probe that verifies selectors are
   still valid on target portals without submitting anything
Used by all Playwright-based handlers (RMD, 499-A, CPNI, CORES, BDC, etc.)
"""
from __future__ import annotations
import logging
import os
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional
# Module-wide logger; the name is a fixed string rather than derived from __name__.
logger = logging.getLogger("workers.services.telecom.playwright_monitor")
# Recipient of alert emails; overridable through the ADMIN_EMAIL env var.
ADMIN_EMAIL = os.getenv("ADMIN_EMAIL", "ops@performancewest.net")
# Bucket name used when building screenshot links; overridable via MINIO_BUCKET.
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "performancewest")
def upload_failure_screenshot(
    page,
    order_number: str,
    service_slug: str,
    work_dir: Optional[str] = None,
) -> Optional[str]:
    """Take a full-page screenshot and upload it to MinIO.

    Synchronous entry point. Never raises: every failure is logged and
    reported as ``None`` so a failing screenshot cannot mask the original
    submission error.

    Args:
        page: Playwright page. If ``page.screenshot`` is a coroutine
            function the page is treated as async-API, otherwise as sync-API.
        order_number: Order identifier; used in the temp-dir prefix and in
            the MinIO object key.
        service_slug: Service identifier embedded in the file name.
        work_dir: Directory for the local PNG. A fresh temp directory is
            created when omitted.

    Returns:
        The MinIO object key (``compliance/<order>/errors/<file>.png``), or
        ``None`` if the capture or the upload failed.
    """
    import asyncio

    try:
        work_dir = work_dir or tempfile.mkdtemp(prefix=f"pw_fail_{order_number}_")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"playwright_error_{service_slug}_{timestamp}.png"
        local_path = os.path.join(work_dir, filename)

        if asyncio.iscoroutinefunction(getattr(page, "screenshot", None)):
            # An async page can only be driven from here when no loop is
            # running in this thread. Check BEFORE creating the coroutine:
            # run_until_complete() on a running loop raises RuntimeError and
            # would leave a never-awaited coroutine behind.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                loop_running = False
            else:
                loop_running = True
            if loop_running:
                logger.warning(
                    "Screenshot capture failed: async page passed to the sync "
                    "helper while an event loop is running; use "
                    "upload_failure_screenshot_async() instead"
                )
                return None
            # Reuse the thread's current loop (not a brand-new one) since the
            # page is presumably bound to the loop it was created on.
            loop = asyncio.get_event_loop()
            loop.run_until_complete(page.screenshot(path=local_path, full_page=True))
        else:
            page.screenshot(path=local_path, full_page=True)

        # Upload to MinIO. The project import is kept local so a missing
        # storage client only degrades this helper instead of the module.
        minio_key = f"compliance/{order_number}/errors/{filename}"
        try:
            from scripts.document_gen.minio_client import MinIOStorage

            storage = MinIOStorage()
            storage.upload(local_path, minio_key)
            logger.info("Failure screenshot uploaded: %s", minio_key)
            return minio_key
        except Exception as exc:
            logger.warning("MinIO screenshot upload failed: %s", exc)
            return None
    except Exception as exc:
        logger.warning("Screenshot capture failed: %s", exc)
        return None
async def upload_failure_screenshot_async(
    page,
    order_number: str,
    service_slug: str,
    work_dir: Optional[str] = None,
) -> Optional[str]:
    """Capture a full-page screenshot from an async page and push it to MinIO.

    Best-effort: any failure is logged and turned into ``None``.

    Args:
        page: Playwright async-API page; ``page.screenshot`` is awaited.
        order_number: Order identifier used in the temp-dir prefix and in
            the MinIO object key.
        service_slug: Service identifier embedded in the file name.
        work_dir: Directory for the local PNG; a temp directory is created
            when it is missing or empty.

    Returns:
        The MinIO object key on success, otherwise ``None``.
    """
    try:
        if not work_dir:
            work_dir = tempfile.mkdtemp(prefix=f"pw_fail_{order_number}_")
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        shot_name = f"playwright_error_{service_slug}_{stamp}.png"
        shot_path = os.path.join(work_dir, shot_name)

        await page.screenshot(path=shot_path, full_page=True)

        object_key = f"compliance/{order_number}/errors/{shot_name}"
        try:
            from scripts.document_gen.minio_client import MinIOStorage

            MinIOStorage().upload(shot_path, object_key)
            logger.info("Failure screenshot uploaded: %s", object_key)
            return object_key
        except Exception as exc:
            # Upload problems are reported separately from capture problems.
            logger.warning("MinIO screenshot upload failed: %s", exc)
            return None
    except Exception as exc:
        logger.warning("Async screenshot capture failed: %s", exc)
        return None
def alert_playwright_failure(
    order_number: str,
    service_slug: str,
    service_name: str,
    entity_name: str,
    error: Exception,
    screenshot_key: Optional[str] = None,
    portal_url: str = "",
) -> None:
    """Send a Telegram and an email alert when a Playwright submission fails.

    Fires immediately so ops knows a client order is stuck.

    Args:
        order_number: Identifier of the affected order.
        service_slug: Machine name of the service, shown next to its name.
        service_name: Human-readable service name.
        entity_name: Client entity name; may be empty.
        error: The exception that aborted the submission. Only the first
            300 characters of ``str(error)`` are included.
        screenshot_key: MinIO object key of the failure screenshot, if one
            was captured; rendered as a console link.
        portal_url: Portal where the filing can be completed manually.
    """
    error_msg = str(error)[:300]

    screenshot_info = ""
    if screenshot_key:
        minio_url = os.environ.get("MINIO_CONSOLE_URL", "https://minio-console.performancewest.net")
        screenshot_info = f"\nScreenshot: {minio_url}/browser/{MINIO_BUCKET}/{screenshot_key}"

    alert_body = (
        f"🔴 PLAYWRIGHT FAILURE\n\n"
        f"Service: {service_name} ({service_slug})\n"
        f"Order: {order_number}\n"
        f"Entity: {entity_name}\n"
        f"Portal: {portal_url}\n\n"
        f"Error: {error_msg}\n"
        f"{screenshot_info}\n\n"
        f"The client's order is stuck. Either:\n"
        f"1. Fix the selector and re-dispatch the order\n"
        f"2. File manually at the portal URL above"
    )

    # Telegram alert
    _send_telegram_alert(alert_body)

    # Email alert. Service and entity are joined with an explicit separator
    # (previously they ran together), skipping the entity when it is empty.
    subject_target = " — ".join(part for part in (service_name, entity_name) if part)
    subject = f"[PLAYWRIGHT FAIL] {subject_target} ({order_number})"
    # Collapse whitespace so a stray newline in a name cannot produce an
    # invalid multi-line Subject header and cost us the email.
    subject = " ".join(subject.split())
    _send_email_alert(subject=subject, body=alert_body)
def _send_telegram_alert(message: str) -> None:
    """Post ``message`` to the configured Telegram chat.

    Reads ``TELEGRAM_BOT_TOKEN`` and ``TELEGRAM_CHAT_ID`` from the
    environment and skips (with a debug log) when either is missing.
    Best-effort: every exception is logged as a warning and swallowed so
    alerting can never break the caller.

    Args:
        message: Alert text; truncated to 4000 characters before sending.
    """
    try:
        bot_token = os.environ.get("TELEGRAM_BOT_TOKEN", "")
        chat_id = os.environ.get("TELEGRAM_CHAT_ID", "")
        if not bot_token or not chat_id:
            logger.debug("Telegram not configured — skipping alert")
            return

        import urllib.parse
        import urllib.request

        payload = urllib.parse.urlencode({
            "chat_id": chat_id,
            # Truncate for Telegram's 4096 char limit
            "text": message[:4000],
            # NOTE(review): empty parse_mode presumably means plain text — confirm
            "parse_mode": "",
        }).encode()
        req = urllib.request.Request(
            f"https://api.telegram.org/bot{bot_token}/sendMessage",
            data=payload,
            method="POST",
        )
        # Close the HTTP response explicitly instead of leaking the socket
        # until garbage collection.
        with urllib.request.urlopen(req, timeout=10):
            pass
        logger.info("Telegram playwright failure alert sent")
    except Exception as exc:
        logger.warning("Telegram alert failed: %s", exc)
def _send_email_alert(subject: str, body: str) -> None:
    """Send a plain-text failure alert email to ``ADMIN_EMAIL``.

    SMTP settings come from ``SMTP_HOST``, ``SMTP_PORT``, ``SMTP_USER``,
    ``SMTP_PASS`` and ``SMTP_FROM``. When no credentials are configured the
    alert is skipped with a debug log. Best-effort: every exception is
    logged as a warning and swallowed.

    Args:
        subject: Value for the Subject header.
        body: Plain-text message body.
    """
    try:
        import smtplib
        from email.mime.text import MIMEText

        smtp_host = os.environ.get("SMTP_HOST", "co.carrierone.com")
        smtp_port = int(os.environ.get("SMTP_PORT", "587"))
        smtp_user = os.environ.get("SMTP_USER", "")
        smtp_pass = os.environ.get("SMTP_PASS", "")
        if not smtp_user or not smtp_pass:
            # Leave a trace of the skip, matching the Telegram helper, so a
            # missing email can be explained from the logs.
            logger.debug("SMTP not configured — skipping email alert")
            return

        msg = MIMEText(body)
        msg["From"] = os.environ.get("SMTP_FROM", "Performance West <noreply@performancewest.net>")
        msg["To"] = ADMIN_EMAIL
        msg["Subject"] = subject

        with smtplib.SMTP(smtp_host, smtp_port, timeout=15) as s:
            s.starttls()
            s.login(smtp_user, smtp_pass)
            s.send_message(msg)
        logger.info("Playwright failure email sent to %s", ADMIN_EMAIL)
    except Exception as exc:
        logger.warning("Failure email send failed: %s", exc)
# ═══════════════════════════════════════════════════════════════════════════
# Proactive Selector Health Check
# ═══════════════════════════════════════════════════════════════════════════
# Portal definitions — each portal has a URL, expected selectors, and a
# login state file. The health check navigates to the portal and verifies
# all critical selectors exist without submitting anything.
# Table driving run_selector_health_check(). Per entry:
#   name           label used in logs and alerts
#   url            page the probe navigates to
#   storage_state  saved browser session file, or None for no saved session
#   selectors      selectors that must each match at least one element
#   service_slugs  services reported as affected when a selector is missing
PORTAL_CHECKS = [
    {
        "name": "FCC RMD Portal",
        "url": "https://fccprod.servicenowservices.com/rmd",
        "storage_state": "/app/data/rmd_session.json",
        "selectors": [
            'text="File Certification"',
            'input[name="frn"]',
        ],
        "service_slugs": ["rmd-filing"],
    },
    {
        "name": "USAC E-File",
        "url": "https://forms.universalservice.org",
        "storage_state": "/app/data/usac_session.json",
        "selectors": [
            'text="Form 499-A"',
            'text="Form 499-Q"',
        ],
        "service_slugs": [
            "fcc-499a",
            "fcc-499a-zero",
            "fcc-499a-499q",
            "fcc-499q",
        ],
    },
    {
        "name": "FCC CPNI (ECFS)",
        "url": "https://www.fcc.gov/ecfs/search/search-filings",
        # No session file is configured for this portal.
        "storage_state": None,
        "selectors": [
            'input[id*="search"], input[name*="search"]',
        ],
        "service_slugs": ["cpni-certification"],
    },
]
async def run_selector_health_check(dry_run: bool = False) -> list[dict]:
    """Probe every portal in ``PORTAL_CHECKS`` for missing selectors.

    For each portal a headless Chromium context is opened (with the saved
    session file when one is configured), the portal URL is loaded, and
    each selector is counted. Nothing is submitted.

    Portals whose session file does not exist are skipped and are not
    reported. Missing selectors trigger a Telegram and an email alert
    unless ``dry_run`` is set. Navigation or probe errors are recorded in
    the result but do not send an alert.

    Args:
        dry_run: When true, check and log but send no alerts.

    Returns:
        One dict per failed portal with ``portal``, ``url`` and ``affects``,
        plus either ``missing_selectors`` or ``error``. Empty when all
        checks pass or when Playwright is not installed.
    """
    failures = []

    try:
        from playwright.async_api import async_playwright
    except ImportError:
        logger.warning("Playwright not available — skipping health check")
        return []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            for portal in PORTAL_CHECKS:
                name = portal["name"]
                url = portal["url"]
                state_file = portal.get("storage_state")

                # Skip if no session file exists (can't access authenticated portals)
                if state_file and not os.path.exists(state_file):
                    logger.info("Health check: %s — no session file, skipping", name)
                    continue

                context = None
                try:
                    context_kwargs = {}
                    if state_file:
                        context_kwargs["storage_state"] = state_file
                    context = await browser.new_context(**context_kwargs)
                    page = await context.new_page()
                    await page.goto(url, timeout=30000)

                    missing = []
                    for selector in portal["selectors"]:
                        try:
                            if await page.locator(selector).count() == 0:
                                missing.append(selector)
                        except Exception:
                            # A selector that cannot even be evaluated is
                            # treated the same as one that matched nothing.
                            missing.append(selector)

                    if missing:
                        failures.append({
                            "portal": name,
                            "url": url,
                            "missing_selectors": missing,
                            "affects": portal["service_slugs"],
                        })
                        logger.warning(
                            "Health check FAILED: %s — missing selectors: %s",
                            name, missing,
                        )
                        if not dry_run:
                            alert_body = (
                                f"⚠️ PORTAL UI CHANGE DETECTED\n\n"
                                f"Portal: {name}\n"
                                f"URL: {url}\n"
                                f"Missing selectors:\n"
                                + "\n".join(missing)
                                + f"\n\nAffected services: {', '.join(portal['service_slugs'])}\n\n"
                                f"Playwright automation will FAIL for these services until "
                                f"selectors are updated. Check the portal for UI changes."
                            )
                            _send_telegram_alert(alert_body)
                            _send_email_alert(
                                subject=f"[SELECTOR ALERT] {name} — UI change detected",
                                body=alert_body,
                            )
                    else:
                        logger.info("Health check OK: %s — all selectors present", name)
                except Exception as exc:
                    logger.warning("Health check error for %s: %s", name, exc)
                    failures.append({
                        "portal": name,
                        "url": url,
                        "error": str(exc),
                        "affects": portal["service_slugs"],
                    })
                finally:
                    # Close the context on the error path too, so a failed
                    # navigation does not leak it for the rest of the run.
                    if context is not None:
                        try:
                            await context.close()
                        except Exception as close_exc:
                            logger.debug(
                                "Health check: closing context for %s failed: %s",
                                name, close_exc,
                            )
        finally:
            # Always release the browser, even on an unexpected error.
            await browser.close()

    return failures
def main():
    """Run the selector health check from the command line and print a summary.

    Accepts ``--dry-run`` to check without sending alerts. Prints one line
    per failing portal, or a single all-clear line.
    """
    import argparse
    import asyncio

    logging.basicConfig(
        format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Playwright selector health check")
    cli.add_argument("--dry-run", action="store_true", help="Check but don't alert")
    options = cli.parse_args()

    failures = asyncio.run(run_selector_health_check(dry_run=options.dry_run))

    if not failures:
        print("ALL OK: All portal selectors are valid")
        return

    print(f"FAILED: {len(failures)} portal(s) have selector issues")
    for failure in failures:
        # Prefer the missing-selector list; fall back to the error text.
        detail = failure.get('missing_selectors', failure.get('error', '?'))
        print(f" {failure['portal']}: {detail}")


if __name__ == "__main__":
    main()