new-site/scripts/workers/services/telecom/undetected_browser.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

221 lines
7.4 KiB
Python

"""Undetected Playwright launcher for FCC / USAC / BDC portals.
All automated filing handlers (RMD, CPNI, Form 499-A, BDC) go through this
helper instead of importing ``playwright`` directly. We prefer ``patchright``
(a drop-in Playwright replacement that patches ``navigator.webdriver``, CDP
leakage, runtime-enable fingerprints, and the
``--disable-blink-features=AutomationControlled`` artifact) and fall back to
vanilla Playwright with the same stealth init scripts we use in
``scripts/formation/base.py`` if patchright is not installed.
State formation portals that sit behind Incapsula/Akamai (Nevada, Delaware,
etc.) should also use this helper — see ``docs/state-automation-status.md``
for the list.
Optional residential proxy support: set ``UNDETECTED_PROXY_URL`` in the
environment (e.g. ``http://user:pass@proxy.example.com:8080``) and pass
``use_proxy=True`` when launching.
"""
from __future__ import annotations
import logging
import os
import random
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING, AsyncIterator
if TYPE_CHECKING:
from playwright.async_api import Browser, BrowserContext, Page, Playwright
logger = logging.getLogger(__name__)
# Backend selection: patchright is the preferred driver; when it cannot be
# imported we use stock playwright and rely on the manual stealth patches
# defined further down. ``_USING_PATCHRIGHT`` records which one was loaded.
try:
    from patchright.async_api import async_playwright  # type: ignore
except ImportError:
    from playwright.async_api import async_playwright  # type: ignore

    _USING_PATCHRIGHT = False
    logger.warning(
        "undetected_browser: patchright not installed — falling back to "
        "vanilla playwright with home-grown stealth patches. Install with "
        "`pip install patchright` and run `patchright install chromium` "
        "for best results against bot-detection-heavy portals."
    )
else:
    _USING_PATCHRIGHT = True
    logger.info("undetected_browser: using patchright")
# Pool of desktop Chrome user-agent strings. One is drawn at random for each
# new context so that a burst of concurrent submissions does not present a
# single identical client. Each entry is (platform token, Chrome major).
_USER_AGENTS = [
    f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 "
    f"(KHTML, like Gecko) Chrome/{chrome_major}.0.0.0 Safari/537.36"
    for platform, chrome_major in (
        ("Windows NT 10.0; Win64; x64", 131),
        ("Macintosh; Intel Mac OS X 10_15_7", 131),
        ("Windows NT 10.0; Win64; x64", 130),
    )
]
# Pool of common desktop viewport sizes. One entry is picked at random for
# each new context to vary the fingerprint between launches.
_VIEWPORTS = [
    {"width": width, "height": height}
    for width, height in (
        (1280, 900),
        (1440, 900),
        (1536, 864),
        (1920, 1080),
    )
]
# Init script run on every page — only used on the vanilla-playwright path;
# patchright handles all of these patches (and many more) internally.
#
# What the script does, in order:
#   * makes ``navigator.webdriver`` read as ``undefined``;
#   * reports five placeholder entries for ``navigator.plugins``;
#   * reports ``['en-US', 'en']`` for ``navigator.languages``;
#   * defines a minimal ``window.chrome.runtime`` object;
#   * answers ``permissions.query({name: 'notifications'})`` from
#     ``Notification.permission`` and forwards every other query to the
#     native implementation.
#
# The native ``query`` method is bound to its ``Permissions`` object before
# being stored: invoking a detached native method throws "Illegal invocation",
# which would break any page that queries a non-notification permission.
# The whole script is wrapped in an IIFE so its helper bindings stay out of
# the page's global scope.
_STEALTH_INIT_SCRIPT = """
(() => {
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  Object.defineProperty(navigator, 'plugins', {
    get: () => [1, 2, 3, 4, 5].map(() => ({ name: 'Chrome PDF Plugin' })),
  });
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
  window.chrome = { runtime: {} };
  const permissions = window.navigator.permissions;
  if (permissions && permissions.query) {
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters);
  }
})();
"""
def _proxy_config() -> dict | None:
    """Read UNDETECTED_PROXY_URL and turn it into a Playwright proxy dict.

    The value may be a full URL (``http://user:pass@host:8080``) or the
    short ``host:port`` form, which is treated as an HTTP proxy. Credentials
    embedded in the URL are percent-decoded before being returned, so
    reserved characters such as ``@`` or ``:`` can be supplied as ``%40`` /
    ``%3A``.

    Returns:
        ``None`` when the variable is unset or blank. Otherwise a dict with a
        ``server`` key, plus ``username`` / ``password`` keys when the URL
        carries them.

    Raises:
        ValueError: if the value has no host component or an invalid port.
    """
    # Playwright's proxy dict supports: server, username, password, bypass
    from urllib.parse import unquote, urlsplit

    raw = os.environ.get("UNDETECTED_PROXY_URL", "").strip()
    if not raw:
        return None

    # Without a scheme, urllib reads "host:port" as scheme + path and reports
    # no hostname at all, so give the short form an explicit HTTP scheme.
    if "://" not in raw:
        raw = f"http://{raw}"

    parts = urlsplit(raw)
    host = parts.hostname
    if not host:
        # The URL is deliberately not echoed: it may contain credentials.
        raise ValueError("UNDETECTED_PROXY_URL does not contain a proxy host")
    if ":" in host:
        # ``hostname`` strips the brackets from IPv6 literals; restore them
        # so the port separator stays unambiguous.
        host = f"[{host}]"

    server = f"{parts.scheme}://{host}"
    port = parts.port  # raises ValueError for a non-numeric / out-of-range port
    if port:
        server += f":{port}"

    cfg: dict = {"server": server}
    if parts.username:
        cfg["username"] = unquote(parts.username)
    if parts.password:
        cfg["password"] = unquote(parts.password)
    return cfg
async def launch_context(
    playwright: "Playwright",
    *,
    headless: bool = True,
    use_proxy: bool = False,
    timezone_id: str = "America/New_York",
    locale: str = "en-US",
    storage_state: str | None = None,
) -> "tuple[Browser, BrowserContext]":
    """Launch a Chromium browser + context with stealth settings.

    Args:
        playwright: A started Playwright (or patchright) instance.
        headless: Forwarded to ``chromium.launch``.
        use_proxy: When true, attach the proxy described by
            ``UNDETECTED_PROXY_URL`` to the context. If the variable is
            unset a warning is logged and the context is created without
            a proxy.
        timezone_id: Timezone reported by the browser context.
        locale: Locale reported by the browser context.
        storage_state: Optional storage state forwarded to ``new_context``;
            ignored when empty or ``None``.

    Returns ``(browser, context)`` — caller is responsible for closing both
    (prefer the :func:`undetected_browser` context manager instead).

    If the context cannot be created or configured, the freshly launched
    browser is closed before the exception propagates, because the caller
    never receives a handle it could close itself.
    """
    # NOTE(review): sandbox / dev-shm flags are presumably here for running
    # inside containers — confirm against the deployment environment.
    launch_args = [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
    ]
    # On the vanilla-playwright path we add the extra flag that hides the
    # AutomationControlled fingerprint. Patchright already does this (and
    # adding the flag with patchright is harmless but redundant).
    if not _USING_PATCHRIGHT:
        launch_args.append("--disable-blink-features=AutomationControlled")

    # Resolve every context option *before* starting the browser so that a
    # configuration error (e.g. a malformed proxy URL) cannot strand a
    # running browser process.
    context_kwargs: dict = {
        "viewport": random.choice(_VIEWPORTS),
        "user_agent": random.choice(_USER_AGENTS),
        "locale": locale,
        "timezone_id": timezone_id,
        "java_script_enabled": True,
    }
    if use_proxy:
        proxy = _proxy_config()
        if proxy:
            context_kwargs["proxy"] = proxy
        else:
            logger.warning(
                "undetected_browser: use_proxy=True but UNDETECTED_PROXY_URL is unset"
            )
    if storage_state:
        context_kwargs["storage_state"] = storage_state

    browser = await playwright.chromium.launch(
        headless=headless,
        args=launch_args,
    )
    try:
        context = await browser.new_context(**context_kwargs)
        # Patchright applies its own patches; the manual init script is only
        # needed on the vanilla-playwright path.
        if not _USING_PATCHRIGHT:
            await context.add_init_script(_STEALTH_INIT_SCRIPT)
    except BaseException:
        # Includes cancellation: release the browser, then re-raise unchanged.
        await browser.close()
        raise
    return browser, context
@asynccontextmanager
async def undetected_browser(
    *,
    headless: bool = True,
    use_proxy: bool = False,
    timezone_id: str = "America/New_York",
    locale: str = "en-US",
    storage_state: str | None = None,
) -> AsyncIterator["tuple[BrowserContext, Page]"]:
    """Async context manager yielding a (context, page) pair.

    Starts the Playwright driver, launches a browser and context via
    :func:`launch_context` (all keyword arguments are forwarded unchanged),
    opens one page, and tears everything down on exit — including when the
    body of the ``async with`` raises.

    Example::

        async with undetected_browser(headless=False) as (ctx, page):
            await page.goto("https://apps.fcc.gov/rmd/")
            ...
    """
    async with async_playwright() as pw:
        browser, context = await launch_context(
            pw,
            headless=headless,
            use_proxy=use_proxy,
            timezone_id=timezone_id,
            locale=locale,
            storage_state=storage_state,
        )
        try:
            page = await context.new_page()
            yield context, page
        finally:
            # Nested so that a failure while closing the context cannot
            # prevent the browser itself from being closed.
            try:
                await context.close()
            finally:
                await browser.close()
# ─── Human-like interaction helpers (lifted from scripts/formation/base.py) ──
async def human_delay(min_s: float = 1.0, max_s: float = 3.0) -> None:
    """Sleep for a random interval so scripted actions look less mechanical.

    The pause length is drawn uniformly between ``min_s`` and ``max_s``
    seconds.
    """
    import asyncio

    pause_seconds = random.uniform(min_s, max_s)
    await asyncio.sleep(pause_seconds)
async def type_slowly(page: "Page", selector: str, text: str, delay_ms: int = 50) -> None:
    """Click ``selector`` and then enter ``text`` one character at a time.

    Every character is sent with its own ``page.type`` call whose ``delay``
    is ``delay_ms`` plus a random 0-30 ms of jitter.
    """
    await page.click(selector)
    for keystroke in text:
        jittered_delay = delay_ms + random.randint(0, 30)
        await page.type(selector, keystroke, delay=jittered_delay)
def is_using_patchright() -> bool:
    """Report which backend was loaded at import time.

    Returns ``True`` when patchright was imported successfully and ``False``
    when the module fell back to vanilla playwright.
    """
    patchright_active = _USING_PATCHRIGHT
    return patchright_active