The residential proxy password contains a '#', which urlparse() misreads as a
URL fragment and corrupts the port (ValueError: Port could not be cast...).
Parse scheme://creds@host:port manually and percent-decode user/pass so both
raw ('#') and encoded ('%23') passwords work. Verified against the live
credential.
269 lines
9.7 KiB
Python
269 lines
9.7 KiB
Python
"""Undetected Playwright launcher for FCC / USAC / BDC portals.
|
|
|
|
All automated filing handlers (RMD, CPNI, Form 499-A, BDC) go through this
helper instead of importing ``playwright`` directly. We prefer ``patchright``
(a drop-in Playwright replacement that patches ``navigator.webdriver``, CDP
leakage, runtime-enable fingerprints, and the ``--disable-blink-features=
AutomationControlled`` artifact) and fall back to vanilla Playwright with
the same stealth init scripts we use in ``scripts/formation/base.py`` if
patchright is not installed.

State formation portals that sit behind Incapsula/Akamai (Nevada, Delaware,
etc.) should also use this helper — see ``docs/state-automation-status.md``
for the list.

Optional residential proxy support: set ``UNDETECTED_PROXY_URL`` in the
environment (e.g. ``socks5://user:pass@proxy.example.com:1080``) and pass
``use_proxy=True`` when launching. Healthcare NPPES/PECOS flows use a
dedicated ``HEALTHCARE_PROXY_URL`` (residential SOCKS proxy, username
``performancewest``) via ``use_proxy="HEALTHCARE_PROXY_URL"``; it falls back
to ``UNDETECTED_PROXY_URL`` if the healthcare-specific var is unset.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import random
|
|
from contextlib import asynccontextmanager
|
|
from typing import TYPE_CHECKING, AsyncIterator
|
|
|
|
if TYPE_CHECKING:
|
|
from playwright.async_api import Browser, BrowserContext, Page, Playwright
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Backend selection. patchright is the preferred driver; when it cannot be
# imported we use stock playwright and rely on the stealth patches defined in
# this module. ``_USING_PATCHRIGHT`` records which branch was taken so the
# launcher knows whether those manual patches are needed.
try:
    from patchright.async_api import async_playwright  # type: ignore
except ImportError:
    from playwright.async_api import async_playwright  # type: ignore

    _USING_PATCHRIGHT = False
    logger.warning(
        "undetected_browser: patchright not installed — falling back to "
        "vanilla playwright with home-grown stealth patches. Install with "
        "`pip install patchright` and run `patchright install chromium` "
        "for best results against bot-detection-heavy portals."
    )
else:
    _USING_PATCHRIGHT = True
    logger.info("undetected_browser: using patchright")
|
|
|
|
|
|
# Desktop Chrome user-agent strings. One is chosen at random for each browser
# context so that a burst of concurrent submissions does not present a single
# identical client. Each entry is (platform token, Chrome major version).
_USER_AGENTS = [
    f"Mozilla/5.0 ({platform_token}) AppleWebKit/537.36 "
    f"(KHTML, like Gecko) Chrome/{chrome_major}.0.0.0 Safari/537.36"
    for platform_token, chrome_major in (
        ("Windows NT 10.0; Win64; x64", 131),
        ("Macintosh; Intel Mac OS X 10_15_7", 131),
        ("Windows NT 10.0; Win64; x64", 130),
    )
]
|
|
|
|
# Common desktop viewport sizes as (width, height). One is chosen at random
# for each browser context to vary the fingerprint between launches.
_VIEWPORTS = [
    {"width": width, "height": height}
    for width, height in (
        (1280, 900),
        (1440, 900),
        (1536, 864),
        (1920, 1080),
    )
]
|
|
|
|
# JavaScript injected into every page of a context. It is only installed on
# the vanilla-playwright path (see ``launch_context``); patchright is expected
# to cover these patches itself.
#
# What the script does:
#   * hides ``navigator.webdriver``;
#   * fakes a non-empty ``navigator.plugins`` and ``navigator.languages``;
#   * defines ``window.chrome.runtime``;
#   * answers ``permissions.query({name: 'notifications'})`` from
#     ``Notification.permission`` and delegates every other query to the
#     native implementation.
#
# The native ``query`` must be invoked with ``permissions`` as its receiver:
# calling the detached function reference throws "Illegal invocation", which
# breaks pages that query other permissions and is itself a detectable tell.
# The whole script is wrapped in an IIFE so its helper bindings do not end up
# as page-visible globals.
_STEALTH_INIT_SCRIPT = """
(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5].map(() => ({ name: 'Chrome PDF Plugin' })),
    });
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    window.chrome = { runtime: {} };
    const permissions = window.navigator.permissions;
    const originalQuery = permissions && permissions.query;
    if (originalQuery) {
        permissions.query = (parameters) =>
            parameters.name === 'notifications'
                ? Promise.resolve({ state: Notification.permission })
                : originalQuery.call(permissions, parameters);
    }
})();
"""
|
|
|
|
|
|
def _proxy_config(proxy_env: str = "UNDETECTED_PROXY_URL") -> dict | None:
    """Build a Playwright ``proxy`` dict from a proxy URL held in the environment.

    The variable named by ``proxy_env`` is read first. If it is empty and is
    not already ``UNDETECTED_PROXY_URL``, the generic ``UNDETECTED_PROXY_URL``
    is used instead, so one shared proxy can back several domain-specific
    variable names (e.g. ``use_proxy="HEALTHCARE_PROXY_URL"``).

    Accepted shape: ``[scheme://][user[:password]@]host:port``, for example
    ``socks5://user:pass@host:port``. The URL is split by hand rather than
    with ``urlparse()`` so that credentials may contain URL-reserved
    characters such as ``#`` or ``@``. Username and password are
    percent-decoded, so a password stored as ``%23`` and one stored as a raw
    ``#`` both yield ``#``.

    Returns:
        ``None`` when no URL is configured; otherwise a dict with ``server``
        and, when present in the URL, ``username`` and/or ``password``.
    """
    generic_env = "UNDETECTED_PROXY_URL"

    raw_url = os.environ.get(proxy_env, "").strip()
    if not raw_url and proxy_env != generic_env:
        raw_url = os.environ.get(generic_env, "").strip()
    if not raw_url:
        return None

    from urllib.parse import unquote

    # Peel off the scheme, if any.
    scheme, scheme_sep, remainder = raw_url.partition("://")
    if not scheme_sep:
        scheme, remainder = "", raw_url

    # Split on the LAST '@' so an '@' inside the password stays with the
    # credentials. Without any '@', userinfo is empty and endpoint is the
    # whole remainder.
    userinfo, _, endpoint = remainder.rpartition("@")

    proxy: dict = {"server": f"{scheme}://{endpoint}" if scheme else endpoint}

    # The first ':' separates user from password; any later ':' belongs to
    # the password.
    username, _, password = userinfo.partition(":")
    if username:
        proxy["username"] = unquote(username)
    if password:
        proxy["password"] = unquote(password)
    return proxy
|
|
|
|
|
|
async def launch_context(
    playwright: "Playwright",
    *,
    headless: bool = True,
    use_proxy: "bool | str" = False,
    timezone_id: str = "America/New_York",
    locale: str = "en-US",
    storage_state: str | None = None,
) -> "tuple[Browser, BrowserContext]":
    """Launch a Chromium browser + context with stealth settings.

    Args:
        playwright: A started Playwright (or patchright) driver object.
        headless: Passed through to ``chromium.launch``.
        use_proxy: ``False`` for no proxy, ``True`` to read
            ``UNDETECTED_PROXY_URL``, or the name of a specific env var
            (e.g. ``"HEALTHCARE_PROXY_URL"``). If the variable resolves to
            nothing, a warning is logged and the context is created
            WITHOUT a proxy.
        timezone_id: Timezone applied to the context.
        locale: Locale applied to the context.
        storage_state: Optional storage state forwarded to ``new_context``.

    Returns:
        ``(browser, context)`` — the caller is responsible for closing both
        (prefer the :func:`undetected_browser` context manager instead).

    Raises:
        Whatever ``chromium.launch``, ``new_context`` or ``add_init_script``
        raise. If anything fails after the browser has been launched, the
        browser is closed before the exception propagates, so a failed call
        does not leave a browser process behind.
    """
    launch_args = [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
    ]
    # On the vanilla-playwright path we add the flag that hides the
    # AutomationControlled fingerprint; patchright is expected to handle it.
    if not _USING_PATCHRIGHT:
        launch_args.append("--disable-blink-features=AutomationControlled")

    browser = await playwright.chromium.launch(
        headless=headless,
        args=launch_args,
    )

    # From here on the browser exists but the caller does not hold it yet, so
    # any failure must close it before re-raising.
    try:
        context_kwargs: dict = {
            "viewport": random.choice(_VIEWPORTS),
            "user_agent": random.choice(_USER_AGENTS),
            "locale": locale,
            "timezone_id": timezone_id,
            "java_script_enabled": True,
        }
        if use_proxy:
            proxy_env = use_proxy if isinstance(use_proxy, str) else "UNDETECTED_PROXY_URL"
            proxy = _proxy_config(proxy_env)
            if proxy:
                context_kwargs["proxy"] = proxy
                # Only the server is logged; credentials stay out of the logs.
                logger.info(
                    "undetected_browser: routing through proxy %s (from %s)",
                    proxy.get("server"),
                    proxy_env,
                )
            else:
                logger.warning(
                    "undetected_browser: use_proxy set but %s is unset", proxy_env
                )
        if storage_state:
            context_kwargs["storage_state"] = storage_state

        context = await browser.new_context(**context_kwargs)

        # Manual stealth patches are only needed without patchright.
        if not _USING_PATCHRIGHT:
            await context.add_init_script(_STEALTH_INIT_SCRIPT)
    except BaseException:
        # BaseException so that task cancellation also releases the browser.
        await browser.close()
        raise

    return browser, context
|
|
|
|
|
|
@asynccontextmanager
async def undetected_browser(
    *,
    headless: bool = True,
    use_proxy: "bool | str" = False,
    timezone_id: str = "America/New_York",
    locale: str = "en-US",
    storage_state: str | None = None,
) -> AsyncIterator["tuple[BrowserContext, Page]"]:
    """Async context manager yielding a ``(context, page)`` pair.

    Starts the driver, launches a browser and context via
    :func:`launch_context`, opens one page, and tears everything down on
    exit. All keyword arguments are forwarded unchanged to
    :func:`launch_context`.

    ``use_proxy`` may be ``True`` (read ``UNDETECTED_PROXY_URL``) or the name
    of a specific env var, e.g. ``"HEALTHCARE_PROXY_URL"`` for the residential
    proxy the NPPES/PECOS flows route through.

    Example::

        async with undetected_browser(headless=False) as (ctx, page):
            await page.goto("https://apps.fcc.gov/rmd/")
            ...
    """
    async with async_playwright() as pw:
        browser, context = await launch_context(
            pw,
            headless=headless,
            use_proxy=use_proxy,
            timezone_id=timezone_id,
            locale=locale,
            storage_state=storage_state,
        )
        try:
            page = await context.new_page()
            yield context, page
        finally:
            # Nested so that an error while closing the context cannot skip
            # closing the browser.
            try:
                await context.close()
            finally:
                await browser.close()
|
|
|
|
|
|
# ─── Human-like interaction helpers (lifted from scripts/formation/base.py) ──
|
|
|
|
async def human_delay(min_s: float = 1.0, max_s: float = 3.0) -> None:
    """Pause for a random, human-looking interval.

    The duration is drawn uniformly between ``min_s`` and ``max_s`` seconds
    and awaited with ``asyncio.sleep``, so other tasks keep running while
    this one waits.
    """
    import asyncio

    pause_seconds = random.uniform(min_s, max_s)
    await asyncio.sleep(pause_seconds)
|
|
|
|
|
|
async def type_slowly(page: "Page", selector: str, text: str, delay_ms: int = 50) -> None:
    """Enter ``text`` into ``selector`` one character at a time.

    The element is clicked once, then each character is sent with its own
    ``page.type`` call. Every call uses a delay of ``delay_ms`` plus a random
    0-30 ms of jitter, so the typing rhythm is not perfectly regular.
    """
    await page.click(selector)
    for character in text:
        keystroke_delay = delay_ms + random.randint(0, 30)
        await page.type(selector, character, delay=keystroke_delay)
|
|
|
|
|
|
def is_using_patchright() -> bool:
    """Report which backend this module selected at import time.

    Returns:
        The module-level ``_USING_PATCHRIGHT`` flag: ``True`` when patchright
        is in use, ``False`` when the vanilla playwright fallback is.
    """
    patchright_active = _USING_PATCHRIGHT
    return patchright_active
|