From 17318f6e7d938151390299e614f4cc6433eab6fc Mon Sep 17 00:00:00 2001 From: justin Date: Fri, 5 Jun 2026 14:36:01 -0500 Subject: [PATCH] feat(healthcare): route NPPES/PECOS Playwright flows through residential SOCKS proxy CMS healthcare portals (NPPES, PECOS, I&A) block datacenter IPs, so the healthcare browser automation needs to egress via the residential proxy on hg409y7ez04.sn.mynetname.net (username 'performancewest'). - undetected_browser: use_proxy now accepts an env-var name, so callers can select a domain-specific proxy. _proxy_config(proxy_env) reads it and falls back to UNDETECTED_PROXY_URL. Healthcare uses 'HEALTHCARE_PROXY_URL'. - probe_npi_undetected: launches with use_proxy='HEALTHCARE_PROXY_URL' when set. - npi_provider: documents that the (future) automated NPPES/PECOS flows must use the healthcare proxy. - Plumb HEALTHCARE_PROXY_URL (+ UNDETECTED_PROXY_URL fallback) through the ansible env template and docker-compose workers env. The credential itself is NOT in the repo. Set the full URL in the ansible vault as vault_healthcare_proxy_url: socks5://performancewest:@hg409y7ez04.sn.mynetname.net: Verified parsing + Playwright proxy-dict wiring with a unit test. --- docker-compose.yml | 4 ++ infra/ansible/roles/app/templates/app.env.j2 | 10 ++++ scripts/probe_npi_undetected.py | 12 ++++- scripts/workers/services/npi_provider.py | 5 ++ .../services/telecom/undetected_browser.py | 49 +++++++++++++++---- 5 files changed, 70 insertions(+), 10 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 98b59e6..655cb0e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -107,6 +107,10 @@ services: - CRYPTO_SWEEP_ADMIN_EMAIL=${ADMIN_EMAIL:-ops@performancewest.net} - USAC_USERNAME=${USAC_USERNAME} - USAC_PASSWORD=${USAC_PASSWORD} + # Residential SOCKS proxy for healthcare (NPPES/PECOS) Playwright flows. + # Username "performancewest"; full URL set in .env via the ansible vault. 
+ - HEALTHCARE_PROXY_URL=${HEALTHCARE_PROXY_URL:-} + - UNDETECTED_PROXY_URL=${UNDETECTED_PROXY_URL:-} - ANYTIME_MAILBOX_SIGNUP_EMAIL=${ANYTIME_MAILBOX_SIGNUP_EMAIL:-noreply@performancewest.net} - ANYTIME_MAILBOX_SIGNUP_PHONE=${ANYTIME_MAILBOX_SIGNUP_PHONE} - ANYTIME_MAILBOX_DEFAULT_PASSWORD=${ANYTIME_MAILBOX_DEFAULT_PASSWORD} diff --git a/infra/ansible/roles/app/templates/app.env.j2 b/infra/ansible/roles/app/templates/app.env.j2 index 8dd4cc5..5933eec 100644 --- a/infra/ansible/roles/app/templates/app.env.j2 +++ b/infra/ansible/roles/app/templates/app.env.j2 @@ -119,6 +119,16 @@ HESTIA_URL={{ vault_hestia_url | default('https://cp.carrierone.com:8083') }} HESTIA_USER={{ vault_hestia_user | default('admin') }} HESTIA_PASS={{ vault_hestia_pass | default('') }} +# ── Residential proxy (healthcare NPPES/PECOS automation) ──────────────────── +# CMS healthcare portals (NPPES, PECOS, I&A) block datacenter IPs, so the +# Playwright healthcare flows route through a residential SOCKS proxy. +# Format: socks5://performancewest:@hg409y7ez04.sn.mynetname.net: +# (username is "performancewest"). Set the full URL in the ansible vault as +# vault_healthcare_proxy_url. Leave blank to run without a proxy. +# UNDETECTED_PROXY_URL is the generic fallback used by FCC/state flows. +HEALTHCARE_PROXY_URL={{ vault_healthcare_proxy_url | default('') }} +UNDETECTED_PROXY_URL={{ vault_undetected_proxy_url | default(vault_healthcare_proxy_url | default('')) }} + # ── Application URLs ────────────────────────────────────────────────────────── DOMAIN=https://{{ domain }} SITE_URL=https://{{ domain }} diff --git a/scripts/probe_npi_undetected.py b/scripts/probe_npi_undetected.py index efef442..a0fb8d1 100644 --- a/scripts/probe_npi_undetected.py +++ b/scripts/probe_npi_undetected.py @@ -5,6 +5,7 @@ real endpoints and a fingerprint-detection page and prints what it sees. 
Run: python3 scripts/probe_npi_undetected.py """ import asyncio +import os import sys sys.path.insert(0, "scripts") @@ -12,6 +13,12 @@ from workers.services.telecom.undetected_browser import ( # noqa: E402 undetected_browser, is_using_patchright, ) +# Route healthcare (NPPES/PECOS/I&A) traffic through the residential SOCKS +# proxy (username "performancewest"). The proxy is used only when +# HEALTHCARE_PROXY_URL is set to the proxy URL itself (not a flag such as +# "1"); when it is unset or blank the probe launches without a proxy. +USE_HEALTHCARE_PROXY = bool(os.environ.get("HEALTHCARE_PROXY_URL", "").strip()) + TARGETS = [ # NPPES public registry UI (where NPI lookups/updates happen) ("NPPES registry", "https://npiregistry.cms.hhs.gov/"), @@ -28,7 +35,10 @@ SANNYSOFT = "https://bot.sannysoft.com/" async def probe(headless: bool): print(f"\n{'='*60}\nbackend = {'patchright' if is_using_patchright() else 'vanilla-playwright'} | headless={headless}\n{'='*60}") - async with undetected_browser(headless=headless) as (ctx, page): + async with undetected_browser( + headless=headless, + use_proxy="HEALTHCARE_PROXY_URL" if USE_HEALTHCARE_PROXY else False, + ) as (ctx, page): # 1. navigator.webdriver + a couple of fingerprint signals try: await page.goto("about:blank") diff --git a/scripts/workers/services/npi_provider.py b/scripts/workers/services/npi_provider.py index bf7c237..2aa6ab4 100644 --- a/scripts/workers/services/npi_provider.py +++ b/scripts/workers/services/npi_provider.py @@ -5,6 +5,11 @@ NPI + intake details for a human to file in CMS PECOS / NPPES. This mirrors the FCC auto-filing-off safety default — no automated submission to government portals until the Playwright flows are proven. 
+When the Playwright NPPES/PECOS flows are enabled, they must route through the +residential SOCKS proxy (CMS blocks datacenter IPs) by launching with +``undetected_browser(use_proxy="HEALTHCARE_PROXY_URL")`` — the credential +(username ``performancewest``) is configured via HEALTHCARE_PROXY_URL in .env. + Covers slugs: npi-revalidation Medicare PECOS revalidation (5-yr cycle) npi-reactivation reactivate a deactivated NPI diff --git a/scripts/workers/services/telecom/undetected_browser.py b/scripts/workers/services/telecom/undetected_browser.py index aaf888a..79ef4f0 100644 --- a/scripts/workers/services/telecom/undetected_browser.py +++ b/scripts/workers/services/telecom/undetected_browser.py @@ -13,8 +13,11 @@ etc.) should also use this helper — see ``docs/state-automation-status.md`` for the list. Optional residential proxy support: set ``UNDETECTED_PROXY_URL`` in the -environment (e.g. ``http://user:pass@proxy.example.com:8080``) and pass -``use_proxy=True`` when launching. +environment (e.g. ``socks5://user:pass@proxy.example.com:1080``) and pass +``use_proxy=True`` when launching. Healthcare NPPES/PECOS flows use a +dedicated ``HEALTHCARE_PROXY_URL`` (residential SOCKS proxy, username +``performancewest``) via ``use_proxy="HEALTHCARE_PROXY_URL"``; it falls back +to ``UNDETECTED_PROXY_URL`` if the healthcare-specific var is unset. """ from __future__ import annotations @@ -87,9 +90,23 @@ if (originalQuery) { """ -def _proxy_config() -> dict | None: - """Read UNDETECTED_PROXY_URL and turn it into a Playwright proxy dict.""" - url = os.environ.get("UNDETECTED_PROXY_URL", "").strip() +def _proxy_config(proxy_env: str = "UNDETECTED_PROXY_URL") -> dict | None: + """Read a proxy URL env var and turn it into a Playwright proxy dict. + + ``proxy_env`` names the environment variable to read (default + ``UNDETECTED_PROXY_URL``). Callers that need a dedicated upstream — e.g. 
+ the healthcare NPPES/PECOS flows, which route through the residential + SOCKS proxy — pass ``use_proxy="HEALTHCARE_PROXY_URL"`` so the credential + is configured in exactly one place (the env / ansible vault). + + The URL may be ``http://`` or ``socks5://`` and may embed credentials: + ``socks5://user:pass@host:port``. + """ + url = os.environ.get(proxy_env, "").strip() + # Allow a single shared residential proxy to back several domain-specific + # env names: if the requested var is unset, fall back to the generic one. + if not url and proxy_env != "UNDETECTED_PROXY_URL": + url = os.environ.get("UNDETECTED_PROXY_URL", "").strip() if not url: return None @@ -113,13 +130,17 @@ async def launch_context( playwright: "Playwright", *, headless: bool = True, - use_proxy: bool = False, + use_proxy: "bool | str" = False, timezone_id: str = "America/New_York", locale: str = "en-US", storage_state: str | None = None, ) -> "tuple[Browser, BrowserContext]": """Launch a Chromium browser + context with stealth settings. + ``use_proxy`` may be ``True`` (read ``UNDETECTED_PROXY_URL``) or the name + of a specific env var, e.g. ``"HEALTHCARE_PROXY_URL"`` for the residential + proxy used by the NPPES/PECOS healthcare flows. + Returns ``(browser, context)`` — caller is responsible for closing both (prefer the :func:`undetected_browser` context manager instead). 
""" @@ -148,12 +169,18 @@ async def launch_context( "java_script_enabled": True, } if use_proxy: - proxy = _proxy_config() + proxy_env = use_proxy if isinstance(use_proxy, str) else "UNDETECTED_PROXY_URL" + proxy = _proxy_config(proxy_env) if proxy: context_kwargs["proxy"] = proxy + logger.info( + "undetected_browser: routing through proxy %s (from %s)", + proxy.get("server"), + proxy_env, + ) else: logger.warning( - "undetected_browser: use_proxy=True but UNDETECTED_PROXY_URL is unset" + "undetected_browser: use_proxy set but %s is unset", proxy_env ) if storage_state: context_kwargs["storage_state"] = storage_state @@ -170,13 +197,17 @@ async def launch_context( async def undetected_browser( *, headless: bool = True, - use_proxy: bool = False, + use_proxy: "bool | str" = False, timezone_id: str = "America/New_York", locale: str = "en-US", storage_state: str | None = None, ) -> AsyncIterator["tuple[BrowserContext, Page]"]: """Async context manager yielding a (context, page) pair. + ``use_proxy`` may be ``True`` (read ``UNDETECTED_PROXY_URL``) or the name + of a specific env var, e.g. ``"HEALTHCARE_PROXY_URL"`` for the residential + proxy the NPPES/PECOS flows route through. + Example:: async with undetected_browser(headless=False) as (ctx, page):