new-site/scripts/probe_npi_undetected.py
justin 17318f6e7d feat(healthcare): route NPPES/PECOS Playwright flows through residential SOCKS proxy
CMS healthcare portals (NPPES, PECOS, I&A) block datacenter IPs, so the
healthcare browser automation needs to egress via the residential proxy on
hg409y7ez04.sn.mynetname.net (username 'performancewest').

- undetected_browser: use_proxy now accepts an env-var name, so callers can
  select a domain-specific proxy. _proxy_config(proxy_env) reads it and falls
  back to UNDETECTED_PROXY_URL. Healthcare uses 'HEALTHCARE_PROXY_URL'.
- probe_npi_undetected: launches with use_proxy='HEALTHCARE_PROXY_URL' when set.
- npi_provider: documents that the (future) automated NPPES/PECOS flows must
  use the healthcare proxy.
- Plumb HEALTHCARE_PROXY_URL (+ UNDETECTED_PROXY_URL fallback) through the
  ansible env template and docker-compose workers env.

The credential itself is NOT in the repo. Set the full URL in the ansible
vault as vault_healthcare_proxy_url:
  socks5://performancewest:<password>@hg409y7ez04.sn.mynetname.net:<port>
Verified parsing + Playwright proxy-dict wiring with a unit test.
2026-06-05 14:36:01 -05:00

100 lines
4 KiB
Python

"""Probe whether our undetected (patchright) browser can reach NPPES / PECOS
and how detectable it looks. Honest, no assertions from memory — it visits
real endpoints and a fingerprint-detection page and prints what it sees.
Run: python3 scripts/probe_npi_undetected.py
"""
import asyncio
import os
import sys
sys.path.insert(0, "scripts")
from workers.services.telecom.undetected_browser import ( # noqa: E402
undetected_browser, is_using_patchright,
)
# Healthcare portals (NPPES / PECOS / I&A) are probed through the residential
# SOCKS proxy (username "performancewest") whenever HEALTHCARE_PROXY_URL is
# set to a non-blank value; otherwise probe() passes use_proxy=False.
# NOTE(review): this flag only records that the variable is non-blank. The
# value itself is presumably resolved to the real proxy URL inside
# undetected_browser, so a placeholder such as "1" may not yield a usable
# proxy -- confirm before relying on it.
USE_HEALTHCARE_PROXY = os.environ.get("HEALTHCARE_PROXY_URL", "").strip() != ""
# (label, url) pairs visited in order by the reachability step of the probe.
TARGETS = [
    # Public NPPES registry UI (where NPI lookups/updates happen).
    (
        "NPPES registry",
        "https://npiregistry.cms.hhs.gov/",
    ),
    # Public NPPES API (already used by our free tool) -- acts as a sanity check.
    (
        "NPPES API",
        "https://npiregistry.cms.hhs.gov/api/?version=2.1&number=1234567893",
    ),
    # PECOS and I&A (Identity & Access) login surfaces.
    (
        "PECOS portal",
        "https://pecos.cms.hhs.gov/pecos/login.do",
    ),
    (
        "I&A portal",
        "https://nppes.cms.hhs.gov/IAWeb/login.do",
    ),
]

# Public bot-detection fingerprint page used for the final scorecard step.
SANNYSOFT = "https://bot.sannysoft.com/"
# Lower-cased substrings searched for in the first 400 characters of each
# target page's HTML to guess whether a bot wall / WAF page was served.
_BLOCK_MARKERS = (
    "access denied", "are you a human", "captcha", "blocked",
    "incapsula", "akamai", "unusual traffic", "request unsuccessful",
)

# Reads a handful of navigator-level signals from the current page.
_FINGERPRINT_JS = """() => ({
    webdriver: navigator.webdriver,
    plugins: navigator.plugins.length,
    languages: navigator.languages,
    chrome: typeof window.chrome,
    ua: navigator.userAgent,
})"""

# Returns the (whitespace-collapsed, 80-char-truncated) text of every table
# row flagged as failed/warned. The verdict class is looked for on the row
# itself AND on its direct cells: testing only the <tr> class reports a
# spurious "clean" result when the page marks the result cell instead.
_SANNYSOFT_BAD_ROWS_JS = """() => {
    const flagged = /fail|warn/i;
    const bad = [];
    for (const row of document.querySelectorAll('tr')) {
        const classNames = [row, ...row.cells].map((el) => el.className || '');
        if (classNames.some((cls) => flagged.test(cls))) {
            bad.push(row.innerText.replace(/\\s+/g, ' ').trim().slice(0, 80));
        }
    }
    return bad;
}"""


async def _print_fingerprint(page):
    """Print basic automation-fingerprint signals evaluated on about:blank."""
    try:
        await page.goto("about:blank")
        fp = await page.evaluate(_FINGERPRINT_JS)
        print("fingerprint:", fp)
    except Exception as e:  # best-effort: a failed step must not abort the probe
        print("fingerprint eval failed:", e)


async def _probe_targets(page):
    """Visit every entry of TARGETS and print status, block guess and title."""
    for name, url in TARGETS:
        try:
            resp = await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            status = resp.status if resp else "?"
            title = await page.title()
            # Only the head of the document is inspected for block markers.
            body = (await page.content())[:400].lower()
            blocked = any(marker in body for marker in _BLOCK_MARKERS)
            print(f" [{status}] {name:14} blocked={blocked} title={title[:60]!r}")
        except Exception as e:  # keep going so one dead target doesn't hide the rest
            print(f" [ERR] {name:14} {type(e).__name__}: {str(e)[:80]}")


async def _print_sannysoft_scorecard(page):
    """Load the sannysoft page and print the rows it flags as FAIL/WARN."""
    try:
        await page.goto(SANNYSOFT, wait_until="networkidle", timeout=30000)
        # Short pause after network idle before reading the table; presumably
        # gives the page's own scripts time to fill in their verdicts.
        await asyncio.sleep(2)
        fails = await page.evaluate(_SANNYSOFT_BAD_ROWS_JS)
        if fails:
            print(f" sannysoft FAIL/WARN rows ({len(fails)}):")
            for row_text in fails:
                print(f" - {row_text}")
        else:
            print(" sannysoft: no FAIL/WARN rows detected (clean)")
    except Exception as e:  # best-effort: report and finish the probe
        print(" sannysoft check failed:", e)


async def probe(headless: bool):
    """Run one probe pass and print the findings to stdout.

    The pass has three steps, each of which reports its own failure and lets
    the remaining steps run:
      1. navigator fingerprint signals on a blank page,
      2. reachability / block heuristics for every entry in TARGETS,
      3. the sannysoft bot-detection scorecard.

    Args:
        headless: forwarded to undetected_browser to select headless mode.

    The browser is launched with use_proxy="HEALTHCARE_PROXY_URL" when
    USE_HEALTHCARE_PROXY is true, and with use_proxy=False otherwise.
    """
    backend = "patchright" if is_using_patchright() else "vanilla-playwright"
    rule = "=" * 60
    print(f"\n{rule}\nbackend = {backend} | headless={headless}\n{rule}")
    async with undetected_browser(
        headless=headless,
        use_proxy="HEALTHCARE_PROXY_URL" if USE_HEALTHCARE_PROXY else False,
    ) as (_ctx, page):
        await _print_fingerprint(page)
        await _probe_targets(page)
        await _print_sannysoft_scorecard(page)
async def main():
    """Script entry coroutine: run a single probe pass in headless mode."""
    run_headless = True
    await probe(headless=run_headless)


if __name__ == "__main__":
    # Start the event loop only when executed as a script, not on import.
    asyncio.run(main())