CMS healthcare portals (NPPES, PECOS, I&A) block datacenter IPs, so the healthcare browser automation needs to egress via the residential proxy on hg409y7ez04.sn.mynetname.net (username 'performancewest').

- undetected_browser: use_proxy now accepts an env-var name, so callers can select a domain-specific proxy. _proxy_config(proxy_env) reads it and falls back to UNDETECTED_PROXY_URL. Healthcare uses 'HEALTHCARE_PROXY_URL'.
- probe_npi_undetected: launches with use_proxy='HEALTHCARE_PROXY_URL' when set.
- npi_provider: documents that the (future) automated NPPES/PECOS flows must use the healthcare proxy.
- Plumb HEALTHCARE_PROXY_URL (+ UNDETECTED_PROXY_URL fallback) through the ansible env template and docker-compose workers env.

The credential itself is NOT in the repo. Set the full URL in the ansible vault as vault_healthcare_proxy_url:

    socks5://performancewest:<password>@hg409y7ez04.sn.mynetname.net:<port>

Verified parsing + Playwright proxy-dict wiring with a unit test.
100 lines
4 KiB
Python
100 lines
4 KiB
Python
"""Probe whether our undetected (patchright) browser can reach NPPES / PECOS
and how detectable it looks. Honest, no assertions from memory — it visits
real endpoints and a fingerprint-detection page and prints what it sees.

Run: python3 scripts/probe_npi_undetected.py
"""
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, "scripts")
|
|
from workers.services.telecom.undetected_browser import ( # noqa: E402
|
|
undetected_browser, is_using_patchright,
|
|
)
|
|
|
|
# Healthcare (NPPES/PECOS/I&A) traffic is meant to leave through the
# residential SOCKS proxy (username "performancewest").  The flag is True
# whenever HEALTHCARE_PROXY_URL is set to a non-blank value; probe() then hands
# the env-var *name* (not its value) to undetected_browser via use_proxy.
# NOTE(review): whether a placeholder such as HEALTHCARE_PROXY_URL=1 works
# depends on how the browser helper treats a non-URL value — confirm before
# relying on it; a real proxy URL is the safe setting.
USE_HEALTHCARE_PROXY = bool(os.environ.get("HEALTHCARE_PROXY_URL", "").strip())

# (label, url) pairs visited in order by the reachability stage of probe().
TARGETS = [
    # NPPES public registry UI (where NPI lookups/updates happen)
    ("NPPES registry", "https://npiregistry.cms.hhs.gov/"),
    # NPPES public API (already used by our free tool — sanity check)
    ("NPPES API", "https://npiregistry.cms.hhs.gov/api/?version=2.1&number=1234567893"),
    # PECOS / I&A login surface (Identity & Access)
    ("PECOS portal", "https://pecos.cms.hhs.gov/pecos/login.do"),
    ("I&A portal", "https://nppes.cms.hhs.gov/IAWeb/login.do"),
]

# Public bot-detection fingerprint check.
SANNYSOFT = "https://bot.sannysoft.com/"
|
|
|
|
|
|
async def probe(headless: bool) -> None:
    """Open one undetected-browser session and print a three-part report.

    Stages, in order:

    1. A few ``navigator`` / ``window`` fingerprint signals read on
       ``about:blank``.
    2. For every ``(name, url)`` in ``TARGETS``: HTTP status, page title and a
       keyword-based "blocked" heuristic.
    3. The FAIL/WARN rows shown by the ``SANNYSOFT`` detection page.

    Each stage (and each target in stage 2) catches its own exceptions and
    prints them, so a failure in one does not prevent the next from running.
    Nothing is returned; all results go to stdout.

    Args:
        headless: Forwarded unchanged to ``undetected_browser``.
    """
    print(f"\n{'='*60}\nbackend = {'patchright' if is_using_patchright() else 'vanilla-playwright'} | headless={headless}\n{'='*60}")
    async with undetected_browser(
        headless=headless,
        # Pass the env-var *name* when the healthcare proxy is configured,
        # otherwise disable the proxy explicitly.
        use_proxy="HEALTHCARE_PROXY_URL" if USE_HEALTHCARE_PROXY else False,
    ) as (ctx, page):
        # 1. navigator.webdriver + a couple of fingerprint signals
        try:
            await page.goto("about:blank")
            fp = await page.evaluate("""() => ({
                webdriver: navigator.webdriver,
                plugins: navigator.plugins.length,
                languages: navigator.languages,
                chrome: typeof window.chrome,
                ua: navigator.userAgent,
            })""")
            print("fingerprint:", fp)
        except Exception as e:
            print("fingerprint eval failed:", e)

        # 2. real target reachability
        for name, url in TARGETS:
            try:
                resp = await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                # goto() may yield no response object; report "?" in that case.
                status = resp.status if resp else "?"
                title = await page.title()
                # Heuristic only: just the first 400 characters of the markup
                # are searched for well-known block-page phrases.
                body = (await page.content())[:400].lower()
                blocked = any(w in body for w in [
                    "access denied", "are you a human", "captcha", "blocked",
                    "incapsula", "akamai", "unusual traffic", "request unsuccessful",
                ])
                print(f" [{status}] {name:14} blocked={blocked} title={title[:60]!r}")
            except Exception as e:
                print(f" [ERR] {name:14} {type(e).__name__}: {str(e)[:80]}")

        # 3. sannysoft fingerprint scorecard (count red FAILs)
        try:
            await page.goto(SANNYSOFT, wait_until="networkidle", timeout=30000)
            # Short grace period after network idle before reading the table.
            await asyncio.sleep(2)
            # NOTE(review): the result classes appear to be set on the table
            # cells rather than on the <tr> itself — confirm against the live
            # page.  Inspecting the row plus its direct cells covers both
            # layouts; a row is still reported at most once.
            fails = await page.evaluate("""() => {
                const rows = [...document.querySelectorAll('tr')];
                const bad = [];
                for (const r of rows) {
                    const cls = [r, ...r.cells]
                        .map((el) => el.className || '')
                        .join(' ');
                    const txt = r.innerText.replace(/\\s+/g,' ').trim();
                    if (/fail|warn/i.test(cls)) bad.push(txt.slice(0,80));
                }
                return bad;
            }""")
            if fails:
                print(f" sannysoft FAIL/WARN rows ({len(fails)}):")
                for f in fails:
                    print(f" - {f}")
            else:
                print(" sannysoft: no FAIL/WARN rows detected (clean)")
        except Exception as e:
            print(" sannysoft check failed:", e)
|
|
|
|
|
|
async def main() -> None:
    """Run the probe once with a headless browser."""
    await probe(headless=True)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async main() to completion.
    asyncio.run(main())
|