From 744f0a89cf4e97019463af68f3233f9a3f758b63 Mon Sep 17 00:00:00 2001 From: justin Date: Sat, 20 Jun 2026 15:28:12 -0500 Subject: [PATCH] healthcare: bound NPPES-stale window [3,10]yr + restore verify_ok gate - Add NPPES_STALE_MAX_YEARS (default 10): a record untouched for many years is a stronger signal the practice closed/moved, and a bounce burns the warming IP. Observed institutional distribution clusters 3-7yrs with ~0 beyond 8, so 10 is a safe ceiling that mails the whole real pool while excluding any outlier ancient record. MIN stays 3 (keeps the 'out of date' claim credible). - Restore the SMTP-verification gate (verify_ok) that the shared institutional_verified selector had -- the swap to nppes_stale dropped it; we only mail inboxes we already proved live. - enrich: process the re-fetch queue STALEST-FIRST so a bounded (--limit) or --max-age refresh spends its budget on the most-overdue cache entries (and new NPIs) first, never starving them behind merely-aging ones. - Selector unit-tested (10 cases incl. window edges, verify gate, deactivated). --- scripts/build_healthcare_campaigns_cron.py | 29 +++++++++++++++------- scripts/enrich_nppes_last_updated.py | 10 ++++++-- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/scripts/build_healthcare_campaigns_cron.py b/scripts/build_healthcare_campaigns_cron.py index 544ee27..4421d2e 100644 --- a/scripts/build_healthcare_campaigns_cron.py +++ b/scripts/build_healthcare_campaigns_cron.py @@ -99,13 +99,18 @@ WARMUP_DUE_SOON_MIN = int(os.getenv("HC_DUE_SOON_MIN", "1")) WARMUP_DUE_SOON_MAX = int(os.getenv("HC_DUE_SOON_MAX", "90")) # NPPES "out of date" segment: only mail records whose REAL NPPES last_updated -# date is at least this many whole years ago. This is what makes the "your NPPES -# record may be out of date" claim LITERALLY TRUE and verifiable -- the provider +# date is within an [MIN, MAX] whole-years-stale window. 
MIN keeps the "out of +# date" claim credible (a record updated <3yrs ago isn't convincingly stale); +# MAX caps deliverability/defunct risk (a record untouched for many years is a +# stronger signal the practice closed/moved -- and a bounce burns the warming +# IP). This is what makes the claim LITERALLY TRUE and verifiable -- the provider # can confirm the exact same last_updated date on the public registry. The date # is joined in by enrich_nppes_last_updated.py (column nppes_years_stale). Until # that enrichment has run, the field is empty and this segment safely mails # nobody (we never assert "out of date" without the government date to back it). +# Observed institutional distribution: tightly clustered 3-7yrs, ~0 beyond 8yrs. NPPES_STALE_MIN_YEARS = int(os.getenv("HC_NPPES_STALE_MIN_YEARS", "3")) +NPPES_STALE_MAX_YEARS = int(os.getenv("HC_NPPES_STALE_MAX_YEARS", "10")) def _overdue_days(r: dict): @@ -348,17 +353,23 @@ def row_matches(seg_key: str, r: dict) -> bool: if sel == "optout_ending": return optout if sel == "nppes_stale": # NPPES "out of date" segment. Only mail records whose REAL NPPES - # last_updated date (joined by enrich_nppes_last_updated.py) is at least - # NPPES_STALE_MIN_YEARS whole years old, so the "may be out of date" - # claim is literally true and the provider can verify the same date on - # the public registry. Deactivated NPIs belong to npi_reactivation, not - # here, so they're excluded. Empty field (enrichment not yet run) -> no - # match, so we never assert staleness without the government date. + # last_updated date (joined by enrich_nppes_last_updated.py) falls in the + # [MIN, MAX] years-stale window, so the "may be out of date" claim is + # literally true AND deliverable (very-stale records likely belong to + # closed/moved practices that bounce). The provider can verify the same + # date on the public registry. Deactivated NPIs belong to + # npi_reactivation, not here, so they're excluded. 
We also keep the + # institutional list's SMTP-verification gate (verify_ok): rows that failed + # verification are never mailed (a blank verify_ok still passes). Empty stale + # field (enrichment not yet run) -> no match, so we never assert staleness without it. if (r.get("nppes_deactivated") or "").strip().upper() == "Y": return False + if (str(r.get("verify_ok", "")).strip().upper() + not in ("Y", "YES", "TRUE", "1", "")): + return False ys = (r.get("nppes_years_stale") or "").strip() try: - return int(ys) >= NPPES_STALE_MIN_YEARS + return NPPES_STALE_MIN_YEARS <= int(ys) <= NPPES_STALE_MAX_YEARS except ValueError: return False if sel == "any": diff --git a/scripts/enrich_nppes_last_updated.py b/scripts/enrich_nppes_last_updated.py index 7129829..ea3233a 100644 --- a/scripts/enrich_nppes_last_updated.py +++ b/scripts/enrich_nppes_last_updated.py @@ -175,11 +175,17 @@ def main() -> int: cache = load_cache(args.cache) log(f"cache={args.cache} entries={len(cache):,}") - # Determine which NPIs need a (re)fetch. + # Determine which NPIs need a (re)fetch, STALEST FIRST so a bounded run + # (--limit) always spends its budget on the most-overdue cache entries. + # Never-fetched entries have an empty fetched_at, which sorts first, so new + # NPIs are prioritized over merely-aging ones. todo = [n for n in npis if not is_fresh(cache.get(n, {}), today, args.max_age)] + todo.sort(key=lambda n: cache.get(n, {}).get("fetched_at", "") or "") + n_due = len(todo) if args.limit: todo = todo[:args.limit] - log(f"to_fetch={len(todo):,} (of {len(npis):,} unique NPIs; limit={args.limit or 'all'})") + log(f"to_fetch={len(todo):,} (of {n_due:,} due / {len(npis):,} unique NPIs; " + f"limit={args.limit or 'all'})") fetched = 0 t0 = time.time()