healthcare: bound NPPES-stale window [3,10]yr + restore verify_ok gate

- Add NPPES_STALE_MAX_YEARS (default 10): a record untouched for many years is
  a stronger signal the practice closed/moved, and a bounce burns the warming
  IP. Observed institutional distribution clusters 3-7yrs with ~0 beyond 8, so
  10 is a safe ceiling that mails the whole real pool while excluding any
  outlier ancient record. MIN stays 3 (keeps the 'out of date' claim credible).
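- Restore the SMTP-verification gate (verify_ok) that the shared
  institutional_verified selector had -- the swap to nppes_stale dropped it.
  Rows whose verify_ok is an explicit non-affirmative value are excluded again;
  note that a blank verify_ok still passes the gate, so "we only mail inboxes
  we already proved live" holds only for rows where a verification result has
  actually been recorded.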
- enrich: process the re-fetch queue STALEST-FIRST so a bounded (--limit) or
  --max-age refresh spends its budget on the most-overdue cache entries (and new
  NPIs) first, never starving them behind merely-aging ones.
- Selector unit-tested (10 cases incl. window edges, verify gate, deactivated).
This commit is contained in:
justin 2026-06-20 15:28:12 -05:00
parent 9e155d214c
commit 744f0a89cf
2 changed files with 28 additions and 11 deletions

View file

@ -99,13 +99,18 @@ WARMUP_DUE_SOON_MIN = int(os.getenv("HC_DUE_SOON_MIN", "1"))
WARMUP_DUE_SOON_MAX = int(os.getenv("HC_DUE_SOON_MAX", "90"))
# NPPES "out of date" segment: only mail records whose REAL NPPES last_updated
# date is at least this many whole years ago. This is what makes the "your NPPES
# record may be out of date" claim LITERALLY TRUE and verifiable -- the provider
# date is within an [MIN, MAX] whole-years-stale window. MIN keeps the "out of
# date" claim credible (a record updated <3yrs ago isn't convincingly stale);
# MAX caps deliverability/defunct risk (a record untouched for many years is a
# stronger signal the practice closed/moved -- and a bounce burns the warming
# IP). This is what makes the claim LITERALLY TRUE and verifiable -- the provider
# can confirm the exact same last_updated date on the public registry. The date
# is joined in by enrich_nppes_last_updated.py (column nppes_years_stale). Until
# that enrichment has run, the field is empty and this segment safely mails
# nobody (we never assert "out of date" without the government date to back it).
# Observed institutional distribution: tightly clustered 3-7yrs, ~0 beyond 8yrs.
NPPES_STALE_MIN_YEARS = int(os.getenv("HC_NPPES_STALE_MIN_YEARS", "3"))
NPPES_STALE_MAX_YEARS = int(os.getenv("HC_NPPES_STALE_MAX_YEARS", "10"))
def _overdue_days(r: dict):
@ -348,17 +353,23 @@ def row_matches(seg_key: str, r: dict) -> bool:
if sel == "optout_ending": return optout
if sel == "nppes_stale":
# NPPES "out of date" segment. Only mail records whose REAL NPPES
# last_updated date (joined by enrich_nppes_last_updated.py) is at least
# NPPES_STALE_MIN_YEARS whole years old, so the "may be out of date"
# claim is literally true and the provider can verify the same date on
# the public registry. Deactivated NPIs belong to npi_reactivation, not
# here, so they're excluded. Empty field (enrichment not yet run) -> no
# match, so we never assert staleness without the government date.
# last_updated date (joined by enrich_nppes_last_updated.py) falls in the
# [MIN, MAX] years-stale window, so the "may be out of date" claim is
# literally true AND deliverable (very-stale records likely belong to
# closed/moved practices that bounce). The provider can verify the same
# date on the public registry. Deactivated NPIs belong to
# npi_reactivation, not here, so they're excluded. We also keep the
# institutional list's SMTP-verification gate (verify_ok) so we only mail
# inboxes we already proved are live. Empty stale field (enrichment not
# yet run) -> no match, so we never assert staleness without the date.
if (r.get("nppes_deactivated") or "").strip().upper() == "Y":
return False
if (str(r.get("verify_ok", "")).strip().upper()
not in ("Y", "YES", "TRUE", "1", "")):
return False
ys = (r.get("nppes_years_stale") or "").strip()
try:
return int(ys) >= NPPES_STALE_MIN_YEARS
return NPPES_STALE_MIN_YEARS <= int(ys) <= NPPES_STALE_MAX_YEARS
except ValueError:
return False
if sel == "any":

View file

@ -175,11 +175,17 @@ def main() -> int:
cache = load_cache(args.cache)
log(f"cache={args.cache} entries={len(cache):,}")
# Determine which NPIs need a (re)fetch.
# Determine which NPIs need a (re)fetch, STALEST FIRST so a bounded run
# (--limit) always spends its budget on the most-overdue cache entries.
# Never-fetched entries have an empty fetched_at, which sorts first, so new
# NPIs are prioritized over merely-aging ones.
todo = [n for n in npis if not is_fresh(cache.get(n, {}), today, args.max_age)]
todo.sort(key=lambda n: cache.get(n, {}).get("fetched_at", "") or "")
n_due = len(todo)
if args.limit:
todo = todo[:args.limit]
log(f"to_fetch={len(todo):,} (of {len(npis):,} unique NPIs; limit={args.limit or 'all'})")
log(f"to_fetch={len(todo):,} (of {n_due:,} due / {len(npis):,} unique NPIs; "
f"limit={args.limit or 'all'})")
fetched = 0
t0 = time.time()