healthcare: bound NPPES-stale window [3,10]yr + restore verify_ok gate
- Add NPPES_STALE_MAX_YEARS (default 10): a record untouched for many years is a stronger signal that the practice closed or moved, and a bounce burns the warming IP. The observed institutional distribution clusters at 3-7 years with ~0 beyond 8, so 10 is a safe ceiling that mails the whole real pool while excluding any outlier ancient record. MIN stays at 3 (keeps the 'out of date' claim credible).
- Restore the SMTP-verification gate (verify_ok) that the shared institutional_verified selector had -- the swap to nppes_stale dropped it; we only mail inboxes we already proved live. (Review note: as written, the gate also passes rows whose verify_ok value is blank; only explicit non-truthy values are rejected.)
- enrich: process the re-fetch queue STALEST-FIRST so a bounded (--limit) or --max-age refresh spends its budget on the most-overdue cache entries (and new NPIs) first, never starving them behind merely-aging ones.
- Selector unit-tested (10 cases incl. window edges, verify gate, deactivated).
This commit is contained in:
parent
9e155d214c
commit
744f0a89cf
2 changed files with 28 additions and 11 deletions
|
|
@ -99,13 +99,18 @@ WARMUP_DUE_SOON_MIN = int(os.getenv("HC_DUE_SOON_MIN", "1"))
|
||||||
WARMUP_DUE_SOON_MAX = int(os.getenv("HC_DUE_SOON_MAX", "90"))
|
WARMUP_DUE_SOON_MAX = int(os.getenv("HC_DUE_SOON_MAX", "90"))
|
||||||
|
|
||||||
# NPPES "out of date" segment: only mail records whose REAL NPPES last_updated
|
# NPPES "out of date" segment: only mail records whose REAL NPPES last_updated
|
||||||
# date is at least this many whole years ago. This is what makes the "your NPPES
|
# date is within an [MIN, MAX] whole-years-stale window. MIN keeps the "out of
|
||||||
# record may be out of date" claim LITERALLY TRUE and verifiable -- the provider
|
# date" claim credible (a record updated <3yrs ago isn't convincingly stale);
|
||||||
|
# MAX caps deliverability/defunct risk (a record untouched for many years is a
|
||||||
|
# stronger signal the practice closed/moved -- and a bounce burns the warming
|
||||||
|
# IP). This is what makes the claim LITERALLY TRUE and verifiable -- the provider
|
||||||
# can confirm the exact same last_updated date on the public registry. The date
|
# can confirm the exact same last_updated date on the public registry. The date
|
||||||
# is joined in by enrich_nppes_last_updated.py (column nppes_years_stale). Until
|
# is joined in by enrich_nppes_last_updated.py (column nppes_years_stale). Until
|
||||||
# that enrichment has run, the field is empty and this segment safely mails
|
# that enrichment has run, the field is empty and this segment safely mails
|
||||||
# nobody (we never assert "out of date" without the government date to back it).
|
# nobody (we never assert "out of date" without the government date to back it).
|
||||||
|
# Observed institutional distribution: tightly clustered 3-7yrs, ~0 beyond 8yrs.
|
||||||
NPPES_STALE_MIN_YEARS = int(os.getenv("HC_NPPES_STALE_MIN_YEARS", "3"))
|
NPPES_STALE_MIN_YEARS = int(os.getenv("HC_NPPES_STALE_MIN_YEARS", "3"))
|
||||||
|
NPPES_STALE_MAX_YEARS = int(os.getenv("HC_NPPES_STALE_MAX_YEARS", "10"))
|
||||||
|
|
||||||
|
|
||||||
def _overdue_days(r: dict):
|
def _overdue_days(r: dict):
|
||||||
|
|
@ -348,17 +353,23 @@ def row_matches(seg_key: str, r: dict) -> bool:
|
||||||
if sel == "optout_ending": return optout
|
if sel == "optout_ending": return optout
|
||||||
if sel == "nppes_stale":
|
if sel == "nppes_stale":
|
||||||
# NPPES "out of date" segment. Only mail records whose REAL NPPES
|
# NPPES "out of date" segment. Only mail records whose REAL NPPES
|
||||||
# last_updated date (joined by enrich_nppes_last_updated.py) is at least
|
# last_updated date (joined by enrich_nppes_last_updated.py) falls in the
|
||||||
# NPPES_STALE_MIN_YEARS whole years old, so the "may be out of date"
|
# [MIN, MAX] years-stale window, so the "may be out of date" claim is
|
||||||
# claim is literally true and the provider can verify the same date on
|
# literally true AND deliverable (very-stale records likely belong to
|
||||||
# the public registry. Deactivated NPIs belong to npi_reactivation, not
|
# closed/moved practices that bounce). The provider can verify the same
|
||||||
# here, so they're excluded. Empty field (enrichment not yet run) -> no
|
# date on the public registry. Deactivated NPIs belong to
|
||||||
# match, so we never assert staleness without the government date.
|
# npi_reactivation, not here, so they're excluded. We also keep the
|
||||||
|
# institutional list's SMTP-verification gate (verify_ok) so we only mail
|
||||||
|
# inboxes we already proved are live. Empty stale field (enrichment not
|
||||||
|
# yet run) -> no match, so we never assert staleness without the date.
|
||||||
if (r.get("nppes_deactivated") or "").strip().upper() == "Y":
|
if (r.get("nppes_deactivated") or "").strip().upper() == "Y":
|
||||||
return False
|
return False
|
||||||
|
if (str(r.get("verify_ok", "")).strip().upper()
|
||||||
|
not in ("Y", "YES", "TRUE", "1", "")):
|
||||||
|
return False
|
||||||
ys = (r.get("nppes_years_stale") or "").strip()
|
ys = (r.get("nppes_years_stale") or "").strip()
|
||||||
try:
|
try:
|
||||||
return int(ys) >= NPPES_STALE_MIN_YEARS
|
return NPPES_STALE_MIN_YEARS <= int(ys) <= NPPES_STALE_MAX_YEARS
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return False
|
return False
|
||||||
if sel == "any":
|
if sel == "any":
|
||||||
|
|
|
||||||
|
|
@ -175,11 +175,17 @@ def main() -> int:
|
||||||
cache = load_cache(args.cache)
|
cache = load_cache(args.cache)
|
||||||
log(f"cache={args.cache} entries={len(cache):,}")
|
log(f"cache={args.cache} entries={len(cache):,}")
|
||||||
|
|
||||||
# Determine which NPIs need a (re)fetch.
|
# Determine which NPIs need a (re)fetch, STALEST FIRST so a bounded run
|
||||||
|
# (--limit) always spends its budget on the most-overdue cache entries.
|
||||||
|
# Never-fetched entries have an empty fetched_at, which sorts first, so new
|
||||||
|
# NPIs are prioritized over merely-aging ones.
|
||||||
todo = [n for n in npis if not is_fresh(cache.get(n, {}), today, args.max_age)]
|
todo = [n for n in npis if not is_fresh(cache.get(n, {}), today, args.max_age)]
|
||||||
|
todo.sort(key=lambda n: cache.get(n, {}).get("fetched_at", "") or "")
|
||||||
|
n_due = len(todo)
|
||||||
if args.limit:
|
if args.limit:
|
||||||
todo = todo[:args.limit]
|
todo = todo[:args.limit]
|
||||||
log(f"to_fetch={len(todo):,} (of {len(npis):,} unique NPIs; limit={args.limit or 'all'})")
|
log(f"to_fetch={len(todo):,} (of {n_due:,} due / {len(npis):,} unique NPIs; "
|
||||||
|
f"limit={args.limit or 'all'})")
|
||||||
|
|
||||||
fetched = 0
|
fetched = 0
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue