hc refresh: propagate fresh status into the channel CSVs the cron reads

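The channel CSVs (hc_warmup_nongoogle.csv, hc_warmup_google.csv, hc_warmup_week1_verified.csv) are email-keyed subsets of the master with extra deliverability columns (verify_ok, verify_reason). The refresh now writes the fresh status fields (reval_due_date, days_overdue, reval_status, leie_excluded, optout_ending, name, specialty, state) back into each channel CSV that exists, matching rows to the master by case-insensitive, whitespace-trimmed email. Only columns already present in a channel CSV are overwritten; its extra columns and row membership are preserved, and rows with no match in the master are left unchanged. As a result, a single weekly run updates everything the campaign cron consumes -- not just the master.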
2026-06-08 03:13:00 -05:00 · 2026-06-08 03:13:00 -05:00 · 85dc3d5c3b
commit 85dc3d5c3b
parent 4f455475c0
1 changed files with 29 additions and 3 deletions
--- a/scripts/hc_data_refresh.py
+++ b/scripts/hc_data_refresh.py
@@ -268,9 +268,35 @@ def main() -> int:
    write_atomic(args.master, refreshed, HEADER)
    log(f"wrote {args.master} ({len(refreshed)} rows)")
-    # Re-derive the channel CSVs the campaign cron reads (Google vs non-Google
+    # Propagate the fresh status fields into the channel CSVs the campaign cron
-    # split is a deliverability concern, not a segment one; keep the existing
+    # actually reads. These are email-keyed subsets of the master with extra
-    # split if those files exist so we don't lose warmup-cohort separation).
+    # deliverability columns (verify_ok/verify_reason) we must preserve; we only
    # overwrite the status fields the refresh owns.
    REFRESHED_FIELDS = ["reval_due_date", "days_overdue", "reval_status",
                        "leie_excluded", "optout_ending", "name", "specialty", "state"]
    by_email = {r["email"].strip().lower(): r for r in refreshed if r.get("email")}
    channel_csvs = [os.path.join(args.out_dir, f) for f in
                    ("hc_warmup_nongoogle.csv", "hc_warmup_google.csv",
                     "hc_warmup_week1_verified.csv")]
    for path in channel_csvs:
        if not os.path.exists(path):
            continue
        with open(path, newline="") as f:
            rdr = csv.DictReader(f)
            cols = rdr.fieldnames or []
            rows_ch = list(rdr)
        updated = 0
        for r in rows_ch:
            m = by_email.get(r.get("email", "").strip().lower())
            if not m:
                continue
            for fld in REFRESHED_FIELDS:
                if fld in cols and fld in m:
                    r[fld] = m[fld]
            updated += 1
        write_atomic(path, rows_ch, cols)
        log(f"propagated to {os.path.basename(path)}: {updated}/{len(rows_ch)} rows updated")
    return 0