new-site/scripts/test_healthcare_email_streams.py

61 lines
2.4 KiB
Python

"""Unit tests for the healthcare email-stream classifier.
Run: python3 -m scripts.test_healthcare_email_streams (or pytest)
Guards the subtle case that motivated the two-tier DIRECT detection: a naive
substring match on "direct" wrongly parked real "Direct Primary Care" and
counseling practices (whose registrable domain merely contains the word) in
the undeliverable HISP pile. Direct/HISP gateways instead use "direct" or
"hisp" as a whole DNS label.
"""
from scripts.healthcare_email_streams import classify
# Table of (email address, expected classification) pairs consumed by
# test_classify(). Each address is handed to the classifier as-is, so the
# mixed-case entries are intentional.
CASES = [
    # Real Direct-Primary-Care / counseling practices -> institutional.
    # The domain merely contains the substring "direct"; these must not be
    # treated as Direct/HISP gateways.
    ("chelsea@arthurdirectcare.com", "institutional"),
    ("bassi@valleydirectprimarycare.com", "institutional"),
    ("megan@newdirectionscounselingservices.com", "institutional"),
    ("john@islanddirectprimarycare.com", "institutional"),
    ("kamlesh@mydirectcare.com", "institutional"),
    ("allison@truedirectioncounseling.com", "institutional"),
    ("marty@holtondirectcare.com", "institutional"),
    ("sbass@newdirectionsnonemergency.org", "institutional"),
    ("info@consumerdirectcare.com", "institutional"),
    ("x@rehabdirectives.com", "institutional"),
    # Genuine Direct/HISP gateways -> direct (parked).
    ("x@direct.novanthealth.org", "direct"),
    ("x@CarolinasHealthcareSystem.direct-ci.com", "direct"),
    ("x@cfp.directbygreenway.com", "direct"),
    ("x@foo.4693.direct.athenahealth.com", "direct"),
    ("x@directHISP.wakemed.org", "direct"),
    ("x@boss.directak.net", "direct"),
    ("x@hisp.bryanhealth.org", "direct"),
    ("x@ehrdirect.mayoclinicmsg.org", "direct"),
    ("x@directaddress.net", "direct"),
    ("x@negaidx.allscriptsdirect.net", "direct"),
    # Written as a plain literal so the address under test is visible at a
    # glance (exactly one "@").
    ("x@lickingmemorial.medicity.net", "direct"),
    ("x@foo.nextgenshare.com", "direct"),
    # Consumer / institutional / excluded / invalid.
    ("drsmith@gmail.com", "consumer"),
    ("info@smallclinic.com", "institutional"),
    ("x@somehospital.va.gov", "excluded"),
    ("x@base.health.mil", "excluded"),
    ("not-an-email", "invalid"),
    ("", "invalid"),
]
def test_classify():
    """Check every (email, expected) pair in CASES against classify().

    All cases are evaluated before asserting, so a single failing run
    reports every misclassified address rather than only the first.
    """
    mismatches = []
    for address, wanted in CASES:
        actual = classify(address)
        if actual == wanted:
            continue
        # Pre-format the report line; the joined text goes in the assert message.
        mismatches.append(f"{address} -> {actual} (want {wanted})")
    assert not mismatches, "Misclassified: " + "; ".join(mismatches)
if __name__ == "__main__":
    # Script entry point (python3 -m ...): run the table-driven test directly;
    # an AssertionError from test_classify() aborts before the OK line prints.
    test_classify()
    print(f"OK: all {len(CASES)} classifier cases pass")