otc: domain->email scraper + filing-agent domain filtering

scrape_otc_emails.py: fetch each issuer domain's IR/contact pages (gzip, HTML-only, early-abort) and extract a contact email, preferring ir@/investor@/info@ addresses. Skip filing-agent domains (DFN/Donnelley/Broadridge/etc.) that leak into the extracted domain -- those are not the issuer's site. The same filter is added to the harvester's DOMAIN_NOISE for future runs. Phone (100%) remains the fallback channel when no email is found.
2026-06-14 06:56:45 -05:00 · 2026-06-14 06:56:45 -05:00 · 4d3af2aeae
commit 4d3af2aeae
parent fdea97e57e
2 changed files with 149 additions and 1 deletion
--- a/scripts/harvest_otc_issuers.py
+++ b/scripts/harvest_otc_issuers.py
@@ -37,7 +37,9 @@ URL_RE = re.compile(r'https?://(?:www\.)?([a-z0-9\-]+\.(?:com|net|io|co|us))', r
 DOMAIN_NOISE = ("sec.gov", "xbrl", "w3.org", "schema", "adobe", "fasb",
                "rdgfilings", "edgar", "sECDatabase".lower(), "donnelley",
                "broadridge", "toppanmerrill", "dfinsolutions", "workiva",
-                "cloudfront", "googleapis", "gstatic", "jquery", "bootstrap")
+                "cloudfront", "googleapis", "gstatic", "jquery", "bootstrap",
+                "issuerdirect", "globenewswire", "businesswire", "prnewswire",
+                "vintage", "secdatabase", "sec1934act", "dfn.com")


 def get(url: str, timeout: int = 15) -> str: