otc: domain->email scraper + filing-agent domain filtering

scrape_otc_emails.py: fetch each issuer domain's IR/contact pages (gzip,
HTML-only, early abort) and extract a contact email, preferring ir@,
investor@, and info@ addresses. Skip filing-agent domains (DFN, Donnelley,
Broadridge, etc.) that leak into the extracted domain; those are not the
issuer's own site. The same filter is added to the harvester's DOMAIN_NOISE
for future runs. Phone (100%) remains the fallback channel when no email is
found.
This commit is contained in:
justin 2026-06-14 06:56:45 -05:00
parent fdea97e57e
commit 4d3af2aeae
2 changed files with 149 additions and 1 deletions

View file

@@ -37,7 +37,9 @@ URL_RE = re.compile(r'https?://(?:www\.)?([a-z0-9\-]+\.(?:com|net|io|co|us))', r
DOMAIN_NOISE = ("sec.gov", "xbrl", "w3.org", "schema", "adobe", "fasb",
"rdgfilings", "edgar", "sECDatabase".lower(), "donnelley",
"broadridge", "toppanmerrill", "dfinsolutions", "workiva",
"cloudfront", "googleapis", "gstatic", "jquery", "bootstrap")
"cloudfront", "googleapis", "gstatic", "jquery", "bootstrap",
"issuerdirect", "globenewswire", "businesswire", "prnewswire",
"vintage", "secdatabase", "sec1934act", "dfn.com")
def get(url: str, timeout: int = 15) -> str: