otc: domain->email scraper + filing-agent domain filtering
scrape_otc_emails.py: fetch each issuer domain's IR/contact pages (gzip, HTML-only, early-abort, preferring ir@/investor@/info@ addresses) and extract a contact email. Skip filing-agent domains (DFN/Donnelley/Broadridge/etc.) that leak into the extracted domain -- those are not the issuer's site. The same filter is added to the harvester's DOMAIN_NOISE for future runs. Phone (100%) is the fallback channel for email misses.
This commit is contained in:
parent
fdea97e57e
commit
4d3af2aeae
2 changed files with 149 additions and 1 deletion
|
|
@ -37,7 +37,9 @@ URL_RE = re.compile(r'https?://(?:www\.)?([a-z0-9\-]+\.(?:com|net|io|co|us))', r
|
|||
DOMAIN_NOISE = ("sec.gov", "xbrl", "w3.org", "schema", "adobe", "fasb",
|
||||
"rdgfilings", "edgar", "sECDatabase".lower(), "donnelley",
|
||||
"broadridge", "toppanmerrill", "dfinsolutions", "workiva",
|
||||
"cloudfront", "googleapis", "gstatic", "jquery", "bootstrap")
|
||||
"cloudfront", "googleapis", "gstatic", "jquery", "bootstrap",
|
||||
"issuerdirect", "globenewswire", "businesswire", "prnewswire",
|
||||
"vintage", "secdatabase", "sec1934act", "dfn.com")
|
||||
|
||||
|
||||
def get(url: str, timeout: int = 15) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue