diff --git a/scripts/harvest_otc_issuers.py b/scripts/harvest_otc_issuers.py index e7904dd..5fe7b12 100644 --- a/scripts/harvest_otc_issuers.py +++ b/scripts/harvest_otc_issuers.py @@ -37,7 +37,9 @@ URL_RE = re.compile(r'https?://(?:www\.)?([a-z0-9\-]+\.(?:com|net|io|co|us))', r DOMAIN_NOISE = ("sec.gov", "xbrl", "w3.org", "schema", "adobe", "fasb", "rdgfilings", "edgar", "sECDatabase".lower(), "donnelley", "broadridge", "toppanmerrill", "dfinsolutions", "workiva", - "cloudfront", "googleapis", "gstatic", "jquery", "bootstrap") + "cloudfront", "googleapis", "gstatic", "jquery", "bootstrap", + "issuerdirect", "globenewswire", "businesswire", "prnewswire", + "vintage", "secdatabase", "sec1934act", "dfn.com") def get(url: str, timeout: int = 15) -> str: diff --git a/scripts/scrape_otc_emails.py b/scripts/scrape_otc_emails.py new file mode 100644 index 0000000..5d98758 --- /dev/null +++ b/scripts/scrape_otc_emails.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""Scrape a contact/IR email for each OTC issuer domain (harvest step 2). + +Takes harvest_otc_issuers.py output (has a `domain`), fetches the issuer's +home + investor/contact pages, and extracts the best contact email. Public +companies almost always have an IR or general inbox. Phone is the fallback +channel for misses (the harvest already has phone at 100%). + +Bandwidth-friendly: gzip, HTML document only (no assets), early-abort on first +good email, capped page size. Optional --proxy for residential egress (usually +unnecessary -- fetching a known corporate site is not search-engine scraping). 
+ +Usage: + python3 scripts/scrape_otc_emails.py OTC_ISSUERS.csv OUT.csv [--proxy URL] +""" +from __future__ import annotations +import argparse +import csv +import gzip +import io +import re +import sys +import time +import urllib.request + +UA = "Mozilla/5.0 (compatible; PW-research/1.0; +https://performancewest.net)" +EMAIL_RE = re.compile(r'[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}') +# Prefer these local-parts for a corporate contact. +PREFER = ("ir", "investor", "investorrelations", "info", "contact", "admin", + "corporate", "legal", "compliance", "general") +BAD_EMAIL_BITS = ("example.com", "sentry", "wixpress", "@2x", "godaddy", + "cloudflare", "domain.com", "email.com", "yourdomain", + "sentry.io", ".png", ".jpg", ".gif", ".svg", ".webp") +PATHS = ("/investor-relations", "/investors", "/ir", "/contact", "/contact-us", + "/about", "/") +MAX_BYTES = 600_000 + +# Filing-agent / EDGAR-vendor domains that sometimes leak into the extracted +# `domain` (e.g. netlistDFN.com = Donnelley). These are NOT the issuer's site, +# so skip scraping them -- treat the row as "no usable domain" (phone fallback). 
# Matched as plain substrings of the whole domain by is_filing_agent_domain().
FILING_AGENT_HINTS = ("dfn", "rdgfilings", "edgar", "donnelley", "broadridge",
                      "toppanmerrill", "dfinsolutions", "workiva", "vintage",
                      "issuerdirect", "globenewswire", "businesswire", "prnewswire",
                      "secdatabase", "sec1934act")


def is_filing_agent_domain(domain: str) -> bool:
    """Return True when *domain* looks like a filing-agent host, not an issuer's.

    A domain matches when any entry of FILING_AGENT_HINTS occurs anywhere in
    it, or when its first dot-separated label ends in "dfn".
    """
    first_label = domain.split(".")[0]
    # NOTE: the label rule is currently subsumed by the "dfn" hint (a label
    # ending in "dfn" always contains it); it is kept so the suffix rule
    # survives if that hint is ever removed.
    return first_label.endswith("dfn") or any(
        hint in domain for hint in FILING_AGENT_HINTS)


def gunzip_partial(raw: bytes) -> bytes:
    """Decompress gzip bytes, tolerating a stream that was cut off early.

    fetch() caps how many bytes it reads, so the compressed body may end
    before the gzip trailer. gzip.GzipFile raises EOFError in that case and
    the whole page would be lost; a zlib decompress object instead returns
    whatever prefix it could decode.

    Returns the decompressed bytes, or *raw* unchanged when it is not gzip.
    """
    import zlib  # local import: the file's import block does not include zlib

    pieces = []
    remaining = raw
    while remaining:
        # wbits = 16 + MAX_WBITS selects the gzip container format.
        inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
        try:
            pieces.append(inflater.decompress(remaining))
        except zlib.error:
            break  # not gzip (or trailing garbage): keep what we have so far
        # Non-empty only when another gzip member follows the finished one.
        remaining = inflater.unused_data
    return b"".join(pieces) if pieces else raw


def is_same_domain(host: str, domain: str) -> bool:
    """Return True when *host* is *domain* itself or one of its subdomains.

    The match is anchored on a label boundary, so "notacme.com" is not
    treated as belonging to "acme.com". An empty *domain* never matches.
    """
    host = host.lower()
    domain = domain.lower()
    return bool(domain) and (host == domain or host.endswith("." + domain))


def fetch(url: str, proxy: str | None, timeout: int = 10) -> str | None:
    """Download *url* and return its HTML as text, or None on any failure.

    Args:
        url: Absolute URL to request.
        proxy: Proxy URL used for both http and https, or None for direct.
        timeout: Timeout in seconds passed to the opener.

    Returns:
        The body (at most MAX_BYTES read from the wire, gunzipped when the
        server says it is gzip) decoded as UTF-8 with undecodable bytes
        dropped; None when the request fails or the response declares a
        non-HTML Content-Type.
    """
    request = urllib.request.Request(url, headers={
        "User-Agent": UA,
        "Accept-Encoding": "gzip",
        "Accept": "text/html,application/xhtml+xml",
    })
    handlers = []
    if proxy:
        handlers.append(
            urllib.request.ProxyHandler({"http": proxy, "https": proxy}))
    opener = urllib.request.build_opener(*handlers)
    try:
        with opener.open(request, timeout=timeout) as resp:
            ctype = resp.headers.get("Content-Type", "")
            # A missing Content-Type is given the benefit of the doubt.
            if ctype and "html" not in ctype.lower():
                return None
            raw = resp.read(MAX_BYTES)
            encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
    except Exception:
        # Deliberate best-effort boundary: any network, TLS, HTTP or URL
        # error simply means "no page" for this candidate URL.
        return None
    if encoding == "gzip":
        raw = gunzip_partial(raw)
    return raw.decode("utf-8", "ignore")


def best_email(html: str, domain: str) -> str | None:
    """Pick the most useful contact address found in *html*.

    Addresses containing any BAD_EMAIL_BITS fragment are discarded. Addresses
    on *domain* (or a subdomain of it) are preferred over all others; within
    the chosen pool, the first address whose local part starts with an entry
    of PREFER wins (PREFER order is the priority), else the first address.

    Returns:
        A lower-cased email address, or None when nothing usable was found.
    """
    candidates = []
    for match in EMAIL_RE.findall(html):
        addr = match.lower()
        if not any(bit in addr for bit in BAD_EMAIL_BITS):
            candidates.append(addr)
    if not candidates:
        return None

    on_domain = [a for a in candidates
                 if is_same_domain(a.partition("@")[2], domain)]
    pool = on_domain or candidates
    for pref in PREFER:
        for addr in pool:
            if addr.partition("@")[0].startswith(pref):
                return addr
    return pool[0]


def scrape_domain(domain: str, proxy: str | None) -> str | None:
    """Try each path in PATHS on *domain* and return the first email found.

    For every path the bare host is requested first; the "www." host is only
    tried when the bare host yields no page. Returns None when no page on
    any path produces an email.
    """
    for path in PATHS:
        for origin in (f"https://{domain}", f"https://www.{domain}"):
            page = fetch(origin + path, proxy)
            if not page:
                continue  # nothing fetched; fall through to the www. variant
            email = best_email(page, domain)
            if email:
                return email
            break  # got a page for this path; don't retry www for same path
    return None
def main() -> int:
    """Read the issuer CSV, scrape one email per usable domain, write the result.

    Command line: INFILE OUTFILE [--proxy URL] [--sleep SECONDS].

    Every input row is written back out with an added (or overwritten)
    "email" column, empty when nothing was found. Rows whose domain is blank
    or matches is_filing_agent_domain() are not scraped. Progress and a final
    summary go to stderr.

    Returns:
        0, the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile")
    parser.add_argument("outfile")
    parser.add_argument("--proxy", default=None)
    parser.add_argument("--sleep", type=float, default=0.4)
    args = parser.parse_args()

    with open(args.infile, newline="", encoding="utf-8") as src:
        reader = csv.DictReader(src)
        # Take the header from the reader rather than the first row so a
        # header-only (or completely empty) input does not crash.
        header = list(reader.fieldnames or [])
        rows = list(reader)

    got = 0
    for i, row in enumerate(rows, 1):
        dom = (row.get("domain") or "").strip().lower()
        if dom and is_filing_agent_domain(dom):
            dom = ""  # filing-agent artifact, not the issuer's site
        email = scrape_domain(dom, args.proxy) if dom else None
        row["email"] = email or ""
        if email:
            got += 1
        if i % 25 == 0:
            print(f"  {i}/{len(rows)} | emails {got}", file=sys.stderr)
        if dom:
            # Pause only after rows that actually hit the network.
            time.sleep(args.sleep)

    fields = header if "email" in header else header + ["email"]
    with open(args.outfile, "w", newline="", encoding="utf-8") as dst:
        writer = csv.DictWriter(dst, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)
    print(f"issuers: {len(rows):,} | with domain: {sum(1 for r in rows if r.get('domain'))} "
          f"| emails scraped: {got} ({100*got/max(len(rows),1):.0f}%)", file=sys.stderr)
    print(f"  -> {args.outfile}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())