#!/usr/bin/env python3
# Provenance: scripts/harvest_otc_issuers.py, added by commit fdea97e5
# ("otc: EDGAR harvester for US-domestic OTC issuers + domain-from-filings",
# justin, 2026-06-14).
"""Harvest US-domestic OTC SEC issuers with their website domain (from EDGAR).

The reincorporation / registered-agent / foreign-qualification / franchise-tax /
annual-report audience. EDGAR is free and bulk-OK (10 req/s, declare a UA).

Pipeline:
  1. company_tickers_exchange.json -> issuers on exchange OTC or None
  2. per-CIK submissions JSON -> keep US-domestic issuers only: BOTH
     stateOfIncorporation AND the business-address stateOrCountry must be a US
     state code (a bare 'DE' is ambiguous: Delaware vs. Germany); record name,
     ticker, CIK, state, phone, business address
  3. pull recent 10-K/10-Q/8-K/DEF 14A/20-F/S-1 filing HTML -> regex the
     company's own website domain (drop filing-agent / CDN / boilerplate noise)
Output CSV: cik,name,ticker,state_inc,phone,city,state,zip,domain
(Email is a SEPARATE step: scrape domain -> contact/IR email, then verify.)

Usage:
  python3 scripts/harvest_otc_issuers.py OUT.csv [--limit N] [--de-nv-only] [--sleep SECS]
"""
from __future__ import annotations

import argparse
import csv
import http.client
import json
import re
import sys
import time
import urllib.request
from collections.abc import Callable

UA = "Performance West Inc compliance@performancewest.net"
TICKERS_URL = "https://www.sec.gov/files/company_tickers_exchange.json"
US_STATES = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL",
    "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT",
    "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "DC",
}

# Group 1 is the last two labels of the host ("acme.com" for both
# "www.acme.com" and "ir.acme.com").
#   * The lazy subdomain prefix lets hosts such as fonts.googleapis.com match,
#     so the DOMAIN_NOISE filter below can actually reject them.
#   * The trailing lookahead demands that the TLD really ends there. Without it
#     "https://www.usa.gov" matched as "www.us" and "https://www.commerce.gov"
#     as "www.com". It also rejects compound suffixes such as ".co.uk", while a
#     sentence-ending period ("... acme.com.") is still accepted.
URL_RE = re.compile(
    r"https?://(?:[a-z0-9\-]+\.)*?"
    r"([a-z0-9\-]+\.(?:com|net|io|co|us))"
    r"(?![a-z0-9\-]|\.[a-z0-9])",
    re.I,
)
# Substrings that mark a matched domain as boilerplate rather than the issuer's
# own site (tested with ``in`` against the lower-cased domain).
DOMAIN_NOISE = ("sec.gov", "xbrl", "w3.org", "schema", "adobe", "fasb",
                "rdgfilings", "edgar", "secdatabase", "donnelley",
                "broadridge", "toppanmerrill", "dfinsolutions", "workiva",
                "cloudfront", "googleapis", "gstatic", "jquery", "bootstrap")

# Form types whose primary document is scanned for the issuer's website.
_DOMAIN_FORMS = frozenset(("10-K", "10-Q", "8-K", "DEF 14A", "20-F", "S-1"))
_SUBMISSIONS_URL = "https://data.sec.gov/submissions/CIK{cik}.json"
_ARCHIVE_URL = "https://www.sec.gov/Archives/edgar/data/{cik}/{accn}/{doc}"
_CSV_FIELDS = ["cik", "name", "ticker", "state_inc",
               "phone", "city", "state", "zip", "domain"]
# What a best-effort fetch/parse may raise: URLError, HTTPError and socket
# timeouts are OSError; bad JSON is ValueError; truncated or malformed HTTP
# responses are http.client.HTTPException.
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def get(url: str, timeout: int = 15) -> str:
    """Fetch *url* with the declared User-Agent and return the body as text.

    The body is decoded as UTF-8 with undecodable bytes dropped. Network and
    HTTP errors propagate to the caller.
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    # Context manager closes the connection even when read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode("utf-8", "ignore")


def get_json(url: str, timeout: int = 15):
    """Fetch *url* and return its body parsed as JSON."""
    return json.loads(get(url, timeout))


def _best_domain(html: str) -> str:
    """Return the most frequent non-noise domain linked in *html*, or ""."""
    counts: dict[str, int] = {}
    for match in URL_RE.finditer(html):
        dom = match.group(1).lower()
        if any(noise in dom for noise in DOMAIN_NOISE):
            continue
        counts[dom] = counts.get(dom, 0) + 1
    if not counts:
        return ""
    # Ties resolve to the domain seen first in the document.
    return max(counts, key=counts.__getitem__)


def domain_from_filings(
    cik: str,
    recent: dict,
    *,
    fetch: Callable[[str], str] | None = None,
    max_docs: int = 2,
    max_errors: int = 3,
    pause: float = 0.2,
) -> str:
    """Guess an issuer's website domain from its recent EDGAR filings.

    Walks the parallel ``form`` / ``accessionNumber`` / ``primaryDocument``
    lists of *recent* in order, downloads the primary document of each filing
    whose form type is in ``_DOMAIN_FORMS``, and returns the most frequent
    non-noise domain of the first document that contains one.

    Args:
        cik: Issuer CIK; leading zeros are allowed.
        recent: The ``filings.recent`` object of a submissions JSON.
        fetch: Callable mapping a URL to its text; defaults to :func:`get`.
        max_docs: Stop after this many fetched documents yielded no domain.
        max_errors: Stop after this many failed downloads.
        pause: Seconds to sleep before the next request after a miss or error.

    Returns:
        The lower-cased domain, or "" when none was found.
    """
    if fetch is None:
        fetch = get
    try:
        cik_num = int(cik)
    except (TypeError, ValueError):
        return ""

    forms = recent.get("form") or []
    accns = recent.get("accessionNumber") or []
    docs = recent.get("primaryDocument") or []

    misses = errors = 0
    # zip() stops at the shortest list, so ragged input cannot index out of range.
    for form, accn, doc in zip(forms, accns, docs):
        if form not in _DOMAIN_FORMS or not accn or not doc:
            continue
        url = _ARCHIVE_URL.format(cik=cik_num, accn=accn.replace("-", ""), doc=doc)
        try:
            html = fetch(url)
        except _FETCH_ERRORS:
            # Failures are counted and paced too; otherwise an erroring or
            # throttling server would get a request for every matching filing
            # back-to-back.
            errors += 1
            if errors >= max_errors:
                break
            time.sleep(pause)
            continue
        domain = _best_domain(html)
        if domain:
            return domain
        misses += 1
        if misses >= max_docs:
            break
        time.sleep(pause)
    return ""


def main(argv: list[str] | None = None) -> int:
    """Run the harvest and write the CSV; returns the process exit code.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("outfile")
    ap.add_argument("--limit", type=int, default=0,
                    help="cap issuers written to the CSV (0=all)")
    ap.add_argument("--de-nv-only", action="store_true",
                    help="only Delaware/Nevada issuers (prime reincorp targets)")
    ap.add_argument("--sleep", type=float, default=0.12)
    args = ap.parse_args(argv)

    table = get_json(TICKERS_URL)
    fields = table["fields"]
    ci, ni, ti, ei = (fields.index(k) for k in ("cik", "name", "ticker", "exchange"))
    otc = [r for r in table["data"] if r[ei] in ("OTC", None)]
    print(f"OTC/None issuers: {len(otc):,}", file=sys.stderr)

    kept = with_domain = foreign = errors = 0
    # Rows are written as they are produced, so an interrupted run keeps the
    # rows gathered so far.
    with open(args.outfile, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        w.writeheader()
        for n, r in enumerate(otc, 1):
            if args.limit and kept >= args.limit:
                break
            cik = str(r[ci]).zfill(10)
            try:
                sub = get_json(_SUBMISSIONS_URL.format(cik=cik))
            except _FETCH_ERRORS:
                # Best effort: skip this issuer, but count it and keep pacing.
                errors += 1
                time.sleep(args.sleep)
                continue

            soi = (sub.get("stateOfIncorporation") or "").strip()
            ba = (sub.get("addresses") or {}).get("business") or {}
            addr_loc = (ba.get("stateOrCountry") or "").strip()
            # The 2-letter stateOfIncorporation is ambiguous: 'DE' = Delaware OR
            # Germany, etc. Disambiguate with the business address: a real
            # US-domestic issuer has BOTH a US state-of-incorporation AND a
            # US-state business address. Foreign ADRs (Infineon: soi=DE but
            # addr country=2M) are correctly excluded.
            if soi not in US_STATES or addr_loc not in US_STATES:
                foreign += 1
                time.sleep(args.sleep)
                continue
            if args.de_nv_only and soi not in ("DE", "NV"):
                time.sleep(args.sleep)
                continue

            recent = (sub.get("filings") or {}).get("recent") or {}
            domain = domain_from_filings(cik, recent)
            w.writerow({
                "cik": cik,
                "name": sub.get("name") or r[ni],
                "ticker": r[ti] or "",
                "state_inc": soi,
                "phone": sub.get("phone") or "",
                "city": ba.get("city") or "",
                "state": addr_loc,
                "zip": ba.get("zipCode") or "",
                "domain": domain,
            })
            kept += 1
            if domain:
                with_domain += 1
            if kept % 25 == 0:
                print(f"  processed {n} | us={kept} found_domain={with_domain}",
                      file=sys.stderr)
            time.sleep(args.sleep)

    print(f"US-domestic issuers: {kept:,} (skipped {foreign:,} foreign)", file=sys.stderr)
    print(f"  with a domain: {with_domain:,} ({100*with_domain/max(kept,1):.0f}%)",
          file=sys.stderr)
    if errors:
        print(f"  submissions fetch failures: {errors:,}", file=sys.stderr)
    print(f"  -> {args.outfile}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())