#!/usr/bin/env python3
"""Pull the OTC-Markets (pink-sheets) corporate-services lead list from SEC EDGAR.

EDGAR is free, public, and explicitly OK to bulk-download (max 10 req/sec,
declare a User-Agent with a contact). We pull two things:

  1. company_tickers_exchange.json -- the master ticker/exchange map. We keep
     issuers whose exchange is "OTC" (or blank/None, which is also
     off-major-exchange). This is the ~2,771-issuer OTC SEC-filer universe.
  2. submissions/CIK{cik}.json (one per issuer) -- per-company detail: state of
     incorporation, business + mailing address, phone, SIC industry, entity
     type, filer category (= public-float size class), and last filing date.

We then FILTER to the genuine prospect set (see
docs/research-otc-markets-lead-source.md, sections 3 + 4b):

  - US-domestic incorporation only (drops foreign ADRs we can't redomesticate;
    also keeps us cleanly under CAN-SPAM, away from CASL/GDPR).
  - microcaps only: drop "Large accelerated filer" / "Accelerated filer" (those
    keep securities counsel on retainer -- not our lane). Keep Non-accelerated /
    Smaller reporting / Emerging growth / blank.
  - actively filing: drop issuers with no filing in the last ~13 months
    (delinquent / dark shells).

Output: a CSV of the ~700-850 active US-domestic microcap issuers with
everything needed to segment + reach them (EDGAR has no email, so we capture
phone + address + website for enrichment / direct mail / cold call).

Usage:
    python3 scripts/otc_lead_pull.py                      # full pull -> data/otc_leads.csv
    python3 scripts/otc_lead_pull.py --out PATH
    python3 scripts/otc_lead_pull.py --limit 200          # sample run (first N OTC ciks)
    python3 scripts/otc_lead_pull.py --include-large      # keep accelerated/large filers
    python3 scripts/otc_lead_pull.py --include-foreign    # keep foreign-incorporated
    python3 scripts/otc_lead_pull.py --include-stale      # keep delinquent/dark issuers
    python3 scripts/otc_lead_pull.py --max-stale-days 395 # active-filer cutoff (default 395)
    python3 scripts/otc_lead_pull.py --keep-rejects PATH  # also write the dropped issuers (QA)
    python3 scripts/otc_lead_pull.py --rps 6              # requests/sec (<=10 per SEC policy)
    python3 scripts/otc_lead_pull.py --contact EMAIL      # contact for the User-Agent header

Set OTC_SEC_CONTACT (or pass --contact) to your real contact e-mail for the
User-Agent header. SEC requires a contact; default falls back to the ops address.
"""
from __future__ import annotations

import argparse
import csv
import datetime
import json
import os
import sys
import tempfile
import time
import urllib.error
import urllib.request

TICKERS_EXCHANGE_URL = "https://www.sec.gov/files/company_tickers_exchange.json"
SUBMISSIONS_URL = "https://data.sec.gov/submissions/CIK{cik:010d}.json"

# `or` (rather than a getenv default) so that an environment variable that is
# set but empty still falls back: an empty OTC_DATA_DIR would otherwise write
# relative to the CWD, and an empty contact would produce a User-Agent with no
# contact address in it.
DATA_DIR = os.getenv("OTC_DATA_DIR") or os.path.join(os.path.dirname(__file__), "..", "data")
DEFAULT_OUT = os.path.join(DATA_DIR, "otc_leads.csv")
DEFAULT_CONTACT = os.getenv("OTC_SEC_CONTACT") or "compliance@performancewest.net"

# OTC issuers carry these exchange tags in company_tickers_exchange.json. Blank/None
# = off-major-exchange (also OTC/expert-market in practice), so we keep it too.
OTC_EXCHANGES = {"OTC", "", None}

# SEC public-float size classes (Rule 12b-2). We drop these "large" ones -- they
# keep securities counsel on retainer and won't buy a flat-fee filing service.
LARGE_FILER_MARKERS = ("large accelerated filer", "accelerated filer")
# Note: "non-accelerated filer" contains the substring "accelerated filer", so we
# match on word-boundary-ish logic in is_large_filer() rather than naive `in`.
US_STATES = set(
    "AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO "
    "MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY DC".split()
)

# Column order of the output CSV (the rejects file appends "drop_reason").
OUT_FIELDS = [
    "cik", "name", "ticker", "all_tickers", "exchange",
    "state_of_incorporation", "incorporation_desc", "is_us_domestic",
    "sic", "sic_desc", "entity_type", "filer_category", "filer_size_bucket",
    "biz_street1", "biz_street2", "biz_city", "biz_state", "biz_zip",
    "mail_street1", "mail_city", "mail_state", "mail_zip",
    "phone", "website", "investor_website", "ein", "former_names",
    "last_filing_date", "last_filing_form", "is_active",
]

# Only the first few fetch failures are logged individually; the total is
# always reported in the final summary.
_MAX_LOGGED_FETCH_ERRORS = 20


def log(msg: str) -> None:
    """Print *msg* to stdout with a local-time timestamp prefix, flushing immediately."""
    print(f"[{datetime.datetime.now():%Y-%m-%d %H:%M:%S}] {msg}", flush=True)


def http_json(url: str, contact: str, timeout: int = 20) -> dict:
    """GET *url* and return the decoded JSON body.

    *contact* is embedded in the User-Agent header. Errors raised by urllib or
    the JSON decoder propagate to the caller.
    """
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": f"Performance West OTC lead pull ({contact})",
            # urllib does not auto-decompress; ask for identity to keep it simple.
            "Accept-Encoding": "identity",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.load(resp)


def _category_labels(category: str) -> list[str]:
    """Split a filer-category string into its individual, stripped labels.

    NOTE(review): the separator literal was unreadable in the reviewed copy of
    this file; it is presumably the HTML ``<br>`` that the EDGAR submissions
    feed puts between size classes -- confirm against a live payload. Both
    ``<br>`` variants and real line breaks are accepted here.
    """
    text = category or ""
    for sep in ("<br />", "<br/>", "<br>"):
        text = text.replace(sep, "\n")
    return [label.strip() for label in text.splitlines() if label.strip()]


def is_large_filer(category: str) -> bool:
    """True only for accelerated / large-accelerated filers (>= $75M float).

    "Non-accelerated filer" must NOT match even though it contains
    "accelerated filer" as a substring, so check the leading words of each
    category label rather than using a naive substring test.
    """
    markers = ("large accelerated filer", "accelerated filer")
    return any(
        label.startswith(markers)
        for label in _category_labels((category or "").lower())
    )


def size_bucket(category: str) -> str:
    """Map a filer-category string onto a coarse public-float size bucket.

    A blank category is treated as smaller-reporting; anything unrecognised
    falls through to "other".
    """
    cat = (category or "").lower()
    if "large accelerated filer" in cat:
        return "large_accelerated_>=700M"
    # accelerated but not large, and not non-accelerated
    if is_large_filer(category):
        return "accelerated_75M-700M"
    if "smaller reporting company" in cat or "non-accelerated" in cat or "emerging growth" in cat or not cat:
        return "smaller_reporting_<75M"
    return "other"


def addr(d: dict, key: str) -> dict:
    """Return ``d["addresses"][key]`` as a dict, or ``{}`` if any level is missing/None."""
    return (d.get("addresses", {}) or {}).get(key, {}) or {}


def _unique_by_cik(rows: list, cik_ix: int) -> list:
    """Keep the first row for each CIK, preserving the original order.

    The master map is iterated per row, and the same CIK can appear on more
    than one row (one per ticker); without this the issuer would be fetched
    and emitted once per ticker.
    """
    seen = set()
    unique = []
    for row in rows:
        cik = row[cik_ix]
        if cik in seen:
            continue
        seen.add(cik)
        unique.append(row)
    return unique


def _last_filing(sub: dict) -> tuple[str, str]:
    """Return (date, form) taken from index 0 of the ``filings.recent`` arrays, or ''."""
    recent = (sub.get("filings") or {}).get("recent") or {}
    dates = recent.get("filingDate") or []
    forms = recent.get("form") or []
    return (dates[0] if dates else "", forms[0] if forms else "")


def _is_recent(last_date: str, today: datetime.date, max_stale_days: int) -> bool:
    """True when *last_date* (ISO yyyy-mm-dd) is within *max_stale_days* of *today*."""
    if not last_date:
        return False
    try:
        filed = datetime.date.fromisoformat(last_date)
    except ValueError:
        # Unparseable date: treat as not active rather than failing the run.
        return False
    return (today - filed).days <= max_stale_days


def _build_record(cik, ticker, exchange, sub: dict, today: datetime.date, max_stale_days: int) -> dict:
    """Flatten one submissions payload plus its master-map row into an output row."""
    soi = (sub.get("stateOfIncorporation") or "").strip()
    category = sub.get("category") or ""
    last_date, last_form = _last_filing(sub)
    biz, mail = addr(sub, "business"), addr(sub, "mailing")
    return {
        "cik": cik,
        # `or ""` so a null name cannot break the case-insensitive sort later.
        "name": sub.get("name") or "",
        "ticker": ticker,
        "all_tickers": "|".join(t for t in (sub.get("tickers") or []) if t),
        "exchange": exchange or "",
        "state_of_incorporation": soi,
        "incorporation_desc": sub.get("stateOfIncorporationDescription") or "",
        "is_us_domestic": "Y" if soi in US_STATES else "N",
        "sic": sub.get("sic") or "",
        "sic_desc": sub.get("sicDescription") or "",
        "entity_type": sub.get("entityType") or "",
        "filer_category": " / ".join(_category_labels(category)),
        "filer_size_bucket": size_bucket(category),
        "biz_street1": biz.get("street1", "") or "",
        "biz_street2": biz.get("street2", "") or "",
        "biz_city": biz.get("city", "") or "",
        "biz_state": biz.get("stateOrCountry", "") or "",
        "biz_zip": biz.get("zipCode", "") or "",
        "mail_street1": mail.get("street1", "") or "",
        "mail_city": mail.get("city", "") or "",
        "mail_state": mail.get("stateOrCountry", "") or "",
        "mail_zip": mail.get("zipCode", "") or "",
        "phone": sub.get("phone", "") or "",
        "website": sub.get("website", "") or "",
        "investor_website": sub.get("investorWebsite", "") or "",
        "ein": sub.get("ein", "") or "",
        "former_names": "|".join((fn.get("name") or "") for fn in (sub.get("formerNames") or [])),
        "last_filing_date": last_date,
        "last_filing_form": last_form,
        "is_active": "Y" if _is_recent(last_date, today, max_stale_days) else "N",
    }


def _write_csv(path: str, fields: list[str], rows: list[dict]) -> str:
    """Write *rows* to *path* via a temp file + rename; return the absolute path."""
    target = os.path.abspath(path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    tmp = target + ".tmp"
    with open(tmp, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fields)
        w.writeheader()
        for rec in rows:
            w.writerow({k: rec.get(k, "") for k in fields})
    # Rename only after a complete write so a failed run never leaves a
    # truncated file at the final path.
    os.replace(tmp, target)
    return target


def main() -> int:
    """Run the pull: fetch the master map, fetch each issuer, filter, write CSV(s).

    Returns 0 on success, or 1 when every per-issuer request failed (in which
    case no output file is written or replaced).
    """
    # Function-scope imports: only this entry point needs them.
    import http.client
    from collections import Counter

    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--out", default=DEFAULT_OUT)
    ap.add_argument("--contact", default=DEFAULT_CONTACT,
                    help="contact e-mail for the SEC User-Agent header")
    ap.add_argument("--limit", type=int, default=0,
                    help="only process the first N OTC ciks (sampling/testing)")
    ap.add_argument("--rps", type=float, default=6.0,
                    help="requests/sec to data.sec.gov (SEC policy max is 10)")
    ap.add_argument("--max-stale-days", type=int, default=395,
                    help="drop issuers whose last filing is older than this (active filter)")
    ap.add_argument("--include-large", action="store_true",
                    help="keep accelerated/large-accelerated filers (default: drop)")
    ap.add_argument("--include-foreign", action="store_true",
                    help="keep foreign-incorporated issuers (default: US-domestic only)")
    ap.add_argument("--include-stale", action="store_true",
                    help="keep delinquent/dark issuers (default: drop stale filers)")
    ap.add_argument("--keep-rejects", default="",
                    help="optional path to also write the dropped issuers (for QA)")
    args = ap.parse_args()

    if args.rps > 10:
        log("WARNING: --rps above SEC fair-access limit (10); clamping to 8")
        args.rps = 8.0
    elif args.rps <= 0:
        # A non-positive rate used to mean "no delay at all", i.e. unthrottled
        # requests -- the opposite of what the clamp above is protecting.
        log("WARNING: --rps must be positive; using the default of 6")
        args.rps = 6.0
    delay = 1.0 / args.rps
    today = datetime.date.today()

    log("Fetching master ticker/exchange map ...")
    master = http_json(TICKERS_EXCHANGE_URL, args.contact)
    ix = {f: i for i, f in enumerate(master["fields"])}
    rows = master["data"]
    otc = _unique_by_cik([r for r in rows if r[ix["exchange"]] in OTC_EXCHANGES], ix["cik"])
    if args.limit > 0:
        otc = otc[: args.limit]
    log(f"OTC/off-exchange issuers to inspect: {len(otc)} (of {len(rows)} total tickers)")

    kept: list[dict] = []
    rejects: list[dict] = []
    stats = {"foreign": 0, "large": 0, "stale": 0, "fetch_err": 0, "kept": 0}

    for n, row in enumerate(otc, 1):
        cik = row[ix["cik"]]
        try:
            sub = http_json(SUBMISSIONS_URL.format(cik=cik), args.contact, timeout=15)
            if not isinstance(sub, dict):
                raise ValueError(f"unexpected payload type {type(sub).__name__}")
        except (OSError, http.client.HTTPException, ValueError) as e:
            # OSError covers URLError/HTTPError/timeouts/connection resets;
            # HTTPException covers truncated or dropped responses; ValueError
            # covers malformed JSON. One bad issuer must not abort the pull.
            stats["fetch_err"] += 1
            if stats["fetch_err"] <= _MAX_LOGGED_FETCH_ERRORS:
                log(f"  fetch failed for CIK {cik}: {e!r}")
            time.sleep(delay)
            continue

        rec = _build_record(cik, row[ix["ticker"]], row[ix["exchange"]], sub, today, args.max_stale_days)
        is_us = rec["is_us_domestic"] == "Y"
        large = is_large_filer(sub.get("category") or "")
        is_active = rec["is_active"] == "Y"

        # First matching filter wins, so each reject carries exactly one reason.
        drop_reason = None
        if not args.include_foreign and not is_us:
            drop_reason = "foreign"
            stats["foreign"] += 1
        elif not args.include_large and large:
            drop_reason = "large_filer"
            stats["large"] += 1
        elif not args.include_stale and not is_active:
            drop_reason = "stale"
            stats["stale"] += 1

        if drop_reason:
            rec["drop_reason"] = drop_reason
            rejects.append(rec)
        else:
            kept.append(rec)
            stats["kept"] += 1

        if n % 200 == 0:
            log(f"  {n}/{len(otc)} inspected; kept {stats['kept']} so far")
        time.sleep(delay)

    if otc and stats["fetch_err"] == len(otc):
        # Nothing could be fetched; writing now would replace a previous good
        # lead list with an empty file.
        log("ERROR: every submissions request failed; leaving existing output untouched")
        return 1

    # sort: DE/NV (reincorporation targets) first, then by state, then name
    priority = {"DE": 0, "NV": 1}
    kept.sort(key=lambda x: (priority.get(x["state_of_incorporation"], 2),
                             x["state_of_incorporation"], x["name"].lower()))

    out = _write_csv(args.out, OUT_FIELDS, kept)
    if args.keep_rejects:
        _write_csv(args.keep_rejects, OUT_FIELDS + ["drop_reason"], rejects)

    # summary
    by_inc = Counter(r["state_of_incorporation"] for r in kept)
    log("=" * 60)
    log(f"DONE. wrote {len(kept)} leads -> {out}")
    log(f"  inspected {len(otc)} OTC issuers; fetch errors {stats['fetch_err']}")
    log(f"  dropped: foreign={stats['foreign']} large_filer={stats['large']} stale={stats['stale']}")
    de, nv = by_inc.get("DE", 0), by_inc.get("NV", 0)
    log(f"  incorporation of kept: DE={de} NV={nv} (DE+NV={de+nv}) TX={by_inc.get('TX',0)}")
    log(f"  top incorporation states: {by_inc.most_common(8)}")
    have_phone = sum(1 for r in kept if r["phone"])
    have_web = sum(1 for r in kept if r["website"] or r["investor_website"])
    log(f"  reachability: phone {have_phone}/{len(kept)}  website {have_web}/{len(kept)}  (EDGAR has no email)")
    return 0


if __name__ == "__main__":
    sys.exit(main())