The master file lists warrants/units as separate tickers under one CIK, so the pull now dedupes to one row per company (other tickers kept in all_tickers). data/otc_leads.csv: 861 unique active US-domestic microcap OTC issuers (<$75M float, all actively filing, 100% with business address + phone). By incorporation: DE 365, NV 325 (DE+NV=690 = the reincorporation targets), WY 44, FL 39, MD 38. Dropped from the 2,771 OTC universe: 1,672 foreign, 62 accelerated/large filers, 73 delinquent/dark. EDGAR has no email -> phone + address captured for enrichment / direct mail / call.
299 lines
13 KiB
Python
299 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""Pull the OTC-Markets (pink-sheets) corporate-services lead list from SEC EDGAR.
|
|
|
|
EDGAR is free, public, and explicitly OK to bulk-download (max 10 req/sec, declare
|
|
a User-Agent with a contact). We pull two things:
|
|
|
|
1. company_tickers_exchange.json -- the master ticker/exchange map. We keep
|
|
issuers whose exchange is "OTC" (or blank/None, which is also off-major-
|
|
exchange). This is the ~2,771-issuer OTC SEC-filer universe.
|
|
2. submissions/CIK{cik}.json (one per issuer) -- per-company detail: state of
|
|
incorporation, business + mailing address, phone, SIC industry, entity type,
|
|
filer category (= public-float size class), and last filing date.
|
|
|
|
We then FILTER to the genuine prospect set (see docs/research-otc-markets-lead-
|
|
source.md, sections 3 + 4b):
|
|
|
|
- US-domestic incorporation only (drops foreign ADRs we can't redomesticate;
|
|
also keeps us cleanly under CAN-SPAM, away from CASL/GDPR).
|
|
- microcaps only: drop "Large accelerated filer" / "Accelerated filer"
|
|
(those keep securities counsel on retainer -- not our lane). Keep
|
|
Non-accelerated / Smaller reporting / Emerging growth / blank.
|
|
- actively filing: drop issuers with no filing in the last ~13 months
|
|
(delinquent / dark shells).
|
|
|
|
Output: a CSV of the ~700-850 active US-domestic microcap issuers with everything
|
|
needed to segment + reach them (EDGAR has no email, so we capture phone + address
|
|
+ website for enrichment / direct mail / cold call).
|
|
|
|
Usage:
|
|
python3 scripts/otc_lead_pull.py # full pull -> data/otc_leads.csv
|
|
python3 scripts/otc_lead_pull.py --out PATH
|
|
python3 scripts/otc_lead_pull.py --limit 200 # sample run (first N OTC ciks)
|
|
python3 scripts/otc_lead_pull.py --include-large # keep accelerated/large filers
|
|
python3 scripts/otc_lead_pull.py --include-foreign # keep foreign-incorporated
|
|
python3 scripts/otc_lead_pull.py --max-stale-days 395 # active-filer cutoff (default 395)
|
|
python3 scripts/otc_lead_pull.py --rps 6 # requests/sec (<=10 per SEC policy)
|
|
|
|
Set OTC_SEC_CONTACT (or pass --contact) to your real contact e-mail for the
|
|
User-Agent header. SEC requires a contact; default falls back to the ops address.
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse, csv, datetime, json, os, sys, tempfile, time
|
|
import urllib.request, urllib.error
|
|
|
|
# --- EDGAR endpoints -------------------------------------------------------
# Master ticker/exchange map, and the per-issuer submissions document (the CIK
# is zero-padded to ten digits by the format spec).
TICKERS_EXCHANGE_URL = "https://www.sec.gov/files/company_tickers_exchange.json"
SUBMISSIONS_URL = "https://data.sec.gov/submissions/CIK{cik:010d}.json"

# --- Defaults (overridable through the environment) ------------------------
# OTC_DATA_DIR relocates the output directory; otherwise ../data next to this
# script is used. OTC_SEC_CONTACT supplies the User-Agent contact address.
DATA_DIR = os.getenv(
    "OTC_DATA_DIR",
    os.path.join(os.path.dirname(__file__), "..", "data"),
)
DEFAULT_OUT = os.path.join(DATA_DIR, "otc_leads.csv")
DEFAULT_CONTACT = os.getenv("OTC_SEC_CONTACT", "compliance@performancewest.net")

# Exchange tags treated as OTC in company_tickers_exchange.json. A blank or
# missing tag means "not on a major exchange", which we also keep.
OTC_EXCHANGES = {None, "", "OTC"}

# SEC public-float size classes (Rule 12b-2) that we drop: these issuers keep
# securities counsel on retainer and won't buy a flat-fee filing service.
# NOTE: "non-accelerated filer" contains "accelerated filer" as a substring,
# which is why is_large_filer() matches on line prefixes instead of a naive `in`.
LARGE_FILER_MARKERS = (
    "large accelerated filer",
    "accelerated filer",
)

# Two-letter codes accepted as US-domestic incorporation: 50 states plus DC.
US_STATES = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
    "DC",
}

# Column order of the output CSV.
OUT_FIELDS = [
    # identity
    "cik",
    "name",
    "ticker",
    "all_tickers",
    "exchange",
    # incorporation
    "state_of_incorporation",
    "incorporation_desc",
    "is_us_domestic",
    # classification
    "sic",
    "sic_desc",
    "entity_type",
    "filer_category",
    "filer_size_bucket",
    # business address
    "biz_street1",
    "biz_street2",
    "biz_city",
    "biz_state",
    "biz_zip",
    # mailing address
    "mail_street1",
    "mail_city",
    "mail_state",
    "mail_zip",
    # contact / identifiers
    "phone",
    "website",
    "investor_website",
    "ein",
    # history / activity
    "former_names",
    "last_filing_date",
    "last_filing_form",
    "is_active",
]
|
|
|
|
|
|
def log(msg: str) -> None:
    """Print *msg* to stdout, prefixed with a ``[YYYY-MM-DD HH:MM:SS]`` stamp."""
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Flush on every call so output is not held back in the stdout buffer.
    print(f"[{stamp}] {msg}", flush=True)
|
|
|
|
|
|
def http_json(url: str, contact: str, timeout: int = 20) -> dict:
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address to request.
        contact: Contact string embedded in the ``User-Agent`` header.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The value ``json.load`` produces from the response body.

    Raises:
        Nothing is caught here: whatever ``urlopen`` or ``json.load`` raise
        (e.g. ``urllib.error.URLError``, ``json.JSONDecodeError``) propagates
        to the caller.
    """
    headers = {
        "User-Agent": f"Performance West OTC lead pull ({contact})",
        # urllib does not auto-decompress, so ask for an uncompressed body.
        # (Previously the request first advertised "gzip, deflate" and then
        # overrode it; only this one value was ever sent.)
        "Accept-Encoding": "identity",
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.load(resp)
|
|
|
|
|
|
def is_large_filer(category: str) -> bool:
    """Report whether *category* names an accelerated or large-accelerated filer.

    The category string may hold several classes separated by ``<br>`` (or
    newlines). Each class is matched by its leading words, so that
    "Non-accelerated filer" -- which merely *contains* "accelerated filer" --
    is not mistaken for an accelerated filer. Matching is case-insensitive and
    an empty/None category is never large.
    """
    prefixes = ("large accelerated filer", "accelerated filer")
    normalized = (category or "").lower().replace("<br>", "\n")
    return any(part.strip().startswith(prefixes) for part in normalized.split("\n"))
|
|
|
|
|
|
def size_bucket(category: str) -> str:
    """Map an EDGAR filer-category string onto one of four size-bucket labels.

    Checked in order: large accelerated, then accelerated (per
    ``is_large_filer``), then the small classes (smaller reporting /
    non-accelerated / emerging growth, or an empty category); anything else
    is "other".
    """
    lowered = (category or "").lower()
    if "large accelerated filer" in lowered:
        return "large_accelerated_>=700M"
    # Reaching here means "large" was ruled out, so a positive is_large_filer
    # result can only be a plain accelerated filer.
    if is_large_filer(category):
        return "accelerated_75M-700M"
    small_markers = ("smaller reporting company", "non-accelerated", "emerging growth")
    if not lowered or any(marker in lowered for marker in small_markers):
        return "smaller_reporting_<75M"
    return "other"
|
|
|
|
|
|
def addr(d: dict, key: str) -> dict:
    """Return ``d["addresses"][key]``, or ``{}`` when either level is missing or falsy."""
    addresses = d.get("addresses", {})
    if not addresses:
        return {}
    entry = addresses.get(key, {})
    return entry if entry else {}
|
|
|
|
|
|
def _write_csv_atomic(path: str, fieldnames: list, records: list) -> None:
    """Write *records* to *path* as CSV through a temp file that is renamed into place."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    tmp = path + ".tmp"
    try:
        with open(tmp, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            for rec in records:
                w.writerow({k: rec.get(k, "") for k in fieldnames})
        os.replace(tmp, path)
    except BaseException:
        # Never leave a half-written temp file next to the real output.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise


def main() -> int:
    """Run the lead pull: fetch, classify, filter, and write the CSV(s).

    Steps:
      1. Parse CLI flags.
      2. Fetch the master ticker/exchange map, keep rows whose exchange is in
         ``OTC_EXCHANGES`` and dedupe to one row per CIK (first ticker wins).
      3. Fetch each issuer's submissions document, throttled to ``--rps``.
      4. Build one output record per issuer and drop it if it is foreign,
         a large/accelerated filer, or stale (each check can be disabled by
         its ``--include-*`` flag). The first matching reason is recorded.
      5. Sort kept records (DE, then NV, then other states; then by name) and
         write them to ``--out``; optionally write the rejects too.
      6. Log a summary.

    Returns:
        0 on success; 1 when every per-issuer fetch failed (nothing is written
        in that case so a previous good output file is not clobbered).
        An invalid ``--rps`` exits through ``argparse`` with status 2.
    """
    # Function-scope imports: only this entry point needs them.
    import http.client
    from collections import Counter

    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--out", default=DEFAULT_OUT)
    ap.add_argument("--contact", default=DEFAULT_CONTACT,
                    help="contact e-mail for the SEC User-Agent header")
    ap.add_argument("--limit", type=int, default=0,
                    help="only process the first N OTC ciks (sampling/testing)")
    ap.add_argument("--rps", type=float, default=6.0,
                    help="requests/sec to data.sec.gov (SEC policy max is 10)")
    ap.add_argument("--max-stale-days", type=int, default=395,
                    help="drop issuers whose last filing is older than this (active filter)")
    ap.add_argument("--include-large", action="store_true",
                    help="keep accelerated/large-accelerated filers (default: drop)")
    ap.add_argument("--include-foreign", action="store_true",
                    help="keep foreign-incorporated issuers (default: US-domestic only)")
    ap.add_argument("--include-stale", action="store_true",
                    help="keep delinquent/dark issuers (default: drop stale filers)")
    ap.add_argument("--keep-rejects", default="",
                    help="optional path to also write the dropped issuers (for QA)")
    args = ap.parse_args()

    # A zero/negative (or NaN) rate used to switch throttling off entirely,
    # which would hammer data.sec.gov; refuse it instead.
    if not args.rps > 0:
        ap.error("--rps must be a positive number (SEC policy max is 10)")
    if args.rps > 10:
        log("WARNING: --rps above SEC fair-access limit (10); clamping to 8")
        args.rps = 8.0
    delay = 1.0 / args.rps
    today = datetime.date.today()

    log("Fetching master ticker/exchange map ...")
    master = http_json(TICKERS_EXCHANGE_URL, args.contact)
    fields = master["fields"]
    ix = {f: i for i, f in enumerate(fields)}  # column name -> position in each row
    rows = master["data"]
    otc = [r for r in rows if r[ix["exchange"]] in OTC_EXCHANGES]

    # The master file lists warrants/units as separate tickers under the same CIK
    # (e.g. ABPO + ABPWW = Abpro). Dedupe to one row per company (first ticker
    # wins); the issuer's other tickers are still captured in `all_tickers`.
    seen_cik: set[int] = set()
    deduped = []
    for r in otc:
        cik = r[ix["cik"]]
        if cik in seen_cik:
            continue
        seen_cik.add(cik)
        deduped.append(r)
    if len(deduped) != len(otc):
        log(f"deduped {len(otc) - len(deduped)} warrant/unit ticker rows -> {len(deduped)} unique companies")
    otc = deduped

    # Only a positive limit truncates; a negative one used to slice from the end.
    if args.limit > 0:
        otc = otc[: args.limit]
    log(f"OTC/off-exchange issuers to inspect: {len(otc)} (of {len(rows)} total tickers)")

    kept: list[dict] = []
    rejects: list[dict] = []
    stats = {"foreign": 0, "large": 0, "stale": 0, "fetch_err": 0, "kept": 0}

    # OSError covers URLError/HTTPError, timeouts (incl. socket.timeout on
    # older Pythons) and connection resets during the body read; HTTPException
    # covers truncated responses; ValueError covers undecodable / non-JSON bodies.
    fetch_failures = (OSError, http.client.HTTPException, ValueError)
    max_logged_errors = 5

    for n, r in enumerate(otc, 1):
        cik = r[ix["cik"]]
        url = SUBMISSIONS_URL.format(cik=cik)
        try:
            j = http_json(url, args.contact, timeout=15)
        except fetch_failures as e:
            stats["fetch_err"] += 1
            if stats["fetch_err"] <= max_logged_errors:
                log(f"  fetch error for CIK {cik}: {e!r}")
            elif stats["fetch_err"] == max_logged_errors + 1:
                log("  further fetch errors are counted but not logged individually")
            time.sleep(delay)
            continue

        soi = (j.get("stateOfIncorporation") or "").strip()
        is_us = soi in US_STATES
        category = j.get("category") or ""
        large = is_large_filer(category)

        # "filings" / "recent" may be absent or null; treat both as "no filings".
        recent = (j.get("filings") or {}).get("recent") or {}
        dates = recent.get("filingDate") or []
        forms = recent.get("form") or []
        # The first entry of each list is taken as the most recent filing.
        last_date = dates[0] if dates else ""
        last_form = forms[0] if forms else ""
        is_active = False
        if last_date:
            try:
                d = datetime.date.fromisoformat(last_date)
                is_active = (today - d).days <= args.max_stale_days
            except ValueError:
                # Unparseable date: leave the issuer marked inactive.
                pass

        biz, mail = addr(j, "business"), addr(j, "mailing")
        rec = {
            "cik": cik,
            # `or ""` so a null name cannot break the case-insensitive sort below.
            "name": j.get("name") or "",
            "ticker": r[ix["ticker"]],
            "all_tickers": "|".join(j.get("tickers") or []),
            "exchange": r[ix["exchange"]] or "",
            "state_of_incorporation": soi,
            "incorporation_desc": j.get("stateOfIncorporationDescription", ""),
            "is_us_domestic": "Y" if is_us else "N",
            "sic": j.get("sic", ""),
            "sic_desc": j.get("sicDescription", ""),
            "entity_type": j.get("entityType", ""),
            "filer_category": category.replace("<br>", " / "),
            "filer_size_bucket": size_bucket(category),
            "biz_street1": biz.get("street1", "") or "",
            "biz_street2": biz.get("street2", "") or "",
            "biz_city": biz.get("city", "") or "",
            "biz_state": biz.get("stateOrCountry", "") or "",
            "biz_zip": biz.get("zipCode", "") or "",
            "mail_street1": mail.get("street1", "") or "",
            "mail_city": mail.get("city", "") or "",
            "mail_state": mail.get("stateOrCountry", "") or "",
            "mail_zip": mail.get("zipCode", "") or "",
            "phone": j.get("phone", "") or "",
            "website": j.get("website", "") or "",
            "investor_website": j.get("investorWebsite", "") or "",
            "ein": j.get("ein", "") or "",
            "former_names": "|".join((fn.get("name") or "") for fn in (j.get("formerNames") or [])),
            "last_filing_date": last_date,
            "last_filing_form": last_form,
            "is_active": "Y" if is_active else "N",
        }

        # First matching reason wins, so each dropped issuer is counted once.
        drop_reason = None
        if not args.include_foreign and not is_us:
            drop_reason = "foreign"
            stats["foreign"] += 1
        elif not args.include_large and large:
            drop_reason = "large_filer"
            stats["large"] += 1
        elif not args.include_stale and not is_active:
            drop_reason = "stale"
            stats["stale"] += 1

        if drop_reason:
            rec["drop_reason"] = drop_reason
            rejects.append(rec)
        else:
            kept.append(rec)
            stats["kept"] += 1

        if n % 200 == 0:
            log(f"  {n}/{len(otc)} inspected; kept {stats['kept']} so far")
        time.sleep(delay)

    # If not a single issuer could be fetched, something is wrong upstream
    # (blocked User-Agent, outage, ...). Bail out before overwriting a previous
    # good output with a header-only file.
    if otc and stats["fetch_err"] == len(otc):
        log(f"ERROR: all {len(otc)} submissions fetches failed; not writing any output")
        return 1

    # sort: DE/NV (reincorporation targets) first, then by state, then name
    priority = {"DE": 0, "NV": 1}
    kept.sort(key=lambda x: (priority.get(x["state_of_incorporation"], 2),
                             x["state_of_incorporation"], x["name"].lower()))

    out = os.path.abspath(args.out)
    _write_csv_atomic(out, OUT_FIELDS, kept)

    if args.keep_rejects:
        rj = os.path.abspath(args.keep_rejects)
        _write_csv_atomic(rj, OUT_FIELDS + ["drop_reason"], rejects)

    # summary
    by_inc = Counter(r["state_of_incorporation"] for r in kept)
    log("=" * 60)
    log(f"DONE. wrote {len(kept)} leads -> {out}")
    log(f"  inspected {len(otc)} OTC issuers; fetch errors {stats['fetch_err']}")
    log(f"  dropped: foreign={stats['foreign']} large_filer={stats['large']} stale={stats['stale']}")
    de, nv = by_inc.get("DE", 0), by_inc.get("NV", 0)
    log(f"  incorporation of kept: DE={de} NV={nv} (DE+NV={de+nv}) TX={by_inc.get('TX',0)}")
    log(f"  top incorporation states: {by_inc.most_common(8)}")
    have_phone = sum(1 for r in kept if r["phone"])
    have_web = sum(1 for r in kept if r["website"] or r["investor_website"])
    log(f"  reachability: phone {have_phone}/{len(kept)} website {have_web}/{len(kept)} (EDGAR has no email)")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())
|