Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
commit
f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
405
scripts/workers/wy_entity_scraper.py
Normal file
405
scripts/workers/wy_entity_scraper.py
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
"""
|
||||
wy_entity_scraper.py — Bulk scrape Wyoming business entities from WyoBiz.
|
||||
|
||||
WyoBiz (wyobiz.wyo.gov) doesn't offer bulk downloads without a $10K+ subscription.
|
||||
This scraper paginates through search results to populate entity_cache.
|
||||
|
||||
Strategy: Search by year prefix (2020-, 2021-, etc.) to get all filings,
|
||||
then paginate through results. WyoBiz shows 10 results per page.
|
||||
|
||||
Usage:
|
||||
# Scrape all years (2000-2026):
|
||||
python -m workers.wy_entity_scraper
|
||||
|
||||
# Scrape specific year range:
|
||||
python -m workers.wy_entity_scraper --start-year 2023 --end-year 2026
|
||||
|
||||
# Resume from a specific year:
|
||||
python -m workers.wy_entity_scraper --start-year 2024
|
||||
|
||||
# Dry run (don't save to DB):
|
||||
python -m workers.wy_entity_scraper --dry-run --start-year 2025
|
||||
|
||||
Environment:
|
||||
DATABASE_URL PostgreSQL connection string
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
LOG = logging.getLogger("workers.wy_entity_scraper")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
|
||||
stream=sys.stdout,
|
||||
)
|
||||
|
||||
DATABASE_URL = os.environ.get("DATABASE_URL", "")
|
||||
WYOBIZ_SEARCH_URL = "https://wyobiz.wyo.gov/Business/FilingSearch.aspx"
|
||||
|
||||
# WY filing IDs are formatted as YYYY-NNNNNNN (year + 7-digit sequence)
|
||||
# Approximate counts per year: ~8,000-15,000 new filings/year
|
||||
|
||||
# Rate limiting — new searches are the risky action, pagination is safe
|
||||
PAGE_DELAY_MIN = 1.5 # Seconds between pagination clicks (same search, low risk)
|
||||
PAGE_DELAY_MAX = 3.0
|
||||
SEARCH_DELAY_MIN = 15 # Seconds between NEW searches (new prefix = new session risk)
|
||||
SEARCH_DELAY_MAX = 30
|
||||
|
||||
|
||||
def _human_delay(min_s: float = PAGE_DELAY_MIN, max_s: float = PAGE_DELAY_MAX):
|
||||
"""Random delay to mimic human browsing speed."""
|
||||
delay = random.uniform(min_s, max_s)
|
||||
time.sleep(delay)
|
||||
|
||||
|
||||
def scrape_prefix(prefix: str, conn, dry_run: bool = False) -> int:
|
||||
"""Scrape all entities matching a name prefix from WyoBiz.
|
||||
|
||||
Uses "Starts With" name search with 2-letter prefixes (AA, AB, ... ZZ)
|
||||
to enumerate all entities without triggering anti-scraping.
|
||||
"""
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
LOG.info("Scraping WY entities starting with '%s'", prefix)
|
||||
entities: list[dict] = []
|
||||
|
||||
try:
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=True)
|
||||
context = browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/124.0.0.0 Safari/537.36",
|
||||
viewport={"width": 1366, "height": 768},
|
||||
)
|
||||
page = context.new_page()
|
||||
|
||||
# Navigate to search page
|
||||
page.goto(WYOBIZ_SEARCH_URL, wait_until="networkidle", timeout=30000)
|
||||
_human_delay(3, 6)
|
||||
|
||||
# Select "Starts With" radio (default mode)
|
||||
starts_with = page.query_selector("#MainContent_chkSearchStartWith")
|
||||
if starts_with:
|
||||
starts_with.click()
|
||||
_human_delay(0.5, 1)
|
||||
|
||||
# Type the prefix slowly (human-like)
|
||||
name_input = page.query_selector("#MainContent_txtFilingName")
|
||||
if not name_input:
|
||||
LOG.error("Could not find name search input")
|
||||
browser.close()
|
||||
return 0
|
||||
|
||||
for char in prefix:
|
||||
name_input.type(char, delay=random.randint(80, 200))
|
||||
_human_delay(1, 2)
|
||||
|
||||
# Click search
|
||||
page.click("#MainContent_cmdSearch")
|
||||
_human_delay(4, 8)
|
||||
|
||||
# Wait for results
|
||||
try:
|
||||
page.wait_for_selector("#scrolltop", timeout=20000)
|
||||
except Exception:
|
||||
pass
|
||||
page.wait_for_load_state("networkidle", timeout=20000)
|
||||
_human_delay(2, 4)
|
||||
|
||||
# Check for CAPTCHA
|
||||
page_text = page.inner_text("body")
|
||||
if "captcha" in page_text.lower() or "verify you are human" in page_text.lower():
|
||||
LOG.warning(" CAPTCHA detected for prefix '%s' — stopping", prefix)
|
||||
browser.close()
|
||||
return -1 # Signal to caller to back off
|
||||
|
||||
# Parse results pages
|
||||
page_num = 0
|
||||
while True:
|
||||
page_num += 1
|
||||
|
||||
# Extract results from current page
|
||||
rows = page.query_selector_all("table.Grid tr:not(:first-child)")
|
||||
if not rows:
|
||||
rows = page.query_selector_all("#MainContent_gvFilings tr:not(:first-child)")
|
||||
|
||||
if not rows:
|
||||
content = page.inner_text("#scrolltop") if page.query_selector("#scrolltop") else ""
|
||||
if "no record" in content.lower() or not content.strip():
|
||||
break
|
||||
if page_num == 1:
|
||||
LOG.info(" Prefix '%s': no results", prefix)
|
||||
break
|
||||
|
||||
for row in rows:
|
||||
cells = row.query_selector_all("td")
|
||||
if len(cells) < 3:
|
||||
continue
|
||||
|
||||
cell_texts = [c.inner_text().strip() for c in cells]
|
||||
if len(cell_texts) >= 3:
|
||||
# Columns vary — try to identify them
|
||||
filing_id = None
|
||||
name = None
|
||||
status_raw = ""
|
||||
entity_type_raw = ""
|
||||
formation_date_raw = ""
|
||||
|
||||
# First cell with YYYY- pattern is filing ID
|
||||
for i, txt in enumerate(cell_texts):
|
||||
if re.match(r"\d{4}-\d+", txt):
|
||||
filing_id = txt
|
||||
elif not name and len(txt) > 2 and not re.match(r"\d{2}/\d{2}/\d{4}", txt):
|
||||
if txt.upper() not in ("ACTIVE", "INACTIVE", "DISSOLVED", "LLC", "CORPORATION"):
|
||||
name = txt.upper()
|
||||
|
||||
# Remaining cells by position after name
|
||||
if name and len(cell_texts) >= 3:
|
||||
for txt in cell_texts:
|
||||
t = txt.upper()
|
||||
if t in ("ACTIVE", "INACTIVE", "DISSOLVED", "DELINQUENT",
|
||||
"GOOD STANDING", "REVOKED", "SUSPENDED", "CANCELLED"):
|
||||
status_raw = t
|
||||
elif "LLC" in t or "CORP" in t or "LP" in t or "LIMITED" in t:
|
||||
if t != name:
|
||||
entity_type_raw = t
|
||||
elif re.match(r"\d{2}/\d{2}/\d{4}", txt):
|
||||
formation_date_raw = txt
|
||||
|
||||
if not filing_id:
|
||||
# Use name as entity_number fallback
|
||||
filing_id = f"WY_{name[:30]}" if name else None
|
||||
|
||||
if name and filing_id:
|
||||
# Normalize status
|
||||
status = "ACTIVE"
|
||||
s = status_raw.upper()
|
||||
if "DISSOLV" in s or "CANCEL" in s:
|
||||
status = "DISSOLVED"
|
||||
elif "DELINQ" in s or "DEFAULT" in s:
|
||||
status = "DELINQUENT"
|
||||
elif "SUSPEND" in s or "REVOK" in s:
|
||||
status = "SUSPENDED"
|
||||
elif "INACTIVE" in s or "WITHDRAWN" in s:
|
||||
status = "INACTIVE"
|
||||
|
||||
# Normalize type
|
||||
entity_type = None
|
||||
t = entity_type_raw.upper()
|
||||
if "LLC" in t or "LIMITED LIABILITY" in t:
|
||||
entity_type = "LLC"
|
||||
elif "CORP" in t or "INC" in t:
|
||||
entity_type = "CORPORATION"
|
||||
elif "LP" in t:
|
||||
entity_type = "LP"
|
||||
|
||||
# Parse date
|
||||
formation_date = None
|
||||
if formation_date_raw:
|
||||
try:
|
||||
formation_date = datetime.strptime(
|
||||
formation_date_raw.split()[0], "%m/%d/%Y"
|
||||
).strftime("%Y-%m-%d")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
entities.append({
|
||||
"entity_name": name,
|
||||
"entity_number": filing_id,
|
||||
"entity_type": entity_type,
|
||||
"status": status,
|
||||
"formation_date": formation_date,
|
||||
"formation_state": "WY",
|
||||
"jurisdiction": "US_WY",
|
||||
"state": "WY",
|
||||
})
|
||||
|
||||
# Try to go to next page
|
||||
next_link = page.query_selector("a[href*='Page$Next']")
|
||||
if not next_link:
|
||||
break
|
||||
|
||||
next_link.click()
|
||||
page.wait_for_load_state("networkidle", timeout=20000)
|
||||
_human_delay(PAGE_DELAY_MIN, PAGE_DELAY_MAX) # 1.5-3s between pages
|
||||
|
||||
# Safety limits — if we hit max pages, prefix needs to go deeper
|
||||
if page_num > 200:
|
||||
LOG.warning(" Prefix '%s': hit 200 page limit — needs deeper split", prefix)
|
||||
# Return negative to signal caller to split this prefix
|
||||
browser.close()
|
||||
if entities and not dry_run:
|
||||
_upsert(conn, entities)
|
||||
return -(len(entities)) # Negative = needs split
|
||||
|
||||
if page_num % 50 == 0:
|
||||
LOG.info(" Prefix '%s': page %d (%d entities)", prefix, page_num, len(entities))
|
||||
|
||||
browser.close()
|
||||
|
||||
except Exception as exc:
|
||||
LOG.error("Scraper error for prefix '%s': %s", prefix, exc)
|
||||
|
||||
LOG.info(" Prefix '%s': scraped %d entities", prefix, len(entities))
|
||||
|
||||
# Upsert to DB
|
||||
if entities and not dry_run:
|
||||
count = _upsert(conn, entities)
|
||||
LOG.info(" Prefix '%s': upserted %d to entity_cache", prefix, count)
|
||||
return count
|
||||
return len(entities)
|
||||
|
||||
|
||||
def _upsert(conn, entities: list[dict]) -> int:
|
||||
"""Upsert scraped WY entities into entity_cache."""
|
||||
cur = conn.cursor()
|
||||
count = 0
|
||||
|
||||
# Deduplicate
|
||||
seen = set()
|
||||
deduped = []
|
||||
for e in entities:
|
||||
key = (e["jurisdiction"], e["entity_number"])
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
deduped.append(e)
|
||||
|
||||
try:
|
||||
for batch_start in range(0, len(deduped), 200):
|
||||
batch = deduped[batch_start:batch_start + 200]
|
||||
values = []
|
||||
for e in batch:
|
||||
values.append(cur.mogrify(
|
||||
"(%s,%s,%s,%s,%s,%s,%s,%s,%s,'playwright')",
|
||||
(
|
||||
e["jurisdiction"], e["entity_name"], e["entity_number"],
|
||||
e["entity_type"], e["status"], e["formation_date"],
|
||||
e["state"], e.get("formation_state"),
|
||||
None, # principal_address
|
||||
),
|
||||
).decode())
|
||||
|
||||
sql = f"""
|
||||
INSERT INTO entity_cache
|
||||
(jurisdiction, entity_name, entity_number, entity_type, status,
|
||||
formation_date, state, formation_state, principal_address, source)
|
||||
VALUES {",".join(values)}
|
||||
ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET
|
||||
entity_name = EXCLUDED.entity_name,
|
||||
entity_type = EXCLUDED.entity_type,
|
||||
status = EXCLUDED.status,
|
||||
formation_date = EXCLUDED.formation_date,
|
||||
formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state),
|
||||
last_synced = NOW()
|
||||
"""
|
||||
cur.execute(sql)
|
||||
count += len(batch)
|
||||
|
||||
conn.commit()
|
||||
except Exception as exc:
|
||||
LOG.error("DB upsert error: %s", exc)
|
||||
conn.rollback()
|
||||
|
||||
return count
|
||||
|
||||
|
||||
def _generate_prefixes(start: str = "AA", end: str = "ZZ") -> list[str]:
|
||||
"""Generate 2-letter prefixes from start to end (inclusive)."""
|
||||
prefixes = []
|
||||
for c1 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||
for c2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||
p = c1 + c2
|
||||
if p >= start and p <= end:
|
||||
prefixes.append(p)
|
||||
# Also add numeric prefixes (1A, 2B, etc. — for entities starting with numbers)
|
||||
for d in "0123456789":
|
||||
prefixes.append(d)
|
||||
return prefixes
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Scrape Wyoming business entities from WyoBiz")
|
||||
parser.add_argument("--start", type=str, default="AA", help="Start prefix (default: AA)")
|
||||
parser.add_argument("--end", type=str, default="ZZ", help="End prefix (default: ZZ)")
|
||||
parser.add_argument("--prefix", type=str, help="Single prefix to scrape (e.g., GT)")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Don't save to database")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not DATABASE_URL and not args.dry_run:
|
||||
LOG.error("DATABASE_URL not set")
|
||||
sys.exit(1)
|
||||
|
||||
conn = psycopg2.connect(DATABASE_URL) if not args.dry_run else None
|
||||
total = 0
|
||||
|
||||
if args.prefix:
|
||||
prefixes = [args.prefix.upper()]
|
||||
else:
|
||||
prefixes = _generate_prefixes(args.start.upper(), args.end.upper())
|
||||
|
||||
LOG.info("Will scrape %d prefixes: %s ... %s", len(prefixes), prefixes[0], prefixes[-1])
|
||||
|
||||
for i, prefix in enumerate(prefixes):
|
||||
count = scrape_prefix(prefix, conn, dry_run=args.dry_run)
|
||||
|
||||
if count == -1:
|
||||
# CAPTCHA detected — back off significantly
|
||||
LOG.warning("CAPTCHA hit — backing off 5 minutes before continuing")
|
||||
time.sleep(300)
|
||||
continue
|
||||
|
||||
if count < 0:
|
||||
# Negative = too many results, need deeper split (e.g., "AB" → "ABA", "ABB", ...)
|
||||
LOG.info("Splitting prefix '%s' into 3-letter sub-prefixes", prefix)
|
||||
total += abs(count) # Count what was already saved
|
||||
for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||
sub_prefix = prefix + c
|
||||
sub_count = scrape_prefix(sub_prefix, conn, dry_run=args.dry_run)
|
||||
if sub_count == -1:
|
||||
LOG.warning("CAPTCHA during split — backing off 5 min")
|
||||
time.sleep(300)
|
||||
elif sub_count < 0:
|
||||
# Even 3 letters too broad — go to 4 (rare)
|
||||
total += abs(sub_count)
|
||||
for c2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||
sub4 = sub_prefix + c2
|
||||
s4_count = scrape_prefix(sub4, conn, dry_run=args.dry_run)
|
||||
if s4_count > 0:
|
||||
total += s4_count
|
||||
_human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)
|
||||
else:
|
||||
total += sub_count
|
||||
_human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)
|
||||
continue
|
||||
|
||||
total += count
|
||||
|
||||
# Progress
|
||||
if (i + 1) % 10 == 0:
|
||||
LOG.info("Progress: %d/%d prefixes done (%d total entities)", i + 1, len(prefixes), total)
|
||||
|
||||
# Longer pause between prefixes
|
||||
_human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)
|
||||
|
||||
LOG.info("Done: %d total WY entities scraped across %d prefixes", total, len(prefixes))
|
||||
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue