""" wy_entity_scraper.py — Bulk scrape Wyoming business entities from WyoBiz. WyoBiz (wyobiz.wyo.gov) doesn't offer bulk downloads without a $10K+ subscription. This scraper paginates through search results to populate entity_cache. Strategy: Search by year prefix (2020-, 2021-, etc.) to get all filings, then paginate through results. WyoBiz shows 10 results per page. Usage: # Scrape all years (2000-2026): python -m workers.wy_entity_scraper # Scrape specific year range: python -m workers.wy_entity_scraper --start-year 2023 --end-year 2026 # Resume from a specific year: python -m workers.wy_entity_scraper --start-year 2024 # Dry run (don't save to DB): python -m workers.wy_entity_scraper --dry-run --start-year 2025 Environment: DATABASE_URL PostgreSQL connection string """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from datetime import datetime import psycopg2 import psycopg2.extras LOG = logging.getLogger("workers.wy_entity_scraper") logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s", stream=sys.stdout, ) DATABASE_URL = os.environ.get("DATABASE_URL", "") WYOBIZ_SEARCH_URL = "https://wyobiz.wyo.gov/Business/FilingSearch.aspx" # WY filing IDs are formatted as YYYY-NNNNNNN (year + 7-digit sequence) # Approximate counts per year: ~8,000-15,000 new filings/year # Rate limiting — new searches are the risky action, pagination is safe PAGE_DELAY_MIN = 1.5 # Seconds between pagination clicks (same search, low risk) PAGE_DELAY_MAX = 3.0 SEARCH_DELAY_MIN = 15 # Seconds between NEW searches (new prefix = new session risk) SEARCH_DELAY_MAX = 30 def _human_delay(min_s: float = PAGE_DELAY_MIN, max_s: float = PAGE_DELAY_MAX): """Random delay to mimic human browsing speed.""" delay = random.uniform(min_s, max_s) time.sleep(delay) def scrape_prefix(prefix: str, conn, dry_run: bool = False) -> int: """Scrape all entities matching a name prefix from WyoBiz. Uses "Starts With" name search with 2-letter prefixes (AA, AB, ... ZZ) to enumerate all entities without triggering anti-scraping. 
""" from playwright.sync_api import sync_playwright LOG.info("Scraping WY entities starting with '%s'", prefix) entities: list[dict] = [] try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) context = browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/124.0.0.0 Safari/537.36", viewport={"width": 1366, "height": 768}, ) page = context.new_page() # Navigate to search page page.goto(WYOBIZ_SEARCH_URL, wait_until="networkidle", timeout=30000) _human_delay(3, 6) # Select "Starts With" radio (default mode) starts_with = page.query_selector("#MainContent_chkSearchStartWith") if starts_with: starts_with.click() _human_delay(0.5, 1) # Type the prefix slowly (human-like) name_input = page.query_selector("#MainContent_txtFilingName") if not name_input: LOG.error("Could not find name search input") browser.close() return 0 for char in prefix: name_input.type(char, delay=random.randint(80, 200)) _human_delay(1, 2) # Click search page.click("#MainContent_cmdSearch") _human_delay(4, 8) # Wait for results try: page.wait_for_selector("#scrolltop", timeout=20000) except Exception: pass page.wait_for_load_state("networkidle", timeout=20000) _human_delay(2, 4) # Check for CAPTCHA page_text = page.inner_text("body") if "captcha" in page_text.lower() or "verify you are human" in page_text.lower(): LOG.warning(" CAPTCHA detected for prefix '%s' — stopping", prefix) browser.close() return -1 # Signal to caller to back off # Parse results pages page_num = 0 while True: page_num += 1 # Extract results from current page rows = page.query_selector_all("table.Grid tr:not(:first-child)") if not rows: rows = page.query_selector_all("#MainContent_gvFilings tr:not(:first-child)") if not rows: content = page.inner_text("#scrolltop") if page.query_selector("#scrolltop") else "" if "no record" in content.lower() or not content.strip(): break if page_num == 1: LOG.info(" Prefix '%s': no results", prefix) break for row in rows: cells = row.query_selector_all("td") if len(cells) < 3: continue cell_texts = [c.inner_text().strip() for c in cells] if len(cell_texts) >= 3: # Columns vary — try to identify them filing_id = None name = None status_raw = "" entity_type_raw = "" formation_date_raw = "" # First cell with YYYY- pattern is filing ID for i, txt in enumerate(cell_texts): if re.match(r"\d{4}-\d+", txt): filing_id = txt elif not name and len(txt) > 2 and not re.match(r"\d{2}/\d{2}/\d{4}", txt): if txt.upper() not in ("ACTIVE", "INACTIVE", "DISSOLVED", "LLC", "CORPORATION"): name = txt.upper() # Remaining cells by position after name if name and len(cell_texts) >= 3: for txt in cell_texts: t = txt.upper() if t in ("ACTIVE", "INACTIVE", "DISSOLVED", "DELINQUENT", "GOOD STANDING", "REVOKED", "SUSPENDED", "CANCELLED"): status_raw = t elif "LLC" in t or "CORP" in t or "LP" in t or "LIMITED" in t: if t != name: entity_type_raw = t elif re.match(r"\d{2}/\d{2}/\d{4}", txt): formation_date_raw = txt if not filing_id: # Use name as entity_number fallback filing_id = f"WY_{name[:30]}" if name else None if name and filing_id: # Normalize status status = "ACTIVE" s = status_raw.upper() if "DISSOLV" in s or "CANCEL" in s: status = "DISSOLVED" elif "DELINQ" in s or "DEFAULT" in s: status = "DELINQUENT" elif "SUSPEND" in s or "REVOK" in s: status = "SUSPENDED" elif "INACTIVE" in s or "WITHDRAWN" in s: status = "INACTIVE" # Normalize type entity_type = None t = entity_type_raw.upper() if "LLC" in t or "LIMITED LIABILITY" in t: entity_type = 
"LLC" elif "CORP" in t or "INC" in t: entity_type = "CORPORATION" elif "LP" in t: entity_type = "LP" # Parse date formation_date = None if formation_date_raw: try: formation_date = datetime.strptime( formation_date_raw.split()[0], "%m/%d/%Y" ).strftime("%Y-%m-%d") except (ValueError, IndexError): pass entities.append({ "entity_name": name, "entity_number": filing_id, "entity_type": entity_type, "status": status, "formation_date": formation_date, "formation_state": "WY", "jurisdiction": "US_WY", "state": "WY", }) # Try to go to next page next_link = page.query_selector("a[href*='Page$Next']") if not next_link: break next_link.click() page.wait_for_load_state("networkidle", timeout=20000) _human_delay(PAGE_DELAY_MIN, PAGE_DELAY_MAX) # 1.5-3s between pages # Safety limits — if we hit max pages, prefix needs to go deeper if page_num > 200: LOG.warning(" Prefix '%s': hit 200 page limit — needs deeper split", prefix) # Return negative to signal caller to split this prefix browser.close() if entities and not dry_run: _upsert(conn, entities) return -(len(entities)) # Negative = needs split if page_num % 50 == 0: LOG.info(" Prefix '%s': page %d (%d entities)", prefix, page_num, len(entities)) browser.close() except Exception as exc: LOG.error("Scraper error for prefix '%s': %s", prefix, exc) LOG.info(" Prefix '%s': scraped %d entities", prefix, len(entities)) # Upsert to DB if entities and not dry_run: count = _upsert(conn, entities) LOG.info(" Prefix '%s': upserted %d to entity_cache", prefix, count) return count return len(entities) def _upsert(conn, entities: list[dict]) -> int: """Upsert scraped WY entities into entity_cache.""" cur = conn.cursor() count = 0 # Deduplicate seen = set() deduped = [] for e in entities: key = (e["jurisdiction"], e["entity_number"]) if key not in seen: seen.add(key) deduped.append(e) try: for batch_start in range(0, len(deduped), 200): batch = deduped[batch_start:batch_start + 200] values = [] for e in batch: values.append(cur.mogrify( "(%s,%s,%s,%s,%s,%s,%s,%s,%s,'playwright')", ( e["jurisdiction"], e["entity_name"], e["entity_number"], e["entity_type"], e["status"], e["formation_date"], e["state"], e.get("formation_state"), None, # principal_address ), ).decode()) sql = f""" INSERT INTO entity_cache (jurisdiction, entity_name, entity_number, entity_type, status, formation_date, state, formation_state, principal_address, source) VALUES {",".join(values)} ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET entity_name = EXCLUDED.entity_name, entity_type = EXCLUDED.entity_type, status = EXCLUDED.status, formation_date = EXCLUDED.formation_date, formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state), last_synced = NOW() """ cur.execute(sql) count += len(batch) conn.commit() except Exception as exc: LOG.error("DB upsert error: %s", exc) conn.rollback() return count def _generate_prefixes(start: str = "AA", end: str = "ZZ") -> list[str]: """Generate 2-letter prefixes from start to end (inclusive).""" prefixes = [] for c1 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": for c2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": p = c1 + c2 if p >= start and p <= end: prefixes.append(p) # Also add numeric prefixes (1A, 2B, etc. 
def main():
    parser = argparse.ArgumentParser(description="Scrape Wyoming business entities from WyoBiz")
    parser.add_argument("--start", type=str, default="AA", help="Start prefix (default: AA)")
    parser.add_argument("--end", type=str, default="ZZ", help="End prefix (default: ZZ)")
    parser.add_argument("--prefix", type=str, help="Single prefix to scrape (e.g., GT)")
    parser.add_argument("--dry-run", action="store_true", help="Don't save to database")
    args = parser.parse_args()

    if not DATABASE_URL and not args.dry_run:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)

    conn = psycopg2.connect(DATABASE_URL) if not args.dry_run else None
    total = 0

    if args.prefix:
        prefixes = [args.prefix.upper()]
    else:
        prefixes = _generate_prefixes(args.start.upper(), args.end.upper())
    LOG.info("Will scrape %d prefixes: %s ... %s", len(prefixes), prefixes[0], prefixes[-1])

    for i, prefix in enumerate(prefixes):
        count = scrape_prefix(prefix, conn, dry_run=args.dry_run)

        if count == -1:
            # CAPTCHA detected — back off significantly
            LOG.warning("CAPTCHA hit — backing off 5 minutes before continuing")
            time.sleep(300)
            continue

        if count < 0:
            # Negative = too many results, need deeper split (e.g., "AB" → "ABA", "ABB", ...)
            LOG.info("Splitting prefix '%s' into 3-letter sub-prefixes", prefix)
            total += abs(count)  # Count what was already saved
            for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                sub_prefix = prefix + c
                sub_count = scrape_prefix(sub_prefix, conn, dry_run=args.dry_run)
                if sub_count == -1:
                    LOG.warning("CAPTCHA during split — backing off 5 min")
                    time.sleep(300)
                elif sub_count < 0:
                    # Even 3 letters too broad — go to 4 (rare)
                    total += abs(sub_count)
                    for c2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                        sub4 = sub_prefix + c2
                        s4_count = scrape_prefix(sub4, conn, dry_run=args.dry_run)
                        if s4_count > 0:
                            total += s4_count
                        _human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)
                else:
                    total += sub_count
                _human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)
            continue

        total += count

        # Progress
        if (i + 1) % 10 == 0:
            LOG.info("Progress: %d/%d prefixes done (%d total entities)",
                     i + 1, len(prefixes), total)

        # Longer pause between prefixes
        _human_delay(SEARCH_DELAY_MIN, SEARCH_DELAY_MAX)

    LOG.info("Done: %d total WY entities scraped across %d prefixes", total, len(prefixes))
    if conn:
        conn.close()


if __name__ == "__main__":
    main()