""" fl_entity_downloader.py — Download Florida Sunbiz corporation data via SFTP. Florida provides free bulk data via SFTP: Host: sftp.floridados.gov User: Public Pass: PubAccess1845! Data is fixed-width text (see https://dos.sunbiz.org/data-definitions/cor.html). Quarterly full dump at doc/Quarterly/Cor/cordata.zip (~1.7GB compressed, ~4M entities). Daily diffs at doc/cor/YYYYMMDDc.txt. Strategy: Download the latest daily diffs (faster than the full quarterly). For initial load: download the full quarterly dump. Usage: # Download daily diffs for the past 7 days: python -m workers.fl_entity_downloader --daily # Download full quarterly dump (slow, ~1.7GB): python -m workers.fl_entity_downloader --full # Dry run (parse but don't save): python -m workers.fl_entity_downloader --daily --dry-run Environment: DATABASE_URL PostgreSQL connection string """ from __future__ import annotations import argparse import logging import os import sys import tempfile import zipfile from datetime import datetime, timedelta import psycopg2 LOG = logging.getLogger("workers.fl_entity_downloader") logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s", stream=sys.stdout, ) DATABASE_URL = os.environ.get("DATABASE_URL", "") SFTP_HOST = "sftp.floridados.gov" SFTP_USER = "Public" SFTP_PASS = "PubAccess1845!" # Fixed-width field positions for FL corporation data # See: https://dos.sunbiz.org/data-definitions/cor.html # Positions are 0-indexed, (start, end) FIELDS = { "entity_number": (0, 12), "entity_name": (12, 200), "status": (200, 201), # A=Active, I=Inactive "filing_type": (201, 205), # FLAL=FL LLC, FORL=Foreign LLC, DOMP=Domestic Profit Corp, etc. "address1": (220, 282), "city": (344, 372), "state": (372, 374), "zip": (374, 384), "formation_date": (398, 406), # MMDDYYYY "formation_state": (424, 426), # 2-letter state code for formation jurisdiction } def parse_fl_line(line: str) -> dict | None: """Parse one fixed-width line from FL corporation data.""" if len(line) < 430: return None entity_number = line[0:12].strip() entity_name = line[12:200].strip().upper() status_code = line[200:201].strip() filing_type = line[201:205].strip() city = line[344:372].strip() state = line[372:374].strip() zip_code = line[374:384].strip() # Formation date (MMDDYYYY) date_str = line[398:406].strip() formation_date = None if date_str and len(date_str) == 8: try: formation_date = datetime.strptime(date_str, "%m%d%Y").strftime("%Y-%m-%d") except ValueError: pass # Formation state formation_state = line[424:426].strip() or None if formation_state == "FL": formation_state = "FL" if not entity_name or not entity_number: return None # Normalize status status = "ACTIVE" if status_code == "A" else "INACTIVE" # Normalize entity type from filing_type code entity_type = None ft = filing_type.upper() if "L" in ft: # FLAL, FORL, etc. entity_type = "LLC" elif "P" in ft or "C" in ft: # DOMP, FORP, etc. entity_type = "CORPORATION" elif ft.startswith("LP") or ft.startswith("FLP"): entity_type = "LP" # Determine if foreign is_foreign = ft.startswith("FOR") or ft.startswith("FO") if is_foreign and not formation_state: formation_state = None # Unknown foreign origin address = f"{city}, {state} {zip_code}".strip(", ") return { "entity_name": entity_name, "entity_number": entity_number, "entity_type": entity_type, "status": status, "formation_date": formation_date, "formation_state": formation_state if formation_state != "FL" or not is_foreign else formation_state, "jurisdiction": "US_FL", "state": "FL", "principal_address": address if city else None, } def download_daily(days: int = 7, dry_run: bool = False) -> int: """Download and parse the last N days of daily diff files.""" import paramiko LOG.info("Connecting to FL Sunbiz SFTP...") ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS) sftp = ssh.open_sftp() entities: list[dict] = [] today = datetime.now() for i in range(days): date = today - timedelta(days=i) filename = f"/Public/doc/cor/{date.strftime('%Y%m%d')}c.txt" try: with sftp.open(filename, "rb") as f: for line in f: entity = parse_fl_line(line.decode("latin-1", errors="ignore")) if entity: entities.append(entity) LOG.info(" %s: parsed %d cumulative entities", filename, len(entities)) except FileNotFoundError: pass # No file for weekends/holidays except Exception as exc: LOG.warning(" %s: %s", filename, exc) sftp.close() ssh.close() LOG.info("Downloaded %d FL entities from %d daily files", len(entities), days) if entities and not dry_run: return _upsert(entities) return len(entities) def download_full(dry_run: bool = False) -> int: """Download and parse the full quarterly corporation dump.""" import paramiko LOG.info("Connecting to FL Sunbiz SFTP for full quarterly download...") ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS) sftp = ssh.open_sftp() with tempfile.TemporaryDirectory(prefix="fl_corp_") as tmpdir: zip_path = os.path.join(tmpdir, "cordata.zip") LOG.info("Downloading cordata.zip (~1.7GB)...") sftp.get("/Public/doc/Quarterly/Cor/cordata.zip", zip_path) sftp.close() ssh.close() LOG.info("Extracting ZIP...") entities: list[dict] = [] with zipfile.ZipFile(zip_path, "r") as zf: for name in zf.namelist(): if not name.endswith(".txt"): continue LOG.info(" Parsing %s...", name) with zf.open(name) as f: for line in f: entity = parse_fl_line(line.decode("latin-1", errors="ignore")) if entity: entities.append(entity) if len(entities) % 500000 == 0 and len(entities) > 0: LOG.info(" %d entities parsed...", len(entities)) LOG.info("Total: %d FL entities parsed", len(entities)) if entities and not dry_run: return _upsert(entities) return len(entities) def _upsert(entities: list[dict]) -> int: """Upsert FL entities into entity_cache.""" conn = psycopg2.connect(DATABASE_URL) cur = conn.cursor() count = 0 # Deduplicate seen: set = set() deduped: list = [] for e in entities: key = (e["jurisdiction"], e["entity_number"]) if key not in seen: seen.add(key) deduped.append(e) LOG.info("Upserting %d entities (deduped from %d)...", len(deduped), len(entities)) try: for batch_start in range(0, len(deduped), 500): batch = deduped[batch_start:batch_start + 500] values = [] for e in batch: values.append(cur.mogrify( "(%s,%s,%s,%s,%s,%s,%s,%s,%s,'sftp')", ( e["jurisdiction"], e["entity_name"], e["entity_number"], e["entity_type"], e["status"], e["formation_date"], e["state"], e.get("formation_state"), e.get("principal_address"), ), ).decode()) sql = f""" INSERT INTO entity_cache (jurisdiction, entity_name, entity_number, entity_type, status, formation_date, state, formation_state, principal_address, source) VALUES {",".join(values)} ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET entity_name = EXCLUDED.entity_name, entity_type = EXCLUDED.entity_type, status = EXCLUDED.status, formation_date = EXCLUDED.formation_date, formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state), principal_address = EXCLUDED.principal_address, last_synced = NOW() """ cur.execute(sql) count += len(batch) if count % 100000 == 0: LOG.info(" Upserted %d...", count) conn.commit() conn.commit() except Exception as exc: LOG.error("DB error: %s", exc) conn.rollback() finally: cur.close() conn.close() LOG.info("Done: upserted %d FL entities", count) return count def main(): parser = argparse.ArgumentParser(description="Download Florida Sunbiz corporation data") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--daily", action="store_true", help="Download last 7 days of daily diffs") group.add_argument("--full", action="store_true", help="Download full quarterly dump (~1.7GB)") parser.add_argument("--days", type=int, default=7, help="Number of days for daily mode (default: 7)") parser.add_argument("--dry-run", action="store_true", help="Parse but don't save to DB") args = parser.parse_args() if not DATABASE_URL and not args.dry_run: LOG.error("DATABASE_URL not set") sys.exit(1) if args.daily: count = download_daily(days=args.days, dry_run=args.dry_run) else: count = download_full(dry_run=args.dry_run) LOG.info("Complete: %d entities", count) if __name__ == "__main__": main()