new-site/scripts/workers/fl_entity_downloader.py

"""
fl_entity_downloader.py — Download Florida Sunbiz corporation data via SFTP.
Florida provides free bulk data via SFTP:
Host: sftp.floridados.gov
User: Public
Pass: PubAccess1845!
Data is fixed-width text (see https://dos.sunbiz.org/data-definitions/cor.html).
Quarterly full dump at doc/Quarterly/Cor/cordata.zip (~1.7GB compressed, ~4M entities).
Daily diffs at doc/cor/YYYYMMDDc.txt.
Strategy: Download the latest daily diffs (faster than the full quarterly).
For initial load: download the full quarterly dump.
Usage:
# Download daily diffs for the past 7 days:
python -m workers.fl_entity_downloader --daily
# Download full quarterly dump (slow, ~1.7GB):
python -m workers.fl_entity_downloader --full
# Dry run (parse but don't save):
python -m workers.fl_entity_downloader --daily --dry-run
Environment:
DATABASE_URL PostgreSQL connection string
"""
from __future__ import annotations

import argparse
import logging
import os
import sys
import tempfile
import zipfile
from datetime import datetime, timedelta

import psycopg2

LOG = logging.getLogger("workers.fl_entity_downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    stream=sys.stdout,
)

DATABASE_URL = os.environ.get("DATABASE_URL", "")
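# Public-access credentials published by the Florida Division of Corporations
# for its free bulk-data SFTP service (see the module docstring).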
SFTP_HOST = "sftp.floridados.gov"
SFTP_USER = "Public"
SFTP_PASS = "PubAccess1845!"
# Fixed-width field positions for FL corporation data
# See: https://dos.sunbiz.org/data-definitions/cor.html
# Positions are 0-indexed, (start, end)
FIELDS = {
    "entity_number": (0, 12),
    "entity_name": (12, 200),
    "status": (200, 201),           # A=Active, I=Inactive
    "filing_type": (201, 205),      # FLAL=FL LLC, FORL=Foreign LLC, DOMP=Domestic Profit Corp, etc.
    "address1": (220, 282),
    "city": (344, 372),
    "state": (372, 374),
    "zip": (374, 384),
    "formation_date": (398, 406),   # MMDDYYYY
    "formation_state": (424, 426),  # 2-letter state code for formation jurisdiction
}
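
# Gaps between the listed positions (e.g. 205-220, 282-344) are record fields
# this worker does not use. A generic extraction over the map would look like:
#     record = {name: line[start:end].strip() for name, (start, end) in FIELDS.items()}
# parse_fl_line below inlines the same slices instead.
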
def parse_fl_line(line: str) -> dict | None:
    """Parse one fixed-width line from FL corporation data."""
    if len(line) < 430:
        return None
    entity_number = line[0:12].strip()
    entity_name = line[12:200].strip().upper()
    status_code = line[200:201].strip()
    filing_type = line[201:205].strip()
    city = line[344:372].strip()
    state = line[372:374].strip()
    zip_code = line[374:384].strip()
    # Formation date (MMDDYYYY)
    date_str = line[398:406].strip()
    formation_date = None
    if date_str and len(date_str) == 8:
        try:
            formation_date = datetime.strptime(date_str, "%m%d%Y").strftime("%Y-%m-%d")
        except ValueError:
            pass
    # Formation state (2-letter code; blank on some records)
    formation_state = line[424:426].strip() or None
    if not entity_name or not entity_number:
        return None
    # Normalize status
    status = "ACTIVE" if status_code == "A" else "INACTIVE"
    # Normalize entity type from the filing_type code. Check LP prefixes first:
    # "LP*" and "FLP*" contain "L", so the LLC test would otherwise shadow them.
    entity_type = None
    ft = filing_type.upper()
    if ft.startswith("LP") or ft.startswith("FLP"):
        entity_type = "LP"
    elif "L" in ft:  # FLAL, FORL, etc.
        entity_type = "LLC"
    elif "P" in ft or "C" in ft:  # DOMP, FORP, etc.
        entity_type = "CORPORATION"
    # Foreign (out-of-state) entities file under FO*/FOR* codes and report their
    # home jurisdiction in the record; domestic filings are formed in Florida.
    is_foreign = ft.startswith("FO")
    if not is_foreign:
        formation_state = "FL"
    address = f"{city}, {state} {zip_code}".strip(", ")
    return {
        "entity_name": entity_name,
        "entity_number": entity_number,
        "entity_type": entity_type,
        "status": status,
        "formation_date": formation_date,
        "formation_state": formation_state,
        "jurisdiction": "US_FL",
        "state": "FL",
        "principal_address": address if city else None,
    }
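
# Example with a synthetic record padded to the 430-char minimum (values are
# made up; positions follow FIELDS above):
#     line = ("L21000123456" + "ACME HOLDINGS LLC".ljust(188) + "A" + "FLAL").ljust(430)
#     parse_fl_line(line)
#     # -> {"entity_name": "ACME HOLDINGS LLC", "entity_type": "LLC",
#     #     "status": "ACTIVE", "jurisdiction": "US_FL", ...}
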
def download_daily(days: int = 7, dry_run: bool = False) -> int:
    """Download and parse the last N days of daily diff files."""
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP...")
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = ssh.open_sftp()
    entities: list[dict] = []
    today = datetime.now()
    for i in range(days):
        date = today - timedelta(days=i)
        filename = f"/Public/doc/cor/{date.strftime('%Y%m%d')}c.txt"
        try:
            with sftp.open(filename, "rb") as f:
                for line in f:
                    entity = parse_fl_line(line.decode("latin-1", errors="ignore"))
                    if entity:
                        entities.append(entity)
            LOG.info("  %s: parsed %d cumulative entities", filename, len(entities))
        except FileNotFoundError:
            pass  # No file published for weekends/holidays
        except Exception as exc:
            LOG.warning("  %s: %s", filename, exc)
    sftp.close()
    ssh.close()
    LOG.info("Downloaded %d FL entities from %d daily files", len(entities), days)
    if entities and not dry_run:
        return _upsert(entities)
    return len(entities)

def download_full(dry_run: bool = False) -> int:
    """Download and parse the full quarterly corporation dump."""
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP for full quarterly download...")
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = ssh.open_sftp()
    with tempfile.TemporaryDirectory(prefix="fl_corp_") as tmpdir:
        zip_path = os.path.join(tmpdir, "cordata.zip")
        LOG.info("Downloading cordata.zip (~1.7GB)...")
        sftp.get("/Public/doc/Quarterly/Cor/cordata.zip", zip_path)
        sftp.close()
        ssh.close()
        LOG.info("Extracting ZIP...")
        entities: list[dict] = []
        with zipfile.ZipFile(zip_path, "r") as zf:
            for name in zf.namelist():
                if not name.endswith(".txt"):
                    continue
                LOG.info("  Parsing %s...", name)
                with zf.open(name) as f:
                    for line in f:
                        entity = parse_fl_line(line.decode("latin-1", errors="ignore"))
                        if entity:
                            entities.append(entity)
                            if len(entities) % 500000 == 0:
                                LOG.info("  %d entities parsed...", len(entities))
    LOG.info("Total: %d FL entities parsed", len(entities))
    if entities and not dry_run:
        return _upsert(entities)
    return len(entities)
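
# _upsert assumes entity_cache has a unique constraint on
# (jurisdiction, entity_number); the ON CONFLICT clause below depends on it.
# A minimal sketch of the assumed table (column types are guesses):
#     CREATE TABLE IF NOT EXISTS entity_cache (
#         jurisdiction      text NOT NULL,
#         entity_number     text NOT NULL,
#         entity_name       text,
#         entity_type       text,
#         status            text,
#         formation_date    date,
#         state             text,
#         formation_state   text,
#         principal_address text,
#         source            text,
#         last_synced       timestamptz DEFAULT NOW(),
#         UNIQUE (jurisdiction, entity_number)
#     );
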
def _upsert(entities: list[dict]) -> int:
    """Upsert FL entities into entity_cache."""
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()
    count = 0
    # Deduplicate on the upsert key. Daily files are read newest-first, so the
    # first record seen wins; this also avoids "cannot affect row a second
    # time" errors when a key repeats inside one multi-row INSERT.
    seen: set = set()
    deduped: list = []
    for e in entities:
        key = (e["jurisdiction"], e["entity_number"])
        if key not in seen:
            seen.add(key)
            deduped.append(e)
    LOG.info("Upserting %d entities (deduped from %d)...", len(deduped), len(entities))
    try:
        for batch_start in range(0, len(deduped), 500):
            batch = deduped[batch_start:batch_start + 500]
            values = []
            for e in batch:
                values.append(cur.mogrify(
                    "(%s,%s,%s,%s,%s,%s,%s,%s,%s,'sftp')",
                    (
                        e["jurisdiction"], e["entity_name"], e["entity_number"],
                        e["entity_type"], e["status"], e["formation_date"],
                        e["state"], e.get("formation_state"),
                        e.get("principal_address"),
                    ),
                ).decode())
            sql = f"""
                INSERT INTO entity_cache
                  (jurisdiction, entity_name, entity_number, entity_type, status,
                   formation_date, state, formation_state, principal_address, source)
                VALUES {",".join(values)}
                ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET
                  entity_name = EXCLUDED.entity_name,
                  entity_type = EXCLUDED.entity_type,
                  status = EXCLUDED.status,
                  formation_date = EXCLUDED.formation_date,
                  formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state),
                  principal_address = EXCLUDED.principal_address,
                  last_synced = NOW()
            """
            cur.execute(sql)
            conn.commit()  # Commit per batch so a late failure keeps earlier progress
            count += len(batch)
            if count % 100000 == 0:
                LOG.info("  Upserted %d...", count)
    except Exception as exc:
        LOG.error("DB error: %s", exc)
        conn.rollback()
    finally:
        cur.close()
        conn.close()
    LOG.info("Done: upserted %d FL entities", count)
    return count

def main():
    parser = argparse.ArgumentParser(description="Download Florida Sunbiz corporation data")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--daily", action="store_true", help="Download daily diffs for the last N days (see --days)")
    group.add_argument("--full", action="store_true", help="Download full quarterly dump (~1.7GB)")
    parser.add_argument("--days", type=int, default=7, help="Number of days for daily mode (default: 7)")
    parser.add_argument("--dry-run", action="store_true", help="Parse but don't save to DB")
    args = parser.parse_args()
    if not DATABASE_URL and not args.dry_run:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)
    if args.daily:
        count = download_daily(days=args.days, dry_run=args.dry_run)
    else:
        count = download_full(dry_run=args.dry_run)
    LOG.info("Complete: %d entities", count)


if __name__ == "__main__":
    main()