new-site/scripts/workers/fl_entity_downloader.py

"""
fl_entity_downloader.py — Download Florida Sunbiz corporation data via SFTP.
Florida provides free bulk data via SFTP:
Host: sftp.floridados.gov
User: Public
Pass: PubAccess1845!
Data is fixed-width text (see https://dos.sunbiz.org/data-definitions/cor.html).
Quarterly full dump at doc/Quarterly/Cor/cordata.zip (~1.7GB compressed, ~4M entities).
Daily diffs at doc/cor/YYYYMMDDc.txt.
Strategy: Download the latest daily diffs (faster than the full quarterly).
For initial load: download the full quarterly dump.
Usage:
# Download daily diffs for the past 7 days:
python -m workers.fl_entity_downloader --daily
# Download full quarterly dump (slow, ~1.7GB):
python -m workers.fl_entity_downloader --full
# Dry run (parse but don't save):
python -m workers.fl_entity_downloader --daily --dry-run
Environment:
DATABASE_URL PostgreSQL connection string
"""
from __future__ import annotations

import argparse
import logging
import os
import sys
import tempfile
import zipfile
from datetime import datetime, timedelta

import psycopg2

LOG = logging.getLogger("workers.fl_entity_downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    stream=sys.stdout,
)

DATABASE_URL = os.environ.get("DATABASE_URL", "")
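# Public-access credentials published by the Florida Division of Corporations
# for its free bulk-data SFTP service (see the module docstring).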
SFTP_HOST = "sftp.floridados.gov"
SFTP_USER = "Public"
SFTP_PASS = "PubAccess1845!"
# Fixed-width field positions for FL corporation data
# See: https://dos.sunbiz.org/data-definitions/cor.html
# Positions are 0-indexed, (start, end)
FIELDS = {
    "entity_number": (0, 12),
    "entity_name": (12, 200),
    "status": (200, 201),           # A=Active, I=Inactive
    "filing_type": (201, 205),      # FLAL=FL LLC, FORL=Foreign LLC, DOMP=Domestic Profit Corp, etc.
    "address1": (220, 282),
    "city": (344, 372),
    "state": (372, 374),
    "zip": (374, 384),
    "formation_date": (398, 406),   # MMDDYYYY
    "formation_state": (424, 426),  # 2-letter state code for formation jurisdiction
}
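
# Gaps between the listed positions (e.g. 205-220, 282-344) are record fields
# this worker does not use. A generic extraction over the map would look like:
#     record = {name: line[start:end].strip() for name, (start, end) in FIELDS.items()}
# parse_fl_line below inlines the same slices instead.
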
def parse_fl_line(line: str) -> dict | None:
    """Parse one fixed-width line from FL corporation data."""
    if len(line) < 430:
        return None
    entity_number = line[0:12].strip()
    entity_name = line[12:200].strip().upper()
    status_code = line[200:201].strip()
    filing_type = line[201:205].strip()
    city = line[344:372].strip()
    state = line[372:374].strip()
    zip_code = line[374:384].strip()
    # Formation date (MMDDYYYY)
    date_str = line[398:406].strip()
    formation_date = None
    if date_str and len(date_str) == 8:
        try:
            formation_date = datetime.strptime(date_str, "%m%d%Y").strftime("%Y-%m-%d")
        except ValueError:
            pass
    # Formation state (2-letter code; blank on some records)
    formation_state = line[424:426].strip() or None
    if not entity_name or not entity_number:
        return None
    # Normalize status
    status = "ACTIVE" if status_code == "A" else "INACTIVE"
    # Normalize entity type from the filing_type code. Check LP prefixes first:
    # "LP*" and "FLP*" contain "L", so the LLC test would otherwise shadow them.
    entity_type = None
    ft = filing_type.upper()
    if ft.startswith("LP") or ft.startswith("FLP"):
        entity_type = "LP"
    elif "L" in ft:  # FLAL, FORL, etc.
        entity_type = "LLC"
    elif "P" in ft or "C" in ft:  # DOMP, FORP, etc.
        entity_type = "CORPORATION"
    # Foreign (out-of-state) entities file under FO*/FOR* codes and report their
    # home jurisdiction in the record; domestic filings are formed in Florida.
    is_foreign = ft.startswith("FO")
    if not is_foreign:
        formation_state = "FL"
    address = f"{city}, {state} {zip_code}".strip(", ")
    return {
        "entity_name": entity_name,
        "entity_number": entity_number,
        "entity_type": entity_type,
        "status": status,
        "formation_date": formation_date,
        "formation_state": formation_state,
        "jurisdiction": "US_FL",
        "state": "FL",
        "principal_address": address if city else None,
    }
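
# Example with a synthetic record padded to the 430-char minimum (values are
# made up; positions follow FIELDS above):
#     line = ("L21000123456" + "ACME HOLDINGS LLC".ljust(188) + "A" + "FLAL").ljust(430)
#     parse_fl_line(line)
#     # -> {"entity_name": "ACME HOLDINGS LLC", "entity_type": "LLC",
#     #     "status": "ACTIVE", "jurisdiction": "US_FL", ...}
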
def download_daily(days: int = 7, dry_run: bool = False) -> int:
    """Download and parse the last N days of daily diff files."""
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP...")
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = ssh.open_sftp()
    entities: list[dict] = []
    today = datetime.now()
    for i in range(days):
        date = today - timedelta(days=i)
        filename = f"/Public/doc/cor/{date.strftime('%Y%m%d')}c.txt"
        try:
            with sftp.open(filename, "rb") as f:
                for line in f:
                    entity = parse_fl_line(line.decode("latin-1", errors="ignore"))
                    if entity:
                        entities.append(entity)
            LOG.info("  %s: parsed %d cumulative entities", filename, len(entities))
        except FileNotFoundError:
            pass  # No file published for weekends/holidays
        except Exception as exc:
            LOG.warning("  %s: %s", filename, exc)
    sftp.close()
    ssh.close()
    LOG.info("Downloaded %d FL entities from %d daily files", len(entities), days)
    if entities and not dry_run:
        return _upsert(entities)
    return len(entities)

def download_full(dry_run: bool = False) -> int:
    """Download and parse the full quarterly corporation dump."""
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP for full quarterly download...")
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = ssh.open_sftp()
    with tempfile.TemporaryDirectory(prefix="fl_corp_") as tmpdir:
        zip_path = os.path.join(tmpdir, "cordata.zip")
        LOG.info("Downloading cordata.zip (~1.7GB)...")
        sftp.get("/Public/doc/Quarterly/Cor/cordata.zip", zip_path)
        sftp.close()
        ssh.close()
        LOG.info("Extracting ZIP...")
        entities: list[dict] = []
        with zipfile.ZipFile(zip_path, "r") as zf:
            for name in zf.namelist():
                if not name.endswith(".txt"):
                    continue
                LOG.info("  Parsing %s...", name)
                with zf.open(name) as f:
                    for line in f:
                        entity = parse_fl_line(line.decode("latin-1", errors="ignore"))
                        if entity:
                            entities.append(entity)
                            if len(entities) % 500000 == 0:
                                LOG.info("  %d entities parsed...", len(entities))
    LOG.info("Total: %d FL entities parsed", len(entities))
    if entities and not dry_run:
        return _upsert(entities)
    return len(entities)
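
# _upsert assumes entity_cache has a unique constraint on
# (jurisdiction, entity_number); the ON CONFLICT clause below depends on it.
# A minimal sketch of the assumed table (column types are guesses):
#     CREATE TABLE IF NOT EXISTS entity_cache (
#         jurisdiction      text NOT NULL,
#         entity_number     text NOT NULL,
#         entity_name       text,
#         entity_type       text,
#         status            text,
#         formation_date    date,
#         state             text,
#         formation_state   text,
#         principal_address text,
#         source            text,
#         last_synced       timestamptz DEFAULT NOW(),
#         UNIQUE (jurisdiction, entity_number)
#     );
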
def _upsert(entities: list[dict]) -> int:
    """Upsert FL entities into entity_cache."""
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()
    count = 0
    # Deduplicate on the upsert key. Daily files are read newest-first, so the
    # first record seen wins; this also avoids "cannot affect row a second
    # time" errors when a key repeats inside one multi-row INSERT.
    seen: set = set()
    deduped: list = []
    for e in entities:
        key = (e["jurisdiction"], e["entity_number"])
        if key not in seen:
            seen.add(key)
            deduped.append(e)
    LOG.info("Upserting %d entities (deduped from %d)...", len(deduped), len(entities))
    try:
        for batch_start in range(0, len(deduped), 500):
            batch = deduped[batch_start:batch_start + 500]
            values = []
            for e in batch:
                values.append(cur.mogrify(
                    "(%s,%s,%s,%s,%s,%s,%s,%s,%s,'sftp')",
                    (
                        e["jurisdiction"], e["entity_name"], e["entity_number"],
                        e["entity_type"], e["status"], e["formation_date"],
                        e["state"], e.get("formation_state"),
                        e.get("principal_address"),
                    ),
                ).decode())
            sql = f"""
                INSERT INTO entity_cache
                  (jurisdiction, entity_name, entity_number, entity_type, status,
                   formation_date, state, formation_state, principal_address, source)
                VALUES {",".join(values)}
                ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET
                  entity_name = EXCLUDED.entity_name,
                  entity_type = EXCLUDED.entity_type,
                  status = EXCLUDED.status,
                  formation_date = EXCLUDED.formation_date,
                  formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state),
                  principal_address = EXCLUDED.principal_address,
                  last_synced = NOW()
            """
            cur.execute(sql)
            conn.commit()  # Commit per batch so a late failure keeps earlier progress
            count += len(batch)
            if count % 100000 == 0:
                LOG.info("  Upserted %d...", count)
    except Exception as exc:
        LOG.error("DB error: %s", exc)
        conn.rollback()
    finally:
        cur.close()
        conn.close()
    LOG.info("Done: upserted %d FL entities", count)
    return count

def main():
    parser = argparse.ArgumentParser(description="Download Florida Sunbiz corporation data")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--daily", action="store_true", help="Download daily diffs for the last N days (see --days)")
    group.add_argument("--full", action="store_true", help="Download full quarterly dump (~1.7GB)")
    parser.add_argument("--days", type=int, default=7, help="Number of days for daily mode (default: 7)")
    parser.add_argument("--dry-run", action="store_true", help="Parse but don't save to DB")
    args = parser.parse_args()
    if not DATABASE_URL and not args.dry_run:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)
    if args.daily:
        count = download_daily(days=args.days, dry_run=args.dry_run)
    else:
        count = download_full(dry_run=args.dry_run)
    LOG.info("Complete: %d entities", count)


if __name__ == "__main__":
    main()