Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
299 lines
9.9 KiB
Python
299 lines
9.9 KiB
Python
"""
|
|
fl_entity_downloader.py — Download Florida Sunbiz corporation data via SFTP.
|
|
|
|
Florida provides free bulk data via SFTP:
|
|
Host: sftp.floridados.gov
|
|
User: Public
|
|
Pass: PubAccess1845!
|
|
|
|
Data is fixed-width text (see https://dos.sunbiz.org/data-definitions/cor.html).
|
|
Quarterly full dump at doc/Quarterly/Cor/cordata.zip (~1.7GB compressed, ~4M entities).
|
|
Daily diffs at doc/cor/YYYYMMDDc.txt.
|
|
|
|
Strategy: Download the latest daily diffs (faster than the full quarterly).
|
|
For initial load: download the full quarterly dump.
|
|
|
|
Usage:
|
|
# Download daily diffs for the past 7 days:
|
|
python -m workers.fl_entity_downloader --daily
|
|
|
|
# Download full quarterly dump (slow, ~1.7GB):
|
|
python -m workers.fl_entity_downloader --full
|
|
|
|
# Dry run (parse but don't save):
|
|
python -m workers.fl_entity_downloader --daily --dry-run
|
|
|
|
Environment:
|
|
DATABASE_URL PostgreSQL connection string
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
import zipfile
|
|
from datetime import datetime, timedelta
|
|
|
|
import psycopg2
|
|
|
|
# Module logger; basicConfig here keeps the worker runnable standalone
# (python -m workers.fl_entity_downloader) with timestamped stdout logs.
LOG = logging.getLogger("workers.fl_entity_downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    stream=sys.stdout,
)

# PostgreSQL connection string; empty means DB writes are unavailable
# (main() enforces this unless --dry-run is given).
DATABASE_URL = os.environ.get("DATABASE_URL", "")

# Public-access SFTP credentials published by the FL Department of State;
# intentionally not secret (see module docstring).
SFTP_HOST = "sftp.floridados.gov"
SFTP_USER = "Public"
SFTP_PASS = "PubAccess1845!"

# Fixed-width field positions for FL corporation data
# See: https://dos.sunbiz.org/data-definitions/cor.html
# Positions are 0-indexed, (start, end)
# NOTE(review): parse_fl_line hardcodes these same slice positions inline;
# keep the two in sync when the upstream layout changes.
FIELDS = {
    "entity_number": (0, 12),
    "entity_name": (12, 200),
    "status": (200, 201),  # A=Active, I=Inactive
    "filing_type": (201, 205),  # FLAL=FL LLC, FORL=Foreign LLC, DOMP=Domestic Profit Corp, etc.
    "address1": (220, 282),
    "city": (344, 372),
    "state": (372, 374),
    "zip": (374, 384),
    "formation_date": (398, 406),  # MMDDYYYY
    "formation_state": (424, 426),  # 2-letter state code for formation jurisdiction
}
|
|
|
|
|
|
def parse_fl_line(line: str) -> dict | None:
|
|
"""Parse one fixed-width line from FL corporation data."""
|
|
if len(line) < 430:
|
|
return None
|
|
|
|
entity_number = line[0:12].strip()
|
|
entity_name = line[12:200].strip().upper()
|
|
status_code = line[200:201].strip()
|
|
filing_type = line[201:205].strip()
|
|
city = line[344:372].strip()
|
|
state = line[372:374].strip()
|
|
zip_code = line[374:384].strip()
|
|
|
|
# Formation date (MMDDYYYY)
|
|
date_str = line[398:406].strip()
|
|
formation_date = None
|
|
if date_str and len(date_str) == 8:
|
|
try:
|
|
formation_date = datetime.strptime(date_str, "%m%d%Y").strftime("%Y-%m-%d")
|
|
except ValueError:
|
|
pass
|
|
|
|
# Formation state
|
|
formation_state = line[424:426].strip() or None
|
|
if formation_state == "FL":
|
|
formation_state = "FL"
|
|
|
|
if not entity_name or not entity_number:
|
|
return None
|
|
|
|
# Normalize status
|
|
status = "ACTIVE" if status_code == "A" else "INACTIVE"
|
|
|
|
# Normalize entity type from filing_type code
|
|
entity_type = None
|
|
ft = filing_type.upper()
|
|
if "L" in ft: # FLAL, FORL, etc.
|
|
entity_type = "LLC"
|
|
elif "P" in ft or "C" in ft: # DOMP, FORP, etc.
|
|
entity_type = "CORPORATION"
|
|
elif ft.startswith("LP") or ft.startswith("FLP"):
|
|
entity_type = "LP"
|
|
|
|
# Determine if foreign
|
|
is_foreign = ft.startswith("FOR") or ft.startswith("FO")
|
|
if is_foreign and not formation_state:
|
|
formation_state = None # Unknown foreign origin
|
|
|
|
address = f"{city}, {state} {zip_code}".strip(", ")
|
|
|
|
return {
|
|
"entity_name": entity_name,
|
|
"entity_number": entity_number,
|
|
"entity_type": entity_type,
|
|
"status": status,
|
|
"formation_date": formation_date,
|
|
"formation_state": formation_state if formation_state != "FL" or not is_foreign else formation_state,
|
|
"jurisdiction": "US_FL",
|
|
"state": "FL",
|
|
"principal_address": address if city else None,
|
|
}
|
|
|
|
|
|
def download_daily(days: int = 7, dry_run: bool = False) -> int:
    """Fetch and parse the most recent ``days`` daily diff files via SFTP.

    Missing remote files (weekends/holidays) are skipped silently; other
    per-file errors are logged and skipped. Parsed entities are upserted
    to the database unless ``dry_run`` is set.

    Returns the upserted row count, or the parsed count on a dry run.
    """
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP...")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = client.open_sftp()

    parsed: list[dict] = []
    now = datetime.now()

    for offset in range(days):
        day = now - timedelta(days=offset)
        remote_path = f"/Public/doc/cor/{day.strftime('%Y%m%d')}c.txt"
        try:
            with sftp.open(remote_path, "rb") as fh:
                for raw in fh:
                    record = parse_fl_line(raw.decode("latin-1", errors="ignore"))
                    if record:
                        parsed.append(record)
            LOG.info(" %s: parsed %d cumulative entities", remote_path, len(parsed))
        except FileNotFoundError:
            pass  # No file for weekends/holidays
        except Exception as exc:
            LOG.warning(" %s: %s", remote_path, exc)

    sftp.close()
    client.close()

    LOG.info("Downloaded %d FL entities from %d daily files", len(parsed), days)

    if parsed and not dry_run:
        return _upsert(parsed)
    return len(parsed)
|
|
|
|
|
|
def download_full(dry_run: bool = False) -> int:
    """Fetch and parse the full quarterly corporation dump via SFTP.

    Downloads cordata.zip (~1.7GB) into a temporary directory, then parses
    every ``.txt`` member of the archive. Parsed entities are upserted to
    the database unless ``dry_run`` is set.

    Returns the upserted row count, or the parsed count on a dry run.
    """
    import paramiko

    LOG.info("Connecting to FL Sunbiz SFTP for full quarterly download...")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(SFTP_HOST, username=SFTP_USER, password=SFTP_PASS)
    sftp = client.open_sftp()

    parsed: list[dict] = []
    with tempfile.TemporaryDirectory(prefix="fl_corp_") as tmpdir:
        zip_path = os.path.join(tmpdir, "cordata.zip")
        LOG.info("Downloading cordata.zip (~1.7GB)...")
        sftp.get("/Public/doc/Quarterly/Cor/cordata.zip", zip_path)
        # Close the connection before the (long) local parse.
        sftp.close()
        client.close()

        LOG.info("Extracting ZIP...")
        with zipfile.ZipFile(zip_path, "r") as archive:
            for member in archive.namelist():
                if not member.endswith(".txt"):
                    continue
                LOG.info(" Parsing %s...", member)
                with archive.open(member) as fh:
                    for raw in fh:
                        record = parse_fl_line(raw.decode("latin-1", errors="ignore"))
                        if record:
                            parsed.append(record)
                            # Progress heartbeat every 500k records.
                            if len(parsed) % 500000 == 0:
                                LOG.info(" %d entities parsed...", len(parsed))

    LOG.info("Total: %d FL entities parsed", len(parsed))

    if parsed and not dry_run:
        return _upsert(parsed)
    return len(parsed)
|
|
|
|
|
|
def _upsert(entities: list[dict]) -> int:
    """Upsert FL entities into entity_cache.

    Deduplicates on (jurisdiction, entity_number) keeping the first
    occurrence, writes in batches of 500 rows via a multi-VALUES INSERT,
    and commits every 100k rows to bound transaction size. On any DB
    error, logs, rolls back the open transaction, and returns the count
    of rows attempted so far.
    """
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()
    count = 0

    # Drop duplicate records (daily diffs can repeat an entity).
    seen: set = set()
    deduped: list = []
    for entity in entities:
        key = (entity["jurisdiction"], entity["entity_number"])
        if key in seen:
            continue
        seen.add(key)
        deduped.append(entity)

    LOG.info("Upserting %d entities (deduped from %d)...", len(deduped), len(entities))

    try:
        for start in range(0, len(deduped), 500):
            batch = deduped[start:start + 500]
            # mogrify renders each row tuple with proper SQL escaping;
            # the rendered fragments are joined into one VALUES list.
            rows = [
                cur.mogrify(
                    "(%s,%s,%s,%s,%s,%s,%s,%s,%s,'sftp')",
                    (
                        e["jurisdiction"], e["entity_name"], e["entity_number"],
                        e["entity_type"], e["status"], e["formation_date"],
                        e["state"], e.get("formation_state"),
                        e.get("principal_address"),
                    ),
                ).decode()
                for e in batch
            ]

            sql = f"""
            INSERT INTO entity_cache
            (jurisdiction, entity_name, entity_number, entity_type, status,
            formation_date, state, formation_state, principal_address, source)
            VALUES {",".join(rows)}
            ON CONFLICT (jurisdiction, entity_number) DO UPDATE SET
            entity_name = EXCLUDED.entity_name,
            entity_type = EXCLUDED.entity_type,
            status = EXCLUDED.status,
            formation_date = EXCLUDED.formation_date,
            formation_state = COALESCE(EXCLUDED.formation_state, entity_cache.formation_state),
            principal_address = EXCLUDED.principal_address,
            last_synced = NOW()
            """
            cur.execute(sql)
            count += len(batch)

            # 100000 is a multiple of the batch size, so this fires
            # exactly once per 100k rows.
            if count % 100000 == 0:
                LOG.info(" Upserted %d...", count)
                conn.commit()

        conn.commit()
    except Exception as exc:
        LOG.error("DB error: %s", exc)
        conn.rollback()
    finally:
        cur.close()
        conn.close()

    LOG.info("Done: upserted %d FL entities", count)
    return count
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch daily or full download."""
    parser = argparse.ArgumentParser(description="Download Florida Sunbiz corporation data")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--daily", action="store_true", help="Download last 7 days of daily diffs")
    group.add_argument("--full", action="store_true", help="Download full quarterly dump (~1.7GB)")
    parser.add_argument("--days", type=int, default=7, help="Number of days for daily mode (default: 7)")
    parser.add_argument("--dry-run", action="store_true", help="Parse but don't save to DB")
    args = parser.parse_args()

    # Persisting requires a database connection; dry runs do not.
    if not args.dry_run and not DATABASE_URL:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)

    if args.daily:
        total = download_daily(days=args.days, dry_run=args.dry_run)
    else:
        total = download_full(dry_run=args.dry_run)

    LOG.info("Complete: %d entities", total)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|