new-site/scripts/workers/fcc_rmd_removed_scraper.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

222 lines
7.4 KiB
Python

"""
fcc_rmd_removed_scraper.py — Research contact emails for FCC RMD-removed carriers.
Attempts to find a contact email for each carrier in fcc_rmd_removed by:
1. Checking if the company still has an entry in our fcc_rmd table (some
were provisionally reinstated and scraped).
2. Querying the company name via the FCC's CORES system for FRN-linked contacts.
3. Web-searching for the company's domain and scraping contact pages.
Usage:
python -m workers.fcc_rmd_removed_scraper
Environment variables:
DATABASE_URL PostgreSQL connection string
"""
from __future__ import annotations
import logging
import os
import re
import sys
import time
from typing import Optional
from urllib.parse import urljoin, urlparse
import psycopg2
import psycopg2.extras
import requests
LOG = logging.getLogger("workers.fcc_rmd_removed_scraper")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
stream=sys.stdout,
)
DATABASE_URL = os.environ.get("DATABASE_URL", "")
DELAY = 1.5 # seconds between web requests
# ---------------------------------------------------------------------------
# Phase 1 — Email research
# ---------------------------------------------------------------------------
def _fcc_cores_lookup(frn: str, session: requests.Session) -> Optional[str]:
    """Query FCC CORES for a contact email by FRN.

    Fetches the public CORES registration page for *frn* and returns the
    first non-government email address found in the HTML, lower-cased.
    Returns ``None`` when the request fails, the page is not a 200, or no
    suitable address is present (best-effort lookup, never raises).
    """
    url = f"https://apps.fcc.gov/coresWeb/publicHome.do?frn={frn}"
    try:
        r = session.get(url, timeout=15)
        if r.status_code != 200:
            # Error pages can still contain boilerplate addresses; skip them.
            return None
        emails = re.findall(
            r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}", r.text
        )
        for e in emails:
            dom = e.split("@")[1].lower()
            # Skip government mailboxes. The previous substring test
            # ('"gov" not in dom') also rejected legitimate domains such
            # as "governance.com"; match the .gov TLD explicitly instead.
            if not dom.endswith(".gov") and dom != "fcc.gov":
                return e.lower()
    except requests.RequestException:
        # Network/TLS/timeout failures simply mean "no email found".
        pass
    return None
def _guess_domain(business_name: str) -> list[str]:
"""Generate candidate domain guesses from a business name."""
slug = re.sub(r"[^a-z0-9]+", "", business_name.lower().split(" llc")[0].split(" inc")[0].split(" corp")[0].split(" ltd")[0])
slug2 = re.sub(r"[^a-z0-9]+", "-", business_name.lower().split(" llc")[0].split(" inc")[0].strip().strip("-"))
candidates = [
f"{slug}.com",
f"{slug2}.com",
f"{slug}.net",
f"{slug2}.net",
f"{slug}.io",
]
return list(dict.fromkeys(candidates)) # deduplicated
def _scrape_contact_from_website(domain: str, session: requests.Session) -> Optional[str]:
    """Try to find a contact email from a company website.

    Walks the home page plus common contact/about paths on *domain* and
    scrapes each page for email addresses. Prefers an address hosted on a
    domain containing the company's own name; falls back to any
    non-generic, non-government address. Returns the email lower-cased,
    or ``None`` if nothing suitable is found. Never raises.
    """
    # Hoisted out of the loop: both are loop-invariant.
    email_re = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
    # Generic mailboxes that rarely reach a real person.
    noise_locals = {"noreply", "no-reply", "info", "support", "hello",
                    "sales", "webmaster", "admin", "privacy", "legal",
                    "abuse", "postmaster", "hostmaster"}
    site_key = domain.split(".")[0]  # e.g. "acme" from "acme.com"
    for path in ("", "/contact", "/contact-us", "/about", "/about-us"):
        url = f"https://{domain}{path}"
        try:
            r = session.get(url, timeout=10, allow_redirects=True)
            if r.status_code != 200:
                continue
            emails = email_re.findall(r.text)
            # First pass: prefer emails on the company's own domain.
            for e in emails:
                local, _, dom = e.lower().partition("@")
                if local not in noise_locals and site_key in dom:
                    return e.lower()
            # Fallback: any non-noise, non-government email on the page.
            for e in emails:
                local, _, dom = e.lower().partition("@")
                # .gov TLD check replaces the former over-broad substring
                # test ('"gov" not in dom'), which rejected domains like
                # "governance.net".
                if local not in noise_locals and not dom.endswith(".gov"):
                    return e.lower()
        except requests.RequestException:
            # Unreachable hosts / TLS errors are expected for guessed domains.
            pass
        time.sleep(0.3)  # be polite between page fetches
    return None
def run_email_research(conn: psycopg2.extensions.connection) -> int:
    """Research contact emails for carriers in ``fcc_rmd_removed``.

    For each carrier without a ``contact_email``, tries in order:

    1. A matching, already-scraped record in our ``fcc_rmd`` table.
    2. An FCC CORES lookup by FRN.
    3. Scraping contact pages on guessed company domains.

    Each row is updated and committed individually so progress survives a
    crash mid-run. Rows whose ``business_name`` starts with ``[`` are
    skipped (presumably placeholder/redacted entries — confirm upstream).

    :param conn: open psycopg2 connection (autocommit off; this function
        issues its own per-row commits).
    :returns: number of emails found.
    """
    found = 0
    session = requests.Session()
    # A browser-like UA avoids trivial bot blocks on carrier websites.
    session.headers.update({
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        )
    })
    # Context manager guarantees the cursor is closed (was leaked before).
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
            FROM fcc_rmd_removed r
            WHERE r.contact_email IS NULL
              AND r.business_name NOT LIKE '[%'
            ORDER BY r.id
        """)
        rows = cur.fetchall()
        LOG.info("Researching emails for %d removed carriers …", len(rows))
        for i, row in enumerate(rows, 1):
            rid = row["id"]
            name = row["business_name"]
            rmd_num = row["rmd_number"]
            frn = row["frn"]
            LOG.info("[%d/%d] Researching: %s", i, len(rows), name)
            email: Optional[str] = None
            notes: list[str] = []

            # Strategy 1: check our existing fcc_rmd scraped data.
            if rmd_num:
                cur.execute(
                    "SELECT contact_email FROM fcc_rmd WHERE rmd_number = %s AND contact_email IS NOT NULL",
                    (rmd_num,)
                )
                match = cur.fetchone()
                if match:
                    email = match["contact_email"]
                    # Plain string (the original used an f-string with no
                    # placeholders).
                    notes.append("Found in fcc_rmd table (previously scraped from FCC portal)")
                    LOG.info(" -> [fcc_rmd] %s", email)

            # Strategy 2: FCC CORES FRN lookup.
            if not email and frn:
                email = _fcc_cores_lookup(frn, session)
                if email:
                    notes.append(f"Found via FCC CORES FRN lookup ({frn})")
                    LOG.info(" -> [CORES] %s", email)
                time.sleep(DELAY)

            # Strategy 3: website scrape from guessed domains.
            if not email:
                for domain in _guess_domain(name):
                    email = _scrape_contact_from_website(domain, session)
                    if email:
                        notes.append(f"Found via website scrape: {domain}")
                        LOG.info(" -> [website %s] %s", domain, email)
                        break
                    time.sleep(DELAY)

            # Record the outcome; commit per-row so partial runs persist.
            if email:
                cur.execute("""
                    UPDATE fcc_rmd_removed
                    SET contact_email = %s,
                        email_researched = TRUE,
                        email_research_notes = %s,
                        updated_at = now()
                    WHERE id = %s
                """, (email, "; ".join(notes), rid))
                conn.commit()
                found += 1
            else:
                cur.execute("""
                    UPDATE fcc_rmd_removed
                    SET email_researched = TRUE,
                        email_research_notes = 'No email found via any method',
                        updated_at = now()
                    WHERE id = %s
                """, (rid,))
                conn.commit()
                LOG.info(" -> no email found")
    LOG.info("Email research complete — %d/%d emails found", found, len(rows))
    return found
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
def main() -> None:
    """Entrypoint: connect to Postgres and run the email-research pass."""
    if not DATABASE_URL:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)
    db = psycopg2.connect(DATABASE_URL)
    db.autocommit = False  # the worker issues explicit per-row commits
    try:
        run_email_research(db)
    finally:
        # Always release the connection, even if research raised.
        db.close()
    LOG.info("Done.")


if __name__ == "__main__":
    main()