Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
418 lines
17 KiB
Python
418 lines
17 KiB
Python
"""
|
|
Flag fcc_rmd and fcc_rmd_removed rows where the contact email appears to belong
|
|
to an outside law firm rather than the carrier itself.
|
|
|
|
Detection rules (any one is sufficient):
|
|
|
|
Rule 1 — Law firm domain keywords
|
|
The email domain contains: law, llp, pllc, legal, counsel, atty, attorney,
|
|
solicitor, barrister, advocates, chambers
|
|
|
|
Rule 2 — Attorney title + domain mismatch
|
|
contact_title contains attorney/counsel/esq/of counsel/solicitor AND the email
|
|
domain shares no significant token with the business name
|
|
|
|
Rule 3 — Known FCC telecom law firm domains
|
|
Email domain is a recognised Washington DC / telecom regulatory law firm
|
|
|
|
Rule 4 — Domain belongs to a clearly different company
|
|
Email domain shares zero tokens with the business name AND contact_title or
|
|
contact_department contains legal/regulatory/counsel/attorney
|
|
AND the domain is not a generic provider (gmail, yahoo, hotmail, etc.)
|
|
"""
|
|
import re, psycopg2, psycopg2.extras
|
|
|
|
PG_DSN = 'postgresql://pw:pw_dev_2026@api-postgres:5432/performancewest'
|
|
|
|
# ── Rule 1: domains that are intrinsically law firm domains ──────────────────
|
|
LAW_DOMAIN_KEYWORDS = [
|
|
'law', 'llp', 'pllc', 'legalgroup', 'legalteam', 'legalcounsel',
|
|
'counsel', 'atty', 'attorney', 'solicitor', 'barrister',
|
|
'advocates', 'chambers', 'lawfirm', 'lawgroup', 'lawoffice',
|
|
]
|
|
|
|
# ── Rule 3: known telecom / FCC regulatory law firm domains ─────────────────
|
|
KNOWN_LAW_FIRM_DOMAINS = {
|
|
# Washington DC telecom regulatory firms
|
|
'wiltshirelaw.com', 'wiltshire.com',
|
|
'kelleydrye.com', 'wiley.law', 'wileyrein.com',
|
|
'harrismoure.com', 'harriswiltshire.com',
|
|
'lermansentry.com', 'lermansentry.com',
|
|
'compliancetm.com',
|
|
'fcclaw.com',
|
|
'telecommunicationslaw.com',
|
|
'blooston.com',
|
|
'posternak.com',
|
|
'commlaw.com',
|
|
'fhhlaw.com',
|
|
'fhhlaw.net',
|
|
'ksmcommunications.com',
|
|
'stinson.com',
|
|
'vonlehman.com',
|
|
'perkinscoie.com',
|
|
'hklaw.com',
|
|
'garveylaw.com',
|
|
'garveylaw.net',
|
|
'taftlaw.com',
|
|
'pillsburylaw.com',
|
|
'watt.com',
|
|
'kmarlaw.com',
|
|
'mcnamarlaw.com',
|
|
'lawt.com',
|
|
'lawmoss.com',
|
|
'adlawfirm.com',
|
|
'adlawgroup.com',
|
|
'jasslaw.com',
|
|
'cldlaw.com',
|
|
'butlersnow.com',
|
|
'hklaw.com',
|
|
'lathropgpm.com',
|
|
'lathropgage.com',
|
|
'bclplaw.com',
|
|
'sheppardmullin.com',
|
|
'hklaw.com',
|
|
'rubin-levin.com',
|
|
'tillmanlaw.com',
|
|
'tillmanlaw.net',
|
|
'taftlaw.com',
|
|
'tillmanlaw.com',
|
|
'sgrlaw.com', # Swidler Berlin
|
|
'dkslaw.com',
|
|
'dkslaw.net',
|
|
'fhhlaw.com',
|
|
'fhhlaw.net',
|
|
'bracewell.com',
|
|
'morganlewis.com',
|
|
'steptoe.com',
|
|
'millernash.com',
|
|
'garveylaw.com',
|
|
'hklaw.com',
|
|
'arnoldporter.com',
|
|
'arnoldporter.net',
|
|
'dwtlaw.com', # Davis Wright Tremaine
|
|
'dwt.com',
|
|
'gklaw.com',
|
|
'jenner.com',
|
|
'macpie.com',
|
|
'wsgr.com', # Wilson Sonsini
|
|
'cooley.com',
|
|
'fbm.com', # Fletcher Heald & Hildreth
|
|
'fhhlaw.com',
|
|
'klgates.com',
|
|
'klgates.net',
|
|
'fisherphillips.com',
|
|
'mintz.com',
|
|
'mintzlaw.com',
|
|
'rcatlp.com', # Regulatory Comm/Advocacy
|
|
'wilkinsonbarker.com',
|
|
'wb-law.com',
|
|
'lbblawyers.com',
|
|
'nbclaw.net',
|
|
'schiffhardin.com',
|
|
'sidley.com',
|
|
'skadden.com',
|
|
'whitecase.com',
|
|
'willkie.com',
|
|
'winston.com',
|
|
'wlrk.com',
|
|
'wlrlaw.com',
|
|
'zuckerman.com',
|
|
'zplaw.com',
|
|
'goldsteinlaw.net',
|
|
'kaplanlaw.com',
|
|
'mkattorneys.com',
|
|
'mklaw.com',
|
|
'mkglaw.com',
|
|
'tankleflaw.com',
|
|
'tlglaw.com',
|
|
'claytonutz.com',
|
|
'hatchmott.com',
|
|
'hatchmottlaw.com',
|
|
}
|
|
|
|
# ── Generic / personal email providers — NOT outside counsel ────────────────
|
|
GENERIC_PROVIDERS = {
|
|
'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com',
|
|
'icloud.com', 'protonmail.com', 'proton.me', 'me.com', 'mac.com',
|
|
'live.com', 'msn.com', 'comcast.net', 'att.net', 'verizon.net',
|
|
}
|
|
|
|
# Attorney/counsel title keywords
|
|
ATTORNEY_TITLE_KEYWORDS = [
|
|
'attorney', 'counsel', 'esq', 'esquire', 'solicitor',
|
|
'barrister', 'of counsel', 'partner', 'associate', # partner/associate in legal context
|
|
'paralegal', 'legal advisor', 'legal counsel',
|
|
]
|
|
|
|
LEGAL_DEPT_KEYWORDS = [
|
|
'legal', 'outside counsel', 'law department', 'general counsel',
|
|
'regulatory affairs', 'regulatory counsel',
|
|
]
|
|
|
|
|
|
# ── Merger / parent-company domain map ──────────────────────────────────────
|
|
# Maps an email domain to a set of brand/company tokens it legitimately covers.
|
|
# Any carrier whose name contains ANY of those tokens is treated as an
|
|
# internal contact (not outside counsel) even though the domain doesn't match
|
|
# the carrier name literally.
|
|
#
|
|
# Built from the actual fcc_rmd data — high-count shared domains where the
|
|
# contact is clearly a parent/acquirer/successor, not outside counsel.
|
|
MERGER_DOMAIN_TOKENS: dict[str, set[str]] = {
|
|
# Verizon — absorbed ALLTEL, GTE Mobilnet, MCI/WorldCom, XO, Straight Path,
|
|
# Bell Atlantic Mobile, AirTouch, Rural Cellular (Unicel), Visible
|
|
'verizon.com': {'verizon','alltel','cellco','gte','mci','mcimetro','xo',
|
|
'bellatlantic','airtouch','ruralcellular','straight','visible',
|
|
'metacomm','metropolitan','commsnet','commnet'},
|
|
# TDS Telecom — operates under hundreds of local brand names
|
|
'tdstelecom.com': {'tds','telephone','telecom','telcom'},
|
|
# Lumen / CenturyLink / Level3 / Qwest
|
|
'lumen.com': {'lumen','centurylink','level3','qwest','embarq','tw telecom',
|
|
'tw','broadwing','genuity','global crossing'},
|
|
'centurylink.com': {'centurylink','qwest','embarq','savvis','level3'},
|
|
'level3.com': {'level3','tw telecom','global crossing','broadwing','genuity',
|
|
'looking glass','wiltel'},
|
|
'lumentech.com': {'lumen','centurylink','level3','qwest'},
|
|
# AT&T — absorbed BellSouth, Cingular, SBC, Alascom, DirecTV, Cricket,
|
|
# Teleport, Time Warner Telecom
|
|
'att.com': {'att','bellsouth','cingular','sbc','pacific bell','nevada bell',
|
|
'illinois bell','southwestern bell','indiana bell','michigan bell',
|
|
'wisconsin bell','alascom','cricket','directv','teleport',
|
|
'firstnet','warnermedia'},
|
|
# T-Mobile — absorbed Sprint, Nextel, MetroPCS, Boost, Assurance
|
|
't-mobile.com': {'tmobile','sprint','nextel','metropcs','boost','assurance',
|
|
'iiphone','sievert','clearwire'},
|
|
'sprint.com': {'sprint','nextel','clearwire','virgin mobile','boost'},
|
|
# Comcast / Xfinity
|
|
'comcast.com': {'comcast','xfinity','nbcuniversal'},
|
|
# Charter / Spectrum — absorbed Time Warner Cable, Bright House
|
|
'charter.com': {'charter','spectrum','time warner','brighthouse','cablevision'},
|
|
'spectrum.com': {'spectrum','charter','time warner','brighthouse'},
|
|
# Cox Communications
|
|
'cox.com': {'cox'},
|
|
# Altice / Optimum / Suddenlink
|
|
'altice.net': {'altice','optimum','suddenlink','cablevision','lightpath'},
|
|
'alticeusa.com': {'altice','optimum','suddenlink','cablevision'},
|
|
# Frontier — absorbed AT&T wireline, Verizon wireline, Citizens Telephone
|
|
'frontier.com': {'frontier','citizens','commonwealth','ctsi'},
|
|
# Brightspeed — spun out of Lumen
|
|
'brightspeed.com': {'brightspeed','centurylink','qwest'},
|
|
# Consolidated Communications — absorbed FairPoint, North Pittsburgh
|
|
'consolidated.com': {'consolidated','fairpoint','north pittsburgh','surewest'},
|
|
'fairpoint.com': {'fairpoint','northern new england telephone'},
|
|
# Windstream — absorbed NTELOS, Iowa Telecom, NuVox, CT Communications
|
|
'windstream.com': {'windstream','ntelos','iowa telecom','nuvox','conestoga',
|
|
'valor telecom','alltel wireline'},
|
|
# Cincinnati Bell / Altafiber
|
|
'altafiber.com': {'altafiber','cincinnati bell','cintec','hawaiian telcom'},
|
|
'cinbell.com': {'cincinnati bell','altafiber'},
|
|
# WideOpenWest (WOW!)
|
|
'wowway.com': {'wowway','wideopenwest','knology'},
|
|
# Mediacom
|
|
'mediacomcc.com': {'mediacom'},
|
|
# Armstrong / GoNetspeed
|
|
'agoc.com': {'armstrong'},
|
|
'gonetspeed.com': {'gonetspeed','otelco','blountsville','brindlee','granby',
|
|
'hopper','midmaine','pine tree','saco','shoreham','trumansburg',
|
|
'war telephone','finger lakes','ontario telephone'},
|
|
# Sparklight / Cable One
|
|
'sparklight.biz': {'sparklight','cable one','cableone','cobridge','comsouh',
|
|
'fidelity','hargray','bluffton'},
|
|
# Sinch — absorbed Inteliquent, Onvoy, ANPI, Voyant
|
|
'sinch.com': {'sinch','inteliquent','onvoy','anpi','voyant'},
|
|
# NetSol / NelNet / netins
|
|
'netins.net': {'netins','iowa'}, # Iowa rural telcos on shared platform
|
|
# Nuvera / New Ulm Telecom platform
|
|
'nuvera.net': {'nuvera','hutchinson','new ulm','peoples','scott rice',
|
|
'sleepy eye','western telephone'},
|
|
# ATNI — American Telephone & Internet parent
|
|
'atni.com': {'atni','commnet','essextel','ntua','sacred wind','sw dineh',
|
|
'viya','vitelco','innovative telephone','virgin islands'},
|
|
# TEC / Telephone and Data Systems spinoffs
|
|
'tec.com': {'tec','bay springs','crockett','national telephone','roanoke',
|
|
'peoples telephone','west tennessee'},
|
|
# Comporium group
|
|
'comporium.com': {'comporium','citizens','fort mill','lancaster','pbt'},
|
|
# MBO / Cross Telecom group
|
|
'mbo.one': {'mbo','cross','cim tel','optictel','pottawatomie'},
|
|
# BevComm group (Minnesota rural)
|
|
'bevcomm.com': {'bevcomm','blue earth','cannon valley','easton','eckles',
|
|
'hager','indianhead','lonsdale','pine island'},
|
|
# Townes Telecom group
|
|
'townes.net': {'townes','choctaw','haxtun','mokan','northeast florida',
|
|
'oregon farmers','pymatuning','walnut hill'},
|
|
# Dobson / Pinnacle group
|
|
'dobson.net': {'dobson','lavaca','mcloud','pinnacle','vantage'},
|
|
# Ritter Communications group
|
|
'rittercommunications.com': {'ritter','millington','mtel','tri county'},
|
|
# EATEL group
|
|
'eatel.com': {'eatel','east ascension','reserve','advanced tel','sjii','tlx'},
|
|
# BCM One — absorbed SIP.US, CoreDial, Skyswitch, nexVortex
|
|
'bcmone.com': {'bcmone','coredial','payg','skyswitch','sipus','nexvortex'},
|
|
# Motorola Solutions — owns SwiftReach (emergency notification, not telecom carrier)
|
|
'motorolasolutions.com': {'swiftreach'},
|
|
# Claro Latin America (América Móvil subsidiaries)
|
|
'claro.com.ar': {'claro','amx','telmex'},
|
|
'claro.cr': {'claro'},
|
|
# Tata Communications global
|
|
'tatacommunications.com': {'tata'},
|
|
# Deutsche Telekom / T-Systems
|
|
't-systems.com': {'t-systems','deutsche telekom','telekom'},
|
|
# Amazon / AWS telecom
|
|
'amazon.com': {'amazon','amcs','aws'},
|
|
# Cisco / BroadSoft
|
|
'cisco.com': {'cisco','broadsoft'},
|
|
# EchoStar / Hughes
|
|
'echostar.com': {'echostar','hughes','dish'},
|
|
# Five9 (cloud contact center)
|
|
'five9.com': {'five9'},
|
|
# 8x8
|
|
'8x8.com': {'8x8'},
|
|
# Mitel
|
|
'mitel.com': {'mitel','shoretel'},
|
|
# GoDaddy (domain/hosting + voice services)
|
|
'godaddy.com': {'godaddy'},
|
|
# Telstra US
|
|
'telstra.com': {'telstra'},
|
|
# Highline / various rural
|
|
'highlinefast.com': {'highline','dalton','elsie','livingston','rye telephone',
|
|
'south park','waverly'},
|
|
# CentraNet
|
|
'centracom.com': {'centracom','bear lake','central utah','skyline','central telcom'},
|
|
# HCI / hcinet rural group
|
|
'hcinet.net': {'hcinet','clara city','fort randall','sacred heart',
|
|
'starbuck','zumbrota'},
|
|
# Fastwyre — parent/brand for Cameron Telephone, KLM, Blair, Mukluk, TelAlaska, etc.
|
|
'fastwyre.com': {'fastwyre','cameron','klm','blair','mukluk','telalaska',
|
|
'lbh','elizabeth','interior','eastern nebraska','rock county',
|
|
'huntel','holway','moundville','arlington'},
|
|
# Comcast — acquired Masergy in 2021
|
|
'comcast.com': {'comcast','xfinity','masergy','nbcuniversal'},
|
|
}
|
|
|
|
|
|
def tokenise(name: str) -> set:
|
|
"""Return set of lowercase alpha tokens ≥4 chars from a business name."""
|
|
return {t for t in re.findall(r'[a-z]{4,}', name.lower())
|
|
if t not in {'corp', 'llcc', 'corp', 'inc.', 'llp.', 'pllc', 'dba.',
|
|
'the', 'and', 'for', 'with', 'from', 'that', 'this',
|
|
'company', 'communications', 'telephone', 'services',
|
|
'systems', 'solutions', 'network', 'networks',
|
|
'technologies', 'technology', 'limited', 'group'}}
|
|
|
|
|
|
def is_outside_counsel(email: str, title: str, dept: str, biz_name: str) -> bool:
|
|
if not email or '@' not in email:
|
|
return False
|
|
|
|
domain = email.split('@')[1].lower()
|
|
|
|
# Rule 1: law firm domain keyword
|
|
for kw in LAW_DOMAIN_KEYWORDS:
|
|
if kw in domain:
|
|
return True
|
|
|
|
# Rule 3: known law firm domain
|
|
if domain in KNOWN_LAW_FIRM_DOMAINS:
|
|
return True
|
|
|
|
# Skip generic providers for rules 2 & 4
|
|
if domain in GENERIC_PROVIDERS:
|
|
return False
|
|
|
|
# Merger check: domain is a known parent/acquirer that legitimately covers
|
|
# this carrier — treat as internal even if names don't literally match
|
|
if domain in MERGER_DOMAIN_TOKENS:
|
|
covered_tokens = MERGER_DOMAIN_TOKENS[domain]
|
|
# Strip punctuation for matching (e.g. "K.L.M." → "klm")
|
|
biz_lower = re.sub(r'[^a-z0-9 ]', ' ', biz_name.lower())
|
|
if any(tok in biz_lower for tok in covered_tokens):
|
|
return False
|
|
# Domain is in the merger map but carrier name doesn't match any known
|
|
# subsidiary — still could be internal (large conglomerates), so only
|
|
# flag if we also have an attorney title signal
|
|
title_lc = (title or '').lower()
|
|
dept_lc = (dept or '').lower()
|
|
if not any(kw in title_lc for kw in ATTORNEY_TITLE_KEYWORDS) and \
|
|
not any(kw in dept_lc for kw in LEGAL_DEPT_KEYWORDS):
|
|
return False
|
|
|
|
title_lc = (title or '').lower()
|
|
dept_lc = (dept or '').lower()
|
|
|
|
has_attorney_title = any(kw in title_lc for kw in ATTORNEY_TITLE_KEYWORDS)
|
|
has_legal_dept = any(kw in dept_lc for kw in LEGAL_DEPT_KEYWORDS)
|
|
|
|
if not (has_attorney_title or has_legal_dept):
|
|
return False
|
|
|
|
# Rule 2 / Rule 4: domain doesn't share tokens with carrier name
|
|
biz_tokens = tokenise(biz_name)
|
|
domain_tokens = tokenise(domain.split('.')[0]) # just the SLD
|
|
|
|
# Meaningful token overlap → internal contact
|
|
if biz_tokens & domain_tokens:
|
|
return False
|
|
|
|
# Domain shares no tokens with carrier → likely outside counsel
|
|
return True
|
|
|
|
|
|
def run():
|
|
pg = psycopg2.connect(PG_DSN)
|
|
cur = pg.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
|
|
|
for table in ('fcc_rmd', 'fcc_rmd_removed'):
|
|
print(f'\nProcessing {table}...')
|
|
|
|
# fcc_rmd_removed doesn't have contact_title / contact_department
|
|
if table == 'fcc_rmd_removed':
|
|
cur.execute(f"""
|
|
SELECT id, contact_email, NULL AS contact_title,
|
|
NULL AS contact_department, business_name
|
|
FROM {table}
|
|
WHERE contact_email IS NOT NULL
|
|
""")
|
|
else:
|
|
cur.execute(f"""
|
|
SELECT id, contact_email, contact_title, contact_department, business_name
|
|
FROM {table}
|
|
WHERE contact_email IS NOT NULL
|
|
""")
|
|
rows = cur.fetchall()
|
|
|
|
flagged = 0
|
|
for row in rows:
|
|
flag = is_outside_counsel(
|
|
row['contact_email'],
|
|
row.get('contact_title') or '',
|
|
row.get('contact_department') or '',
|
|
row['business_name'] or '',
|
|
)
|
|
cur.execute(
|
|
f"UPDATE {table} SET is_outside_counsel = %s WHERE id = %s",
|
|
(flag, row['id'])
|
|
)
|
|
if flag:
|
|
flagged += 1
|
|
|
|
pg.commit()
|
|
print(f' {len(rows)} rows checked, {flagged} flagged as outside counsel')
|
|
|
|
# Show samples of flagged rows from fcc_rmd
|
|
cur.execute("""
|
|
SELECT contact_email, contact_title, contact_department, business_name
|
|
FROM fcc_rmd
|
|
WHERE is_outside_counsel = TRUE
|
|
ORDER BY split_part(contact_email,'@',2), contact_email
|
|
LIMIT 30
|
|
""")
|
|
print('\nSample flagged rows (fcc_rmd):')
|
|
for r in cur.fetchall():
|
|
print(f" {r['contact_email']:<45} | {(r['contact_title'] or '')[:30]:<30} | {r['business_name'][:40]}")
|
|
|
|
pg.close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
run()
|