""" Flag fcc_rmd and fcc_rmd_removed rows where the contact email appears to belong to an outside law firm rather than the carrier itself. Detection rules (any one is sufficient): Rule 1 — Law firm domain keywords The email domain contains: law, llp, pllc, legal, counsel, atty, attorney, solicitor, barrister, advocates, chambers Rule 2 — Attorney title + domain mismatch contact_title contains attorney/counsel/esq/of counsel/solicitor AND the email domain shares no significant token with the business name Rule 3 — Known FCC telecom law firm domains Email domain is a recognised Washington DC / telecom regulatory law firm Rule 4 — Domain belongs to a clearly different company Email domain shares zero tokens with the business name AND contact_title or contact_department contains legal/regulatory/counsel/attorney AND the domain is not a generic provider (gmail, yahoo, hotmail, etc.) """ import re, psycopg2, psycopg2.extras PG_DSN = 'postgresql://pw:pw_dev_2026@api-postgres:5432/performancewest' # ── Rule 1: domains that are intrinsically law firm domains ────────────────── LAW_DOMAIN_KEYWORDS = [ 'law', 'llp', 'pllc', 'legalgroup', 'legalteam', 'legalcounsel', 'counsel', 'atty', 'attorney', 'solicitor', 'barrister', 'advocates', 'chambers', 'lawfirm', 'lawgroup', 'lawoffice', ] # ── Rule 3: known telecom / FCC regulatory law firm domains ───────────────── KNOWN_LAW_FIRM_DOMAINS = { # Washington DC telecom regulatory firms 'wiltshirelaw.com', 'wiltshire.com', 'kelleydrye.com', 'wiley.law', 'wileyrein.com', 'harrismoure.com', 'harriswiltshire.com', 'lermansentry.com', 'lermansentry.com', 'compliancetm.com', 'fcclaw.com', 'telecommunicationslaw.com', 'blooston.com', 'posternak.com', 'commlaw.com', 'fhhlaw.com', 'fhhlaw.net', 'ksmcommunications.com', 'stinson.com', 'vonlehman.com', 'perkinscoie.com', 'hklaw.com', 'garveylaw.com', 'garveylaw.net', 'taftlaw.com', 'pillsburylaw.com', 'watt.com', 'kmarlaw.com', 'mcnamarlaw.com', 'lawt.com', 'lawmoss.com', 'adlawfirm.com', 'adlawgroup.com', 'jasslaw.com', 'cldlaw.com', 'butlersnow.com', 'hklaw.com', 'lathropgpm.com', 'lathropgage.com', 'bclplaw.com', 'sheppardmullin.com', 'hklaw.com', 'rubin-levin.com', 'tillmanlaw.com', 'tillmanlaw.net', 'taftlaw.com', 'tillmanlaw.com', 'sgrlaw.com', # Swidler Berlin 'dkslaw.com', 'dkslaw.net', 'fhhlaw.com', 'fhhlaw.net', 'bracewell.com', 'morganlewis.com', 'steptoe.com', 'millernash.com', 'garveylaw.com', 'hklaw.com', 'arnoldporter.com', 'arnoldporter.net', 'dwtlaw.com', # Davis Wright Tremaine 'dwt.com', 'gklaw.com', 'jenner.com', 'macpie.com', 'wsgr.com', # Wilson Sonsini 'cooley.com', 'fbm.com', # Fletcher Heald & Hildreth 'fhhlaw.com', 'klgates.com', 'klgates.net', 'fisherphillips.com', 'mintz.com', 'mintzlaw.com', 'rcatlp.com', # Regulatory Comm/Advocacy 'wilkinsonbarker.com', 'wb-law.com', 'lbblawyers.com', 'nbclaw.net', 'schiffhardin.com', 'sidley.com', 'skadden.com', 'whitecase.com', 'willkie.com', 'winston.com', 'wlrk.com', 'wlrlaw.com', 'zuckerman.com', 'zplaw.com', 'goldsteinlaw.net', 'kaplanlaw.com', 'mkattorneys.com', 'mklaw.com', 'mkglaw.com', 'tankleflaw.com', 'tlglaw.com', 'claytonutz.com', 'hatchmott.com', 'hatchmottlaw.com', } # ── Generic / personal email providers — NOT outside counsel ──────────────── GENERIC_PROVIDERS = { 'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'aol.com', 'icloud.com', 'protonmail.com', 'proton.me', 'me.com', 'mac.com', 'live.com', 'msn.com', 'comcast.net', 'att.net', 'verizon.net', } # Attorney/counsel title keywords ATTORNEY_TITLE_KEYWORDS = [ 'attorney', 'counsel', 'esq', 'esquire', 'solicitor', 'barrister', 'of counsel', 'partner', 'associate', # partner/associate in legal context 'paralegal', 'legal advisor', 'legal counsel', ] LEGAL_DEPT_KEYWORDS = [ 'legal', 'outside counsel', 'law department', 'general counsel', 'regulatory affairs', 'regulatory counsel', ] # ── Merger / parent-company domain map ────────────────────────────────────── # Maps an email domain to a set of brand/company tokens it legitimately covers. # Any carrier whose name contains ANY of those tokens is treated as an # internal contact (not outside counsel) even though the domain doesn't match # the carrier name literally. # # Built from the actual fcc_rmd data — high-count shared domains where the # contact is clearly a parent/acquirer/successor, not outside counsel. MERGER_DOMAIN_TOKENS: dict[str, set[str]] = { # Verizon — absorbed ALLTEL, GTE Mobilnet, MCI/WorldCom, XO, Straight Path, # Bell Atlantic Mobile, AirTouch, Rural Cellular (Unicel), Visible 'verizon.com': {'verizon','alltel','cellco','gte','mci','mcimetro','xo', 'bellatlantic','airtouch','ruralcellular','straight','visible', 'metacomm','metropolitan','commsnet','commnet'}, # TDS Telecom — operates under hundreds of local brand names 'tdstelecom.com': {'tds','telephone','telecom','telcom'}, # Lumen / CenturyLink / Level3 / Qwest 'lumen.com': {'lumen','centurylink','level3','qwest','embarq','tw telecom', 'tw','broadwing','genuity','global crossing'}, 'centurylink.com': {'centurylink','qwest','embarq','savvis','level3'}, 'level3.com': {'level3','tw telecom','global crossing','broadwing','genuity', 'looking glass','wiltel'}, 'lumentech.com': {'lumen','centurylink','level3','qwest'}, # AT&T — absorbed BellSouth, Cingular, SBC, Alascom, DirecTV, Cricket, # Teleport, Time Warner Telecom 'att.com': {'att','bellsouth','cingular','sbc','pacific bell','nevada bell', 'illinois bell','southwestern bell','indiana bell','michigan bell', 'wisconsin bell','alascom','cricket','directv','teleport', 'firstnet','warnermedia'}, # T-Mobile — absorbed Sprint, Nextel, MetroPCS, Boost, Assurance 't-mobile.com': {'tmobile','sprint','nextel','metropcs','boost','assurance', 'iiphone','sievert','clearwire'}, 'sprint.com': {'sprint','nextel','clearwire','virgin mobile','boost'}, # Comcast / Xfinity 'comcast.com': {'comcast','xfinity','nbcuniversal'}, # Charter / Spectrum — absorbed Time Warner Cable, Bright House 'charter.com': {'charter','spectrum','time warner','brighthouse','cablevision'}, 'spectrum.com': {'spectrum','charter','time warner','brighthouse'}, # Cox Communications 'cox.com': {'cox'}, # Altice / Optimum / Suddenlink 'altice.net': {'altice','optimum','suddenlink','cablevision','lightpath'}, 'alticeusa.com': {'altice','optimum','suddenlink','cablevision'}, # Frontier — absorbed AT&T wireline, Verizon wireline, Citizens Telephone 'frontier.com': {'frontier','citizens','commonwealth','ctsi'}, # Brightspeed — spun out of Lumen 'brightspeed.com': {'brightspeed','centurylink','qwest'}, # Consolidated Communications — absorbed FairPoint, North Pittsburgh 'consolidated.com': {'consolidated','fairpoint','north pittsburgh','surewest'}, 'fairpoint.com': {'fairpoint','northern new england telephone'}, # Windstream — absorbed NTELOS, Iowa Telecom, NuVox, CT Communications 'windstream.com': {'windstream','ntelos','iowa telecom','nuvox','conestoga', 'valor telecom','alltel wireline'}, # Cincinnati Bell / Altafiber 'altafiber.com': {'altafiber','cincinnati bell','cintec','hawaiian telcom'}, 'cinbell.com': {'cincinnati bell','altafiber'}, # WideOpenWest (WOW!) 'wowway.com': {'wowway','wideopenwest','knology'}, # Mediacom 'mediacomcc.com': {'mediacom'}, # Armstrong / GoNetspeed 'agoc.com': {'armstrong'}, 'gonetspeed.com': {'gonetspeed','otelco','blountsville','brindlee','granby', 'hopper','midmaine','pine tree','saco','shoreham','trumansburg', 'war telephone','finger lakes','ontario telephone'}, # Sparklight / Cable One 'sparklight.biz': {'sparklight','cable one','cableone','cobridge','comsouh', 'fidelity','hargray','bluffton'}, # Sinch — absorbed Inteliquent, Onvoy, ANPI, Voyant 'sinch.com': {'sinch','inteliquent','onvoy','anpi','voyant'}, # NetSol / NelNet / netins 'netins.net': {'netins','iowa'}, # Iowa rural telcos on shared platform # Nuvera / New Ulm Telecom platform 'nuvera.net': {'nuvera','hutchinson','new ulm','peoples','scott rice', 'sleepy eye','western telephone'}, # ATNI — American Telephone & Internet parent 'atni.com': {'atni','commnet','essextel','ntua','sacred wind','sw dineh', 'viya','vitelco','innovative telephone','virgin islands'}, # TEC / Telephone and Data Systems spinoffs 'tec.com': {'tec','bay springs','crockett','national telephone','roanoke', 'peoples telephone','west tennessee'}, # Comporium group 'comporium.com': {'comporium','citizens','fort mill','lancaster','pbt'}, # MBO / Cross Telecom group 'mbo.one': {'mbo','cross','cim tel','optictel','pottawatomie'}, # BevComm group (Minnesota rural) 'bevcomm.com': {'bevcomm','blue earth','cannon valley','easton','eckles', 'hager','indianhead','lonsdale','pine island'}, # Townes Telecom group 'townes.net': {'townes','choctaw','haxtun','mokan','northeast florida', 'oregon farmers','pymatuning','walnut hill'}, # Dobson / Pinnacle group 'dobson.net': {'dobson','lavaca','mcloud','pinnacle','vantage'}, # Ritter Communications group 'rittercommunications.com': {'ritter','millington','mtel','tri county'}, # EATEL group 'eatel.com': {'eatel','east ascension','reserve','advanced tel','sjii','tlx'}, # BCM One — absorbed SIP.US, CoreDial, Skyswitch, nexVortex 'bcmone.com': {'bcmone','coredial','payg','skyswitch','sipus','nexvortex'}, # Motorola Solutions — owns SwiftReach (emergency notification, not telecom carrier) 'motorolasolutions.com': {'swiftreach'}, # Claro Latin America (América Móvil subsidiaries) 'claro.com.ar': {'claro','amx','telmex'}, 'claro.cr': {'claro'}, # Tata Communications global 'tatacommunications.com': {'tata'}, # Deutsche Telekom / T-Systems 't-systems.com': {'t-systems','deutsche telekom','telekom'}, # Amazon / AWS telecom 'amazon.com': {'amazon','amcs','aws'}, # Cisco / BroadSoft 'cisco.com': {'cisco','broadsoft'}, # EchoStar / Hughes 'echostar.com': {'echostar','hughes','dish'}, # Five9 (cloud contact center) 'five9.com': {'five9'}, # 8x8 '8x8.com': {'8x8'}, # Mitel 'mitel.com': {'mitel','shoretel'}, # GoDaddy (domain/hosting + voice services) 'godaddy.com': {'godaddy'}, # Telstra US 'telstra.com': {'telstra'}, # Highline / various rural 'highlinefast.com': {'highline','dalton','elsie','livingston','rye telephone', 'south park','waverly'}, # CentraNet 'centracom.com': {'centracom','bear lake','central utah','skyline','central telcom'}, # HCI / hcinet rural group 'hcinet.net': {'hcinet','clara city','fort randall','sacred heart', 'starbuck','zumbrota'}, # Fastwyre — parent/brand for Cameron Telephone, KLM, Blair, Mukluk, TelAlaska, etc. 'fastwyre.com': {'fastwyre','cameron','klm','blair','mukluk','telalaska', 'lbh','elizabeth','interior','eastern nebraska','rock county', 'huntel','holway','moundville','arlington'}, # Comcast — acquired Masergy in 2021 'comcast.com': {'comcast','xfinity','masergy','nbcuniversal'}, } def tokenise(name: str) -> set: """Return set of lowercase alpha tokens ≥4 chars from a business name.""" return {t for t in re.findall(r'[a-z]{4,}', name.lower()) if t not in {'corp', 'llcc', 'corp', 'inc.', 'llp.', 'pllc', 'dba.', 'the', 'and', 'for', 'with', 'from', 'that', 'this', 'company', 'communications', 'telephone', 'services', 'systems', 'solutions', 'network', 'networks', 'technologies', 'technology', 'limited', 'group'}} def is_outside_counsel(email: str, title: str, dept: str, biz_name: str) -> bool: if not email or '@' not in email: return False domain = email.split('@')[1].lower() # Rule 1: law firm domain keyword for kw in LAW_DOMAIN_KEYWORDS: if kw in domain: return True # Rule 3: known law firm domain if domain in KNOWN_LAW_FIRM_DOMAINS: return True # Skip generic providers for rules 2 & 4 if domain in GENERIC_PROVIDERS: return False # Merger check: domain is a known parent/acquirer that legitimately covers # this carrier — treat as internal even if names don't literally match if domain in MERGER_DOMAIN_TOKENS: covered_tokens = MERGER_DOMAIN_TOKENS[domain] # Strip punctuation for matching (e.g. "K.L.M." → "klm") biz_lower = re.sub(r'[^a-z0-9 ]', ' ', biz_name.lower()) if any(tok in biz_lower for tok in covered_tokens): return False # Domain is in the merger map but carrier name doesn't match any known # subsidiary — still could be internal (large conglomerates), so only # flag if we also have an attorney title signal title_lc = (title or '').lower() dept_lc = (dept or '').lower() if not any(kw in title_lc for kw in ATTORNEY_TITLE_KEYWORDS) and \ not any(kw in dept_lc for kw in LEGAL_DEPT_KEYWORDS): return False title_lc = (title or '').lower() dept_lc = (dept or '').lower() has_attorney_title = any(kw in title_lc for kw in ATTORNEY_TITLE_KEYWORDS) has_legal_dept = any(kw in dept_lc for kw in LEGAL_DEPT_KEYWORDS) if not (has_attorney_title or has_legal_dept): return False # Rule 2 / Rule 4: domain doesn't share tokens with carrier name biz_tokens = tokenise(biz_name) domain_tokens = tokenise(domain.split('.')[0]) # just the SLD # Meaningful token overlap → internal contact if biz_tokens & domain_tokens: return False # Domain shares no tokens with carrier → likely outside counsel return True def run(): pg = psycopg2.connect(PG_DSN) cur = pg.cursor(cursor_factory=psycopg2.extras.RealDictCursor) for table in ('fcc_rmd', 'fcc_rmd_removed'): print(f'\nProcessing {table}...') # fcc_rmd_removed doesn't have contact_title / contact_department if table == 'fcc_rmd_removed': cur.execute(f""" SELECT id, contact_email, NULL AS contact_title, NULL AS contact_department, business_name FROM {table} WHERE contact_email IS NOT NULL """) else: cur.execute(f""" SELECT id, contact_email, contact_title, contact_department, business_name FROM {table} WHERE contact_email IS NOT NULL """) rows = cur.fetchall() flagged = 0 for row in rows: flag = is_outside_counsel( row['contact_email'], row.get('contact_title') or '', row.get('contact_department') or '', row['business_name'] or '', ) cur.execute( f"UPDATE {table} SET is_outside_counsel = %s WHERE id = %s", (flag, row['id']) ) if flag: flagged += 1 pg.commit() print(f' {len(rows)} rows checked, {flagged} flagged as outside counsel') # Show samples of flagged rows from fcc_rmd cur.execute(""" SELECT contact_email, contact_title, contact_department, business_name FROM fcc_rmd WHERE is_outside_counsel = TRUE ORDER BY split_part(contact_email,'@',2), contact_email LIMIT 30 """) print('\nSample flagged rows (fcc_rmd):') for r in cur.fetchall(): print(f" {r['contact_email']:<45} | {(r['contact_title'] or '')[:30]:<30} | {r['business_name'][:40]}") pg.close() if __name__ == '__main__': run()