Fix long-running PG transactions in RMD scrapers

Both scrapers held a cursor/transaction open while doing slow HTTP
requests to FCC ServiceNow and company websites, leaving the
connection "idle in transaction" for 10+ minutes and triggering the
PostgresSlowQueries alert.

Fix: fetch all candidate rows upfront, commit the read transaction
immediately, then process each row with its own short
UPDATE+COMMIT cycle. No long-lived transactions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-03 04:13:33 -05:00
parent 5e74c1dcb9
commit eee2aa497b
2 changed files with 38 additions and 31 deletions

View file

@ -113,15 +113,18 @@ def run_email_research(conn: psycopg2.extensions.connection) -> int:
2. Try FCC CORES lookup by FRN 2. Try FCC CORES lookup by FRN
3. Try guessing the company website and scraping contact emails 3. Try guessing the company website and scraping contact emails
""" """
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # Fetch all IDs upfront then close cursor — don't hold a transaction
cur.execute(""" # open while doing slow HTTP requests (CORES, website scraping)
SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
FROM fcc_rmd_removed r cur.execute("""
WHERE r.contact_email IS NULL SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
AND r.business_name NOT LIKE '[%' FROM fcc_rmd_removed r
ORDER BY r.id WHERE r.contact_email IS NULL
""") AND r.business_name NOT LIKE '[%'
rows = cur.fetchall() ORDER BY r.id
""")
rows = cur.fetchall()
conn.commit() # close the read transaction immediately
LOG.info("Researching emails for %d removed carriers …", len(rows)) LOG.info("Researching emails for %d removed carriers …", len(rows))
found = 0 found = 0

View file

@ -304,18 +304,21 @@ def run_email_scrape(
Returns the number of emails successfully fetched. Returns the number of emails successfully fetched.
""" """
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # Fetch all IDs upfront then close the cursor — don't hold a transaction
query = """ # open while doing slow HTTP requests to ServiceNow
SELECT id, rmd_number, servicenow_sys_id with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
FROM fcc_rmd query = """
WHERE contact_email IS NULL SELECT id, rmd_number, servicenow_sys_id
AND servicenow_sys_id IS NOT NULL FROM fcc_rmd
ORDER BY rmd_number WHERE contact_email IS NULL
""" AND servicenow_sys_id IS NOT NULL
if limit: ORDER BY rmd_number
query += f" LIMIT {int(limit)}" """
cur.execute(query) if limit:
rows = cur.fetchall() query += f" LIMIT {int(limit)}"
cur.execute(query)
rows = cur.fetchall()
conn.commit() # close the read transaction immediately
LOG.info("Fetching email for %d records via SP API …", len(rows)) LOG.info("Fetching email for %d records via SP API …", len(rows))
scraped = 0 scraped = 0
@ -340,16 +343,17 @@ def run_email_scrape(
email_val = _fetch_email_via_sp_api(sys_id, session) email_val = _fetch_email_via_sp_api(sys_id, session)
if email_val: if email_val:
cur.execute( with conn.cursor() as ucur:
""" ucur.execute(
UPDATE fcc_rmd """
SET contact_email = %s, UPDATE fcc_rmd
contact_email_scraped_at = now(), SET contact_email = %s,
updated_at = now() contact_email_scraped_at = now(),
WHERE id = %s updated_at = now()
""", WHERE id = %s
(email_val, row["id"]), """,
) (email_val, row["id"]),
)
conn.commit() conn.commit()
scraped += 1 scraped += 1
if i <= 10 or i % 500 == 0: if i <= 10 or i % 500 == 0: