Fix long-running PG transactions in RMD scrapers

Both scrapers held a cursor/transaction open while making slow HTTP
requests to FCC ServiceNow and company websites. This left the
connection "idle in transaction" for 10+ minutes and triggered the
PostgresSlowQueries alert.

Fix: fetch all candidate rows upfront, commit the read transaction
immediately, then write each result with its own short
UPDATE+COMMIT cycle, so that no transaction stays open across an
HTTP request. No long-lived transactions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-03 04:13:33 -05:00
parent 5e74c1dcb9
commit eee2aa497b
2 changed files with 38 additions and 31 deletions

View file

@ -113,15 +113,18 @@ def run_email_research(conn: psycopg2.extensions.connection) -> int:
2. Try FCC CORES lookup by FRN
3. Try guessing the company website and scraping contact emails
"""
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute("""
SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
FROM fcc_rmd_removed r
WHERE r.contact_email IS NULL
AND r.business_name NOT LIKE '[%'
ORDER BY r.id
""")
rows = cur.fetchall()
# Fetch all IDs upfront then close cursor — don't hold a transaction
# open while doing slow HTTP requests (CORES, website scraping)
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
cur.execute("""
SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
FROM fcc_rmd_removed r
WHERE r.contact_email IS NULL
AND r.business_name NOT LIKE '[%'
ORDER BY r.id
""")
rows = cur.fetchall()
conn.commit() # close the read transaction immediately
LOG.info("Researching emails for %d removed carriers …", len(rows))
found = 0

View file

@ -304,18 +304,21 @@ def run_email_scrape(
Returns the number of emails successfully fetched.
"""
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
query = """
SELECT id, rmd_number, servicenow_sys_id
FROM fcc_rmd
WHERE contact_email IS NULL
AND servicenow_sys_id IS NOT NULL
ORDER BY rmd_number
"""
if limit:
query += f" LIMIT {int(limit)}"
cur.execute(query)
rows = cur.fetchall()
# Fetch all IDs upfront then close the cursor — don't hold a transaction
# open while doing slow HTTP requests to ServiceNow
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
query = """
SELECT id, rmd_number, servicenow_sys_id
FROM fcc_rmd
WHERE contact_email IS NULL
AND servicenow_sys_id IS NOT NULL
ORDER BY rmd_number
"""
if limit:
query += f" LIMIT {int(limit)}"
cur.execute(query)
rows = cur.fetchall()
conn.commit() # close the read transaction immediately
LOG.info("Fetching email for %d records via SP API …", len(rows))
scraped = 0
@ -340,16 +343,17 @@ def run_email_scrape(
email_val = _fetch_email_via_sp_api(sys_id, session)
if email_val:
cur.execute(
"""
UPDATE fcc_rmd
SET contact_email = %s,
contact_email_scraped_at = now(),
updated_at = now()
WHERE id = %s
""",
(email_val, row["id"]),
)
with conn.cursor() as ucur:
ucur.execute(
"""
UPDATE fcc_rmd
SET contact_email = %s,
contact_email_scraped_at = now(),
updated_at = now()
WHERE id = %s
""",
(email_val, row["id"]),
)
conn.commit()
scraped += 1
if i <= 10 or i % 500 == 0: