Fix long-running PG transactions in RMD scrapers
Both scrapers held a cursor/transaction open while making slow HTTP requests to FCC ServiceNow and company websites, leaving the connection "idle in transaction" for 10+ minutes and triggering the PostgresSlowQueries alert. Fix: fetch all rows to be processed upfront, commit the read transaction immediately, then process each row with its own short UPDATE+COMMIT cycle, so no transaction stays open across an HTTP request. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5e74c1dcb9
commit
eee2aa497b
2 changed files with 38 additions and 31 deletions
|
|
@@ -113,15 +113,18 @@ def run_email_research(conn: psycopg2.extensions.connection) -> int:
|
|||
2. Try FCC CORES lookup by FRN
|
||||
3. Try guessing the company website and scraping contact emails
|
||||
"""
|
||||
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
cur.execute("""
|
||||
SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
|
||||
FROM fcc_rmd_removed r
|
||||
WHERE r.contact_email IS NULL
|
||||
AND r.business_name NOT LIKE '[%'
|
||||
ORDER BY r.id
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
# Fetch all IDs upfront then close cursor — don't hold a transaction
|
||||
# open while doing slow HTTP requests (CORES, website scraping)
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
|
||||
cur.execute("""
|
||||
SELECT r.id, r.rmd_number, r.frn, r.business_name, r.business_address
|
||||
FROM fcc_rmd_removed r
|
||||
WHERE r.contact_email IS NULL
|
||||
AND r.business_name NOT LIKE '[%'
|
||||
ORDER BY r.id
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
conn.commit() # close the read transaction immediately
|
||||
LOG.info("Researching emails for %d removed carriers …", len(rows))
|
||||
|
||||
found = 0
|
||||
|
|
|
|||
|
|
@@ -304,18 +304,21 @@ def run_email_scrape(
|
|||
|
||||
Returns the number of emails successfully fetched.
|
||||
"""
|
||||
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
query = """
|
||||
SELECT id, rmd_number, servicenow_sys_id
|
||||
FROM fcc_rmd
|
||||
WHERE contact_email IS NULL
|
||||
AND servicenow_sys_id IS NOT NULL
|
||||
ORDER BY rmd_number
|
||||
"""
|
||||
if limit:
|
||||
query += f" LIMIT {int(limit)}"
|
||||
cur.execute(query)
|
||||
rows = cur.fetchall()
|
||||
# Fetch all IDs upfront then close the cursor — don't hold a transaction
|
||||
# open while doing slow HTTP requests to ServiceNow
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
|
||||
query = """
|
||||
SELECT id, rmd_number, servicenow_sys_id
|
||||
FROM fcc_rmd
|
||||
WHERE contact_email IS NULL
|
||||
AND servicenow_sys_id IS NOT NULL
|
||||
ORDER BY rmd_number
|
||||
"""
|
||||
if limit:
|
||||
query += f" LIMIT {int(limit)}"
|
||||
cur.execute(query)
|
||||
rows = cur.fetchall()
|
||||
conn.commit() # close the read transaction immediately
|
||||
|
||||
LOG.info("Fetching email for %d records via SP API …", len(rows))
|
||||
scraped = 0
|
||||
|
|
@@ -340,16 +343,17 @@ def run_email_scrape(
|
|||
email_val = _fetch_email_via_sp_api(sys_id, session)
|
||||
|
||||
if email_val:
|
||||
cur.execute(
|
||||
"""
|
||||
UPDATE fcc_rmd
|
||||
SET contact_email = %s,
|
||||
contact_email_scraped_at = now(),
|
||||
updated_at = now()
|
||||
WHERE id = %s
|
||||
""",
|
||||
(email_val, row["id"]),
|
||||
)
|
||||
with conn.cursor() as ucur:
|
||||
ucur.execute(
|
||||
"""
|
||||
UPDATE fcc_rmd
|
||||
SET contact_email = %s,
|
||||
contact_email_scraped_at = now(),
|
||||
updated_at = now()
|
||||
WHERE id = %s
|
||||
""",
|
||||
(email_val, row["id"]),
|
||||
)
|
||||
conn.commit()
|
||||
scraped += 1
|
||||
if i <= 10 or i % 500 == 0:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue