From f94ad1682b374497c9f71c7e8c13027aba33165e Mon Sep 17 00:00:00 2001 From: justin Date: Tue, 9 Jun 2026 08:34:37 -0500 Subject: [PATCH] fix(formation/TX): name search via Texas open-data API, not scraping The TX Comptroller web search is now a JS form (old input#entityName selector dead) and SOSDirect is login-gated, so the scraper returned garbage. Replaced search_name with the Texas Socrata 'Active Franchise Taxpayers' dataset (data.texas.gov/resource/9cir-efmm.json) over SoQL -- free, no-auth, no-login, no bot-blocks. Exact normalized match => unavailable; no rows => available; API error => available=None (never a false 'taken'). Verified: unique name = 0 rows (available), 'APPLE INC.' = exact match (taken). --- scripts/formation/states/tx/adapter.py | 125 ++++++++++--------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/scripts/formation/states/tx/adapter.py b/scripts/formation/states/tx/adapter.py index 902feab..fc22632 100644 --- a/scripts/formation/states/tx/adapter.py +++ b/scripts/formation/states/tx/adapter.py @@ -51,99 +51,68 @@ class TXPortal(StatePortal): # ── Name Search (Comptroller Taxable Entity Search — no login) ────── async def search_name(self, name: str) -> NameSearchResult: - """Search Texas business name availability via the Comptroller - Taxable Entity Search (free, no login required). + """Search Texas business name availability via the Texas open-data API. - URL: https://mycpa.cpa.state.tx.us/coa/Index.html + Uses the Comptroller "Active Franchise Taxpayers" dataset on the state + Socrata portal (data.texas.gov, dataset 9cir-efmm) over SoQL. This is a + free, no-auth, no-login JSON API -- far more robust than scraping the + Comptroller web search (which is a JS form) or SOSDirect (login-gated, + ASP.NET). An entity name appearing as an *active* franchise taxpayer is + a strong "name in use" signal; absence => available (subject to a final + SOSDirect confirmation when the order is actually filed). - This searches the Comptroller's database, not the SOS. A name - can be "available" in the Comptroller DB but reserved at SOS. - For definitive availability, SOSDirect's name check is better — - but requires login. We check Comptroller first (free + fast), - then flag for SOSDirect confirmation if the customer proceeds. + Availability semantics: + - exact (normalized) match -> available=False, exact_match=True + - only similar names -> available=True, similar_names listed + - no rows -> available=True + - API error -> available=None (never a false "taken") """ + import json as _json + import urllib.parse as _url + import urllib.request as _req + + def _norm(s: str) -> str: + return ( + s.upper() + .replace(",", "") + .replace(".", "") + .replace(" ", " ") + .strip() + ) + + # SoQL: case-insensitive LIKE on the first ~20 normalized chars so we + # catch the exact name plus close variants, capped at 25 rows. + needle = _norm(name)[:20].replace("'", "''") + where = f"upper(taxpayer_name) like '%{needle}%'" + api = ( + "https://data.texas.gov/resource/9cir-efmm.json?" + + _url.urlencode({"$where": where, "$limit": "25", + "$select": "taxpayer_name"}) + ) try: - page = await self.start_browser() - await page.goto( - "https://mycpa.cpa.state.tx.us/coa/Index.html", - wait_until="networkidle", - ) - await self.human_delay(1.0, 2.0) + loop = asyncio.get_event_loop() - # The Comptroller search page has a text input + search button - # Selector: input field for entity name - await page.fill( - 'input[name="entityName"], input#entityName, input[type="text"]', - "", - ) - await self.type_slowly( - page, - 'input[name="entityName"], input#entityName, input[type="text"]', - name, - ) - await self.human_delay(0.5, 1.0) - - # Click search - await page.click( - 'input[type="submit"], button[type="submit"], ' - '#searchButton, input[value="Search"]' - ) - await page.wait_for_load_state("networkidle") - await self.human_delay(1.0, 2.0) - - # Parse results - content = await page.content() - await self.screenshot(page, f"tx_name_search_{name}") - - # Check for "no results" indicator - no_results = ( - "no match" in content.lower() - or "no entities found" in content.lower() - or "no records" in content.lower() - or "0 results" in content.lower() - ) - - if no_results: - return NameSearchResult( - available=True, - exact_match=False, - similar_names=[], - state_code="TX", - searched_name=name, - raw_response=content[:2000], - ) - - # Extract matching entity names from results - similar: list[str] = [] - # Common patterns: table rows with entity names - name_pattern = re.compile( - r']*>([^<]*?' + re.escape(name[:10]) + r'[^<]*?)', - re.IGNORECASE, - ) - for m in name_pattern.finditer(content): - found = m.group(1).strip() - if found and len(found) > 3: - similar.append(found) - - # Exact match = one of the results matches our name closely - exact = any( - s.upper().replace(",", "").replace(".", "").strip() - == name.upper().replace(",", "").replace(".", "").strip() - for s in similar - ) + def _fetch() -> list[dict]: + req = _req.Request(api, headers={"User-Agent": "PerformanceWest formation name-check"}) + with _req.urlopen(req, timeout=20) as resp: + return _json.loads(resp.read().decode("utf-8")) + rows = await loop.run_in_executor(None, _fetch) + similar = [r.get("taxpayer_name", "").strip() for r in rows if r.get("taxpayer_name")] + target = _norm(name) + exact = any(_norm(s) == target for s in similar) return NameSearchResult( available=not exact, exact_match=exact, similar_names=similar[:10], state_code="TX", searched_name=name, - raw_response=content[:2000], + raw_response=f"texas-open-data 9cir-efmm: {len(similar)} match(es)", ) - except Exception as exc: + # available=None => "could not determine" (never a false "taken"). return NameSearchResult( - available=False, + available=None, state_code="TX", searched_name=name, raw_response=f"Error: {exc}",