diff --git a/scripts/formation/states/tx/adapter.py b/scripts/formation/states/tx/adapter.py index 902feab..fc22632 100644 --- a/scripts/formation/states/tx/adapter.py +++ b/scripts/formation/states/tx/adapter.py @@ -51,99 +51,68 @@ class TXPortal(StatePortal): # ── Name Search (Comptroller Taxable Entity Search — no login) ────── async def search_name(self, name: str) -> NameSearchResult: - """Search Texas business name availability via the Comptroller - Taxable Entity Search (free, no login required). + """Search Texas business name availability via the Texas open-data API. - URL: https://mycpa.cpa.state.tx.us/coa/Index.html + Uses the Comptroller "Active Franchise Taxpayers" dataset on the state + Socrata portal (data.texas.gov, dataset 9cir-efmm) over SoQL. This is a + free, no-auth, no-login JSON API -- far more robust than scraping the + Comptroller web search (which is a JS form) or SOSDirect (login-gated, + ASP.NET). An entity name appearing as an *active* franchise taxpayer is + a strong "name in use" signal; absence => available (subject to a final + SOSDirect confirmation when the order is actually filed). - This searches the Comptroller's database, not the SOS. A name - can be "available" in the Comptroller DB but reserved at SOS. - For definitive availability, SOSDirect's name check is better — - but requires login. We check Comptroller first (free + fast), - then flag for SOSDirect confirmation if the customer proceeds. + Availability semantics: + - exact (normalized) match -> available=False, exact_match=True + - only similar names -> available=True, similar_names listed + - no rows -> available=True + - API error -> available=None (never a false "taken") """ + import json as _json + import urllib.parse as _url + import urllib.request as _req + + def _norm(s: str) -> str: + return ( + s.upper() + .replace(",", "") + .replace(".", "") + .replace(" ", " ") + .strip() + ) + + # SoQL: case-insensitive LIKE on the first ~20 normalized chars so we + # catch the exact name plus close variants, capped at 25 rows. + needle = _norm(name)[:20].replace("'", "''") + where = f"upper(taxpayer_name) like '%{needle}%'" + api = ( + "https://data.texas.gov/resource/9cir-efmm.json?" + + _url.urlencode({"$where": where, "$limit": "25", + "$select": "taxpayer_name"}) + ) try: - page = await self.start_browser() - await page.goto( - "https://mycpa.cpa.state.tx.us/coa/Index.html", - wait_until="networkidle", - ) - await self.human_delay(1.0, 2.0) + loop = asyncio.get_event_loop() - # The Comptroller search page has a text input + search button - # Selector: input field for entity name - await page.fill( - 'input[name="entityName"], input#entityName, input[type="text"]', - "", - ) - await self.type_slowly( - page, - 'input[name="entityName"], input#entityName, input[type="text"]', - name, - ) - await self.human_delay(0.5, 1.0) - - # Click search - await page.click( - 'input[type="submit"], button[type="submit"], ' - '#searchButton, input[value="Search"]' - ) - await page.wait_for_load_state("networkidle") - await self.human_delay(1.0, 2.0) - - # Parse results - content = await page.content() - await self.screenshot(page, f"tx_name_search_{name}") - - # Check for "no results" indicator - no_results = ( - "no match" in content.lower() - or "no entities found" in content.lower() - or "no records" in content.lower() - or "0 results" in content.lower() - ) - - if no_results: - return NameSearchResult( - available=True, - exact_match=False, - similar_names=[], - state_code="TX", - searched_name=name, - raw_response=content[:2000], - ) - - # Extract matching entity names from results - similar: list[str] = [] - # Common patterns: table rows with entity names - name_pattern = re.compile( - r'