fix(formation/TX): name search via Texas open-data API, not scraping

The TX Comptroller web search is now a JS form (the old input#entityName
selector is dead) and SOSDirect is login-gated, so the scraper returned garbage.
Reimplemented search_name as a SoQL query against the Texas Socrata 'Active
Franchise Taxpayers' dataset (data.texas.gov/resource/9cir-efmm.json) -- free,
no auth or login, no bot-blocks. Exact normalized match => unavailable; only
similar names or no rows => available; API error => available=None (never a
false 'taken'). Verified: unique name = 0 rows (available), 'APPLE INC.' =
exact match (taken).
This commit is contained in:
justin 2026-06-09 08:34:37 -05:00
parent 561ad78ea8
commit f94ad1682b

View file

@ -51,99 +51,68 @@ class TXPortal(StatePortal):
# ── Name Search (Texas open-data API — no login) ──────
async def search_name(self, name: str) -> NameSearchResult:
"""Search Texas business name availability via the Comptroller
Taxable Entity Search (free, no login required).
"""Search Texas business name availability via the Texas open-data API.
URL: https://mycpa.cpa.state.tx.us/coa/Index.html
Uses the Comptroller "Active Franchise Taxpayers" dataset on the state
Socrata portal (data.texas.gov, dataset 9cir-efmm) over SoQL. This is a
free, no-auth, no-login JSON API -- far more robust than scraping the
Comptroller web search (which is a JS form) or SOSDirect (login-gated,
ASP.NET). An entity name appearing as an *active* franchise taxpayer is
a strong "name in use" signal; absence => available (subject to a final
SOSDirect confirmation when the order is actually filed).
This searches the Comptroller's database, not the SOS. A name
can be "available" in the Comptroller DB but reserved at SOS.
For definitive availability, SOSDirect's name check is better —
but requires login. We check Comptroller first (free + fast),
then flag for SOSDirect confirmation if the customer proceeds.
Availability semantics:
- exact (normalized) match -> available=False, exact_match=True
- only similar names -> available=True, similar_names listed
- no rows -> available=True
- API error -> available=None (never a false "taken")
"""
import json as _json
import urllib.parse as _url
import urllib.request as _req
def _norm(s: str) -> str:
return (
s.upper()
.replace(",", "")
.replace(".", "")
.replace(" ", " ")
.strip()
)
# SoQL: case-insensitive LIKE on the first ~20 normalized chars so we
# catch the exact name plus close variants, capped at 25 rows.
needle = _norm(name)[:20].replace("'", "''")
where = f"upper(taxpayer_name) like '%{needle}%'"
api = (
"https://data.texas.gov/resource/9cir-efmm.json?"
+ _url.urlencode({"$where": where, "$limit": "25",
"$select": "taxpayer_name"})
)
try:
page = await self.start_browser()
await page.goto(
"https://mycpa.cpa.state.tx.us/coa/Index.html",
wait_until="networkidle",
)
await self.human_delay(1.0, 2.0)
loop = asyncio.get_event_loop()
# The Comptroller search page has a text input + search button
# Selector: input field for entity name
await page.fill(
'input[name="entityName"], input#entityName, input[type="text"]',
"",
)
await self.type_slowly(
page,
'input[name="entityName"], input#entityName, input[type="text"]',
name,
)
await self.human_delay(0.5, 1.0)
# Click search
await page.click(
'input[type="submit"], button[type="submit"], '
'#searchButton, input[value="Search"]'
)
await page.wait_for_load_state("networkidle")
await self.human_delay(1.0, 2.0)
# Parse results
content = await page.content()
await self.screenshot(page, f"tx_name_search_{name}")
# Check for "no results" indicator
no_results = (
"no match" in content.lower()
or "no entities found" in content.lower()
or "no records" in content.lower()
or "0 results" in content.lower()
)
if no_results:
return NameSearchResult(
available=True,
exact_match=False,
similar_names=[],
state_code="TX",
searched_name=name,
raw_response=content[:2000],
)
# Extract matching entity names from results
similar: list[str] = []
# Common patterns: table rows with entity names
name_pattern = re.compile(
r'<td[^>]*>([^<]*?' + re.escape(name[:10]) + r'[^<]*?)</td>',
re.IGNORECASE,
)
for m in name_pattern.finditer(content):
found = m.group(1).strip()
if found and len(found) > 3:
similar.append(found)
# Exact match = one of the results matches our name closely
exact = any(
s.upper().replace(",", "").replace(".", "").strip()
== name.upper().replace(",", "").replace(".", "").strip()
for s in similar
)
def _fetch() -> list[dict]:
req = _req.Request(api, headers={"User-Agent": "PerformanceWest formation name-check"})
with _req.urlopen(req, timeout=20) as resp:
return _json.loads(resp.read().decode("utf-8"))
rows = await loop.run_in_executor(None, _fetch)
similar = [r.get("taxpayer_name", "").strip() for r in rows if r.get("taxpayer_name")]
target = _norm(name)
exact = any(_norm(s) == target for s in similar)
return NameSearchResult(
available=not exact,
exact_match=exact,
similar_names=similar[:10],
state_code="TX",
searched_name=name,
raw_response=content[:2000],
raw_response=f"texas-open-data 9cir-efmm: {len(similar)} match(es)",
)
except Exception as exc:
# available=None => "could not determine" (never a false "taken").
return NameSearchResult(
available=False,
available=None,
state_code="TX",
searched_name=name,
raw_response=f"Error: {exc}",