fix(formation/TX): name search via Texas open-data API, not scraping
The TX Comptroller web search is now a JS form (the old input#entityName selector no longer matches anything) and SOSDirect is login-gated, so the scraper returned garbage. search_name is now reimplemented on top of the Texas Socrata 'Active Franchise Taxpayers' dataset (data.texas.gov/resource/9cir-efmm.json), queried over SoQL -- free, no auth, no login, no bot-blocks. Semantics: exact normalized match => unavailable; no rows, or only similar names => available; API error => available=None (never a false 'taken'). Verified: a unique name returns 0 rows (available); 'APPLE INC.' returns an exact match (taken).
This commit is contained in:
parent
561ad78ea8
commit
f94ad1682b
1 changed file with 47 additions and 78 deletions
|
|
@ -51,99 +51,68 @@ class TXPortal(StatePortal):
|
|||
# ── Name Search (Comptroller Taxable Entity Search — no login) ──────
|
||||
|
||||
async def search_name(self, name: str) -> NameSearchResult:
|
||||
"""Search Texas business name availability via the Comptroller
|
||||
Taxable Entity Search (free, no login required).
|
||||
"""Search Texas business name availability via the Texas open-data API.
|
||||
|
||||
URL: https://mycpa.cpa.state.tx.us/coa/Index.html
|
||||
Uses the Comptroller "Active Franchise Taxpayers" dataset on the state
|
||||
Socrata portal (data.texas.gov, dataset 9cir-efmm) over SoQL. This is a
|
||||
free, no-auth, no-login JSON API -- far more robust than scraping the
|
||||
Comptroller web search (which is a JS form) or SOSDirect (login-gated,
|
||||
ASP.NET). An entity name appearing as an *active* franchise taxpayer is
|
||||
a strong "name in use" signal; absence => available (subject to a final
|
||||
SOSDirect confirmation when the order is actually filed).
|
||||
|
||||
This searches the Comptroller's database, not the SOS. A name
|
||||
can be "available" in the Comptroller DB but reserved at SOS.
|
||||
For definitive availability, SOSDirect's name check is better —
|
||||
but requires login. We check Comptroller first (free + fast),
|
||||
then flag for SOSDirect confirmation if the customer proceeds.
|
||||
Availability semantics:
|
||||
- exact (normalized) match -> available=False, exact_match=True
|
||||
- only similar names -> available=True, similar_names listed
|
||||
- no rows -> available=True
|
||||
- API error -> available=None (never a false "taken")
|
||||
"""
|
||||
import json as _json
|
||||
import urllib.parse as _url
|
||||
import urllib.request as _req
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
return (
|
||||
s.upper()
|
||||
.replace(",", "")
|
||||
.replace(".", "")
|
||||
.replace(" ", " ")
|
||||
.strip()
|
||||
)
|
||||
|
||||
# SoQL: case-insensitive LIKE on the first ~20 normalized chars so we
|
||||
# catch the exact name plus close variants, capped at 25 rows.
|
||||
needle = _norm(name)[:20].replace("'", "''")
|
||||
where = f"upper(taxpayer_name) like '%{needle}%'"
|
||||
api = (
|
||||
"https://data.texas.gov/resource/9cir-efmm.json?"
|
||||
+ _url.urlencode({"$where": where, "$limit": "25",
|
||||
"$select": "taxpayer_name"})
|
||||
)
|
||||
try:
|
||||
page = await self.start_browser()
|
||||
await page.goto(
|
||||
"https://mycpa.cpa.state.tx.us/coa/Index.html",
|
||||
wait_until="networkidle",
|
||||
)
|
||||
await self.human_delay(1.0, 2.0)
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
# The Comptroller search page has a text input + search button
|
||||
# Selector: input field for entity name
|
||||
await page.fill(
|
||||
'input[name="entityName"], input#entityName, input[type="text"]',
|
||||
"",
|
||||
)
|
||||
await self.type_slowly(
|
||||
page,
|
||||
'input[name="entityName"], input#entityName, input[type="text"]',
|
||||
name,
|
||||
)
|
||||
await self.human_delay(0.5, 1.0)
|
||||
|
||||
# Click search
|
||||
await page.click(
|
||||
'input[type="submit"], button[type="submit"], '
|
||||
'#searchButton, input[value="Search"]'
|
||||
)
|
||||
await page.wait_for_load_state("networkidle")
|
||||
await self.human_delay(1.0, 2.0)
|
||||
|
||||
# Parse results
|
||||
content = await page.content()
|
||||
await self.screenshot(page, f"tx_name_search_{name}")
|
||||
|
||||
# Check for "no results" indicator
|
||||
no_results = (
|
||||
"no match" in content.lower()
|
||||
or "no entities found" in content.lower()
|
||||
or "no records" in content.lower()
|
||||
or "0 results" in content.lower()
|
||||
)
|
||||
|
||||
if no_results:
|
||||
return NameSearchResult(
|
||||
available=True,
|
||||
exact_match=False,
|
||||
similar_names=[],
|
||||
state_code="TX",
|
||||
searched_name=name,
|
||||
raw_response=content[:2000],
|
||||
)
|
||||
|
||||
# Extract matching entity names from results
|
||||
similar: list[str] = []
|
||||
# Common patterns: table rows with entity names
|
||||
name_pattern = re.compile(
|
||||
r'<td[^>]*>([^<]*?' + re.escape(name[:10]) + r'[^<]*?)</td>',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for m in name_pattern.finditer(content):
|
||||
found = m.group(1).strip()
|
||||
if found and len(found) > 3:
|
||||
similar.append(found)
|
||||
|
||||
# Exact match = one of the results matches our name closely
|
||||
exact = any(
|
||||
s.upper().replace(",", "").replace(".", "").strip()
|
||||
== name.upper().replace(",", "").replace(".", "").strip()
|
||||
for s in similar
|
||||
)
|
||||
def _fetch() -> list[dict]:
|
||||
req = _req.Request(api, headers={"User-Agent": "PerformanceWest formation name-check"})
|
||||
with _req.urlopen(req, timeout=20) as resp:
|
||||
return _json.loads(resp.read().decode("utf-8"))
|
||||
|
||||
rows = await loop.run_in_executor(None, _fetch)
|
||||
similar = [r.get("taxpayer_name", "").strip() for r in rows if r.get("taxpayer_name")]
|
||||
target = _norm(name)
|
||||
exact = any(_norm(s) == target for s in similar)
|
||||
return NameSearchResult(
|
||||
available=not exact,
|
||||
exact_match=exact,
|
||||
similar_names=similar[:10],
|
||||
state_code="TX",
|
||||
searched_name=name,
|
||||
raw_response=content[:2000],
|
||||
raw_response=f"texas-open-data 9cir-efmm: {len(similar)} match(es)",
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
# available=None => "could not determine" (never a false "taken").
|
||||
return NameSearchResult(
|
||||
available=False,
|
||||
available=None,
|
||||
state_code="TX",
|
||||
searched_name=name,
|
||||
raw_response=f"Error: {exc}",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue