Initial commit — Performance West telecom compliance platform

Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00 · 2026-04-27 06:54:22 -05:00 · f8cd37ac8c
commit f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
--- a/scripts/workers/cdr_adapters/init.py
+++ b/scripts/workers/cdr_adapters/init.py
@ -0,0 +1,34 @@
+"""CDR format adapters.
+
+Each adapter parses a switch-specific CDR file format into a normalized
+``CDRRow`` stream consumed by the ingester. Selection is driven by the
+``cdr_ingestion_profiles.format`` column (or inferred from a switch preset).
+
+Contract: ``BaseCDRAdapter.iter_rows(path_or_bytes) -> Iterator[CDRRow]``
+plus ``Adapter.FORMAT_SLUG`` and required-column metadata used by the
+validator.
+"""
+
+from .base import BaseCDRAdapter, CDRRow, ValidationError
+from .generic_csv import GenericCSVAdapter
+from .asterisk import AsteriskAdapter
+from .freeswitch import FreeSWITCHAdapter
+from .netsapiens import NetSapiensAdapter
+
+# Registry used to pick an adapter class from a profile's format slug.
+# Keys come from each class's own ``FORMAT_SLUG`` so the slug is declared
+# in exactly one place; insertion order follows the tuple below.
+ADAPTERS: dict[str, type[BaseCDRAdapter]] = {
+    adapter_cls.FORMAT_SLUG: adapter_cls
+    for adapter_cls in (
+        GenericCSVAdapter,
+        AsteriskAdapter,
+        FreeSWITCHAdapter,
+        NetSapiensAdapter,
+    )
+}
+
+# Public surface of the package.
+__all__ = [
+    "ADAPTERS",
+    "BaseCDRAdapter",
+    "CDRRow",
+    "ValidationError",
+    "GenericCSVAdapter",
+    "AsteriskAdapter",
+    "FreeSWITCHAdapter",
+    "NetSapiensAdapter",
+]
--- a/scripts/workers/cdr_adapters/asterisk.py
+++ b/scripts/workers/cdr_adapters/asterisk.py
@ -0,0 +1,95 @@
+"""Asterisk CDR adapter.
+
+Parses the standard Asterisk `Master.csv` format. Default headerless layout
+(v1.4+):
+
+    accountcode, src, dst, dcontext, clid, channel, dstchannel, lastapp,
+    lastdata, start, answer, end, duration, billsec, disposition,
+    amaflags, uniqueid, userfield
+
+We read both headerless and header'd variants. The ``uniqueid`` column is
+Asterisk's per-call UUID and makes a perfect natural dedup key.
+
+Per-call revenue — Asterisk's built-in CDR does not include a charge
+column. Customers using ``cdr_asteriskcosts`` / ``cdr_addon_mysql`` /
+``cel_custom`` typically add columns for rate and billed amount; those
+are consumed via the generic_csv adapter with a preset mapping.
+"""
+
+from __future__ import annotations
+
+import csv
+import logging
+from typing import Iterator
+
+from .base import BaseCDRAdapter, CDRRow, ValidationError
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_HEADERS = [
+    "accountcode", "src", "dst", "dcontext", "clid", "channel",
+    "dstchannel", "lastapp", "lastdata", "start", "answer", "end",
+    "duration", "billsec", "disposition", "amaflags", "uniqueid", "userfield",
+]
+
+
+class AsteriskAdapter(BaseCDRAdapter):
+    """Adapter for Asterisk ``Master.csv`` CDR files, with or without a header row."""
+
+    FORMAT_SLUG = "asterisk"
+
+    @staticmethod
+    def _header_fields(first_line: str) -> list[str] | None:
+        """Return normalized column names if ``first_line`` is a header row.
+
+        A header is recognised only when whole cells equal the column names
+        this adapter reads. A substring test would misfire on headerless
+        files whose first record merely contains "start" somewhere (for
+        example a ``from-startup`` context), silently turning that record
+        into the header and misnaming every column.
+
+        Returns:
+            Lower-cased, stripped column names, or ``None`` when the line
+            looks like data.
+        """
+        try:
+            cells = next(csv.reader([first_line]), [])
+        except csv.Error:
+            return None
+        names = [cell.strip().lstrip("\ufeff").lower() for cell in cells]
+        if "start" in names and ("src" in names or "dst" in names):
+            return names
+        return None
+
+    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
+        """Yield one validated ``CDRRow`` per record in ``local_path``.
+
+        Args:
+            local_path: Path of the CSV file to read (decoded as UTF-8,
+                undecodable bytes replaced).
+
+        Raises:
+            ValidationError: For the first row that fails parsing or
+                ``validate_row``; any other per-row exception is re-raised
+                as ``ValidationError("unparseable_row", ...)``.
+        """
+        with open(local_path, "r", encoding="utf-8", errors="replace", newline="") as fh:
+            # Peek the first line: if it is a header row, use its (normalized)
+            # names and continue reading after it; otherwise rewind and fall
+            # back to the default Asterisk column order.
+            header = self._header_fields(fh.readline())
+            if header is None:
+                fh.seek(0)
+                header = _DEFAULT_HEADERS
+            reader = csv.DictReader(fh, fieldnames=header)
+
+            for i, raw in enumerate(reader, start=1):
+                try:
+                    # Prefer billsec (answered portion) over duration for 499-A
+                    billsec_raw = raw.get("billsec") or raw.get("duration") or "0"
+                    duration = self.parse_duration(billsec_raw)
+                    start = self.parse_ts(raw.get("start"))
+                    caller = (raw.get("src") or raw.get("clid") or "").strip()
+                    called = (raw.get("dst") or "").strip()
+                    unique_id = (raw.get("uniqueid") or "").strip()
+
+                    row = CDRRow(
+                        start_time=start,
+                        caller_number=caller,
+                        called_number=called,
+                        duration_sec=duration,
+                        trunk_group_id=_extract_trunk(raw.get("channel") or raw.get("dstchannel")),
+                        customer_account_id=(raw.get("accountcode") or "").strip() or None,
+                        disposition=(raw.get("disposition") or "").strip().lower() or None,
+                        # uniqueid when present; otherwise a composite of the call's endpoints,
+                        # start time and duration.
+                        natural_key=unique_id or f"{caller}|{called}|{start.isoformat()}|{duration}",
+                        source_file=local_path,
+                        source_row=i,
+                        raw=dict(raw),
+                    )
+                    self.validate_row(row)
+                    yield row
+                except ValidationError:
+                    raise
+                except Exception as exc:
+                    raise ValidationError("unparseable_row", str(exc)) from exc
+
+
+def _extract_trunk(channel: str | None) -> str | None:
+    """Pull a trunk-group identifier from an Asterisk channel string.
+
+    Asterisk channels look like: ``SIP/trunk-mycarrier-0000abcd`` or
+    ``PJSIP/outbound-trunk/out-sip:+14155551212@...``. The portion right
+    after the protocol is a stable trunk id for bucketing, once the
+    trailing per-call sequence suffix (``-0000abcd``, ``-1``) is removed.
+
+    Only that final suffix is stripped, so peer names that themselves
+    contain hyphens (``trunk-mycarrier``) are kept whole rather than being
+    cut at the first hyphen.
+
+    Args:
+        channel: Raw channel string, or ``None``/empty when absent.
+
+    Returns:
+        The trunk identifier, or ``None`` when none can be extracted.
+    """
+    if not channel:
+        return None
+    parts = channel.split("/")
+    if len(parts) < 2:
+        return None
+    token = parts[1]
+    head, sep, suffix = token.rpartition("-")
+    # A sequence suffix is either plain decimal digits or a hex counter of
+    # at least eight digits; anything else is part of the peer name.
+    is_sequence = suffix.isascii() and (
+        suffix.isdigit()
+        or (len(suffix) >= 8 and all(ch in "0123456789abcdefABCDEF" for ch in suffix))
+    )
+    if sep and is_sequence:
+        token = head
+    return token or None
--- a/scripts/workers/cdr_adapters/base.py
+++ b/scripts/workers/cdr_adapters/base.py
@ -0,0 +1,202 @@
+"""Base CDR adapter — shared interface + normalized row type + validation.
+
+All format adapters inherit from ``BaseCDRAdapter`` and implement
+``iter_rows()``. The ingester sees only this interface, so the classifier
+and quarantine logic are adapter-agnostic.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from typing import Iterator, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ValidationError(Exception):
+    """Signals that a row failed structural validation inside an adapter.
+
+    Carries a machine-readable ``reason_code`` plus free-form ``detail``;
+    the exception message is the two joined as ``"<reason_code>: <detail>"``.
+    Per the module design, the ingester catches this and routes the
+    offending row into ``cdr_quarantine`` with ``reason_code``.
+    """
+
+    def __init__(self, reason_code: str, detail: str = ""):
+        message = f"{reason_code}: {detail}"
+        super().__init__(message)
+        self.detail = detail
+        self.reason_code = reason_code
+
+
+@dataclass
+class CDRRow:
+    """One call record in normalized form, before classification."""
+
+    # --- Required: every adapter must populate these ---
+    start_time: datetime
+    caller_number: str
+    called_number: str
+    duration_sec: int
+
+    # --- Strongly preferred: enables revenue-first attribution ---
+    billed_amount_cents: Optional[int] = None
+    billed_currency: Optional[str] = None
+
+    # --- Optional: sharpens bucketing / accuracy ---
+    call_direction: Optional[str] = None       # inbound|outbound
+    disposition: Optional[str] = None           # answered|no_answer|busy|failed
+    trunk_group_id: Optional[str] = None
+    customer_account_id: Optional[str] = None
+    customer_type_override: Optional[str] = None  # per-row wholesale/retail tag
+
+    # --- Provenance: filled in by the adapter ---
+    natural_key: str = ""                       # adapter-specific uniqueness key
+    source_file: Optional[str] = None
+    source_row: Optional[int] = None
+
+    # Original payload, kept so quarantined rows can be re-processed
+    raw: dict = field(default_factory=dict)
+
+    def natural_key_hash(self, profile_id: int) -> str:
+        """Return the hex SHA-1 of ``"<profile_id>|<natural_key>"`` (dedup key)."""
+        digest = hashlib.sha1()
+        digest.update(f"{profile_id}|{self.natural_key}".encode("utf-8"))
+        return digest.hexdigest()
+
+    def to_db_tuple(self, profile_id: int) -> dict:
+        """Return all fields as a dict, plus ``profile_id`` and ``natural_key_hash``."""
+        payload = asdict(self)
+        payload["profile_id"] = profile_id
+        payload["natural_key_hash"] = self.natural_key_hash(profile_id)
+        return payload
+
+
+class BaseCDRAdapter:
+    """Abstract adapter. Subclasses implement ``iter_rows()``.
+
+    Also hosts the shared row validation and the tolerant parsing helpers
+    (durations, money amounts, timestamps) that format adapters build on.
+    """
+
+    FORMAT_SLUG: str = ""              # matches cdr_ingestion_profiles.format
+    REQUIRED_FIELDS: tuple[str, ...] = ("start_time", "caller_number", "called_number", "duration_sec")
+
+    def __init__(self, profile_config: Optional[dict] = None):
+        """Store the profile's format configuration (an empty dict when omitted)."""
+        self.profile_config = profile_config or {}
+
+    # ------------------------------------------------------------------
+    # Abstract
+    # ------------------------------------------------------------------
+
+    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
+        """Yield one CDRRow per call record in the file."""
+        raise NotImplementedError
+
+    # ------------------------------------------------------------------
+    # Shared row validation — used by every subclass before yield
+    # ------------------------------------------------------------------
+
+    def validate_row(self, row: CDRRow) -> None:
+        """Raise ValidationError if the row fails structural checks.
+
+        Raises:
+            ValidationError: with reason code ``missing_start_time``,
+                ``missing_endpoints``, ``missing_duration``, ``bad_duration``
+                (negative or longer than 24 h) or ``bad_start_time`` (year
+                before 2000 or more than one year ahead of the current
+                UTC year).
+        """
+        # Local import: this module only imports ``datetime`` at file level.
+        from datetime import timezone
+
+        if not isinstance(row.start_time, datetime):
+            raise ValidationError("missing_start_time", "start_time absent or unparseable")
+        if not row.caller_number and not row.called_number:
+            # We need at least one endpoint to classify. Some inbound-only
+            # switches omit the caller; that's fine as long as called is set.
+            raise ValidationError("missing_endpoints", "neither caller nor called number present")
+        if row.duration_sec is None:
+            raise ValidationError("missing_duration", "duration_sec absent")
+        if row.duration_sec < 0:
+            raise ValidationError("bad_duration", f"negative duration {row.duration_sec}")
+        if row.duration_sec > 86400:
+            raise ValidationError("bad_duration", f"duration > 24h: {row.duration_sec}")
+        # Sanity: start_time within a reasonable window. Only the calendar
+        # year is compared, so naive and tz-aware values need no conversion.
+        # ``datetime.now(timezone.utc)`` replaces the deprecated ``utcnow()``.
+        current_year = datetime.now(timezone.utc).year
+        if not 2000 <= row.start_time.year <= current_year + 1:
+            raise ValidationError(
+                "bad_start_time",
+                f"start_time out of range: {row.start_time.isoformat()}",
+            )
+
+    # ------------------------------------------------------------------
+    # Helpers for subclasses
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def parse_duration(value) -> int:
+        """Normalize duration values (seconds, ms, or H:MM:SS) to whole seconds.
+
+        ``None`` and empty/blank strings count as 0. Colon-separated values
+        are read as ``[H:]MM:SS`` (only the first three fields are used) and
+        a fractional seconds field is truncated. Plain numbers above
+        100 000 are assumed to be milliseconds; that heuristic cannot tell
+        short millisecond values from seconds.
+
+        Raises:
+            ValidationError: ``bad_duration`` for anything unparseable,
+                including NaN/infinite values and malformed H:MM:SS input.
+        """
+        if value is None or value == "":
+            return 0
+        if isinstance(value, (int, float)):
+            try:
+                return int(value)
+            except (ValueError, OverflowError) as exc:  # NaN / infinity
+                raise ValidationError("bad_duration", f"unparseable duration: {value}") from exc
+        s = str(value).strip()
+        if not s:
+            return 0
+        try:
+            if ":" in s:
+                fields = s.split(":")
+                while len(fields) < 3:
+                    fields.insert(0, "0")
+                hours, minutes = int(fields[0]), int(fields[1])
+                seconds = int(float(fields[2]))   # tolerate "SS.fff"
+                return hours * 3600 + minutes * 60 + seconds
+            v = float(s)
+            # Heuristic: > 100k usually means milliseconds
+            if v > 100_000:
+                return int(v / 1000)
+            return int(v)
+        except (ValueError, OverflowError) as exc:
+            raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc
+
+    @staticmethod
+    def parse_cents(value, *, currency: str = "USD") -> Optional[int]:
+        """Turn a revenue amount string into integer cents. Returns None if unparseable.
+
+        Commas and ``$`` are ignored. The amount is parsed as a decimal
+        (not a binary float) so values such as ``"0.295"`` round from
+        exactly 29.5 cents; ties round half-to-even, as ``round()`` does.
+        NaN, infinities and absurdly large magnitudes yield ``None``.
+        ``currency`` is accepted for interface compatibility and is not used.
+        """
+        # Local import: ``decimal`` is not imported at file level.
+        from decimal import ROUND_HALF_EVEN, Decimal, InvalidOperation
+
+        if value is None or value == "":
+            return None
+        s = str(value).replace(",", "").replace("$", "").strip()
+        if not s:
+            return None
+        try:
+            amount = Decimal(s)
+            # adjusted() bounds the magnitude so a value like "1e999999999"
+            # can never be expanded into a gigantic integer.
+            if not amount.is_finite() or amount.adjusted() > 18:
+                return None
+            return int((amount * 100).to_integral_value(rounding=ROUND_HALF_EVEN))
+        except (InvalidOperation, ValueError, OverflowError):
+            return None
+
+    @staticmethod
+    def parse_ts(value, fmt: Optional[str] = None) -> datetime:
+        """Parse a timestamp. Accepts ISO-8601, common CDR formats, or Unix epoch.
+
+        All-digit input is read as epoch seconds (milliseconds when above
+        10**12) and returned as a naive UTC datetime. ``fmt``, when given,
+        is tried before the built-in formats. ``%m/%d/%Y`` is tried before
+        ``%d/%m/%Y``, so ambiguous slash dates resolve month-first. The
+        ``strptime`` formats return naive datetimes; the final
+        ``fromisoformat`` fallback returns an aware one when the input
+        carries an offset or ``Z``.
+
+        Raises:
+            ValidationError: ``missing_start_time`` when the value is
+                ``None``, empty, or matches no known format.
+        """
+        # Local import: this module only imports ``datetime`` at file level.
+        from datetime import timezone
+
+        if isinstance(value, datetime):
+            return value
+        s = "" if value is None else str(value).strip()
+        if not s:
+            raise ValidationError("missing_start_time", "empty timestamp")
+        # Unix epoch
+        if s.isdigit():
+            try:
+                epoch = int(s)
+                if epoch > 10**12:         # ms
+                    epoch = epoch // 1000
+                # Same naive-UTC result as the deprecated utcfromtimestamp().
+                return datetime.fromtimestamp(epoch, tz=timezone.utc).replace(tzinfo=None)
+            except (ValueError, OverflowError, OSError):
+                pass
+        if fmt:
+            try:
+                return datetime.strptime(s, fmt)
+            except ValueError:
+                pass
+        # Try several common formats
+        for trial in (
+            "%Y-%m-%d %H:%M:%S",
+            "%Y-%m-%dT%H:%M:%S",
+            "%Y-%m-%dT%H:%M:%SZ",
+            "%Y-%m-%dT%H:%M:%S.%f",
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+            "%m/%d/%Y %H:%M:%S",
+            "%d/%m/%Y %H:%M:%S",
+        ):
+            try:
+                return datetime.strptime(s, trial)
+            except ValueError:
+                continue
+        # Finally, try Python's fromisoformat
+        try:
+            return datetime.fromisoformat(s.replace("Z", "+00:00"))
+        except ValueError as exc:
+            raise ValidationError("missing_start_time", f"unparseable timestamp {s!r}") from exc
--- a/scripts/workers/cdr_adapters/freeswitch.py
+++ b/scripts/workers/cdr_adapters/freeswitch.py
@ -0,0 +1,80 @@
+"""FreeSWITCH CDR adapter.
+
+Handles the standard ``mod_cdr_csv`` output format:
+
+    "caller_id_name","caller_id_number","destination_number","context",
+    "start_stamp","answer_stamp","end_stamp","duration","billsec",
+    "hangup_cause","uuid","bleg_uuid","accountcode"
+
+Billed amount is populated via ``mod_nibblebill`` when installed — the
+additional columns ``nibble_total_billed`` / ``nibble_bill_amount`` /
+``nibble_rate`` land in the same CSV. We pick them up when present.
+
+``uuid`` is FreeSWITCH's unique call identifier and makes a perfect
+natural key for dedup.
+"""
+
+from __future__ import annotations
+
+import csv
+import logging
+from typing import Iterator
+
+from .base import BaseCDRAdapter, CDRRow, ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+class FreeSWITCHAdapter(BaseCDRAdapter):
+    """Adapter for FreeSWITCH ``mod_cdr_csv`` files that carry a header row."""
+
+    FORMAT_SLUG = "freeswitch"
+
+    # Columns we check for per-call billed amount (mod_nibblebill output),
+    # in priority order.
+    _BILLED_COLUMNS = (
+        "nibble_total_billed",
+        "nibble_bill_amount",
+        "billed_amount",
+        "total_charge",
+        "charge",
+    )
+
+    def _billed_cents(self, record):
+        """Return cents from the first billed column that holds a parseable amount, else None."""
+        for column in self._BILLED_COLUMNS:
+            if not record.get(column):
+                continue
+            cents = self.parse_cents(record[column])
+            if cents is not None:
+                return cents
+        return None
+
+    def _build_row(self, record, row_no: int, local_path: str) -> CDRRow:
+        """Translate one CSV record (a dict keyed by header name) into a ``CDRRow``."""
+
+        def text(column: str) -> str:
+            # Missing or empty cells collapse to "".
+            return (record.get(column) or "").strip()
+
+        # start_stamp is preferred; answer_stamp is the fallback.
+        started = self.parse_ts(record.get("start_stamp") or record.get("answer_stamp"))
+        # billsec is preferred over the full duration.
+        seconds = self.parse_duration(record.get("billsec") or record.get("duration") or "0")
+        caller = text("caller_id_number")
+        called = text("destination_number")
+        call_uuid = text("uuid")
+        cents = self._billed_cents(record)
+
+        if call_uuid:
+            key = call_uuid
+        else:
+            key = f"{caller}|{called}|{started.isoformat()}|{seconds}"
+
+        return CDRRow(
+            start_time=started,
+            caller_number=caller,
+            called_number=called,
+            duration_sec=seconds,
+            billed_amount_cents=cents,
+            billed_currency=(None if cents is None else "USD"),
+            trunk_group_id=text("context") or None,
+            customer_account_id=text("accountcode") or None,
+            disposition=text("hangup_cause").lower() or None,
+            natural_key=key,
+            source_file=local_path,
+            source_row=row_no,
+            raw=dict(record),
+        )
+
+    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
+        """Yield one validated ``CDRRow`` per data row of ``local_path``.
+
+        Raises:
+            ValidationError: as raised by parsing/validation, or
+                ``unparseable_row`` wrapping any other per-row exception.
+        """
+        with open(local_path, "r", encoding="utf-8", errors="replace", newline="") as fh:
+            for row_no, record in enumerate(csv.DictReader(fh), start=1):
+                try:
+                    row = self._build_row(record, row_no, local_path)
+                    self.validate_row(row)
+                    yield row
+                except ValidationError:
+                    raise
+                except Exception as exc:
+                    raise ValidationError("unparseable_row", str(exc)) from exc
--- a/scripts/workers/cdr_adapters/generic_csv.py
+++ b/scripts/workers/cdr_adapters/generic_csv.py
@ -0,0 +1,131 @@
+"""Generic CSV adapter — configurable column mapping.
+
+For switches that don't match any of the specific presets or for
+customers whose mediation layer emits a custom CSV, the profile stores
+a column mapping in ``format_config`` JSONB and this adapter maps it
+into the normalized CDR row.
+
+Example ``format_config`` (set by customer via portal):
+
+    {
+        "start_time":    "call_date",
+        "caller_number": "source",
+        "called_number": "destination",
+        "duration_sec":  "billsec",
+        "billed_amount": "charge_usd",
+        "trunk_group":   "trunk",
+        "account_id":    "accountcode",
+        "direction":     "direction",
+        "disposition":   "disposition",
+        "customer_type_override": "cust_type",
+        "call_id":       "uniqueid",
+        "ts_format":     "%Y-%m-%d %H:%M:%S",
+        "encoding":      "utf-8",
+        "delimiter":     ","
+    }
+"""
+
+from __future__ import annotations
+
+import csv
+import logging
+from datetime import datetime
+from typing import Iterator
+
+from .base import BaseCDRAdapter, CDRRow, ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+class GenericCSVAdapter(BaseCDRAdapter):
+    """CSV adapter whose column names come from the profile's ``format_config``."""
+
+    FORMAT_SLUG = "generic_csv"
+
+    # Required mapping keys: the profile's format_config MUST name a source
+    # column for each of these.
+    REQUIRED_MAPPING_KEYS = (
+        "start_time", "caller_number", "called_number", "duration_sec",
+    )
+    OPTIONAL_MAPPING_KEYS = (
+        "billed_amount", "trunk_group", "account_id", "direction",
+        "disposition", "customer_type_override", "call_id",
+    )
+
+    def _check_mapping(self) -> None:
+        """Raise ``ValidationError("bad_mapping")`` if a required mapping key is unset."""
+        missing = [k for k in self.REQUIRED_MAPPING_KEYS if not self.profile_config.get(k)]
+        if missing:
+            raise ValidationError(
+                "bad_mapping",
+                f"generic_csv profile config missing required keys: {missing}",
+            )
+
+    @staticmethod
+    def _cell(raw_row, column) -> str:
+        """Return the stripped text of ``column`` in ``raw_row``.
+
+        Yields ``""`` when the column is unmapped, absent from the row, or
+        blank, so callers can uniformly write ``_cell(...) or None``.
+        """
+        value = raw_row.get(column) if column else None
+        return value.strip() if isinstance(value, str) else ""
+
+    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
+        """Yield one validated ``CDRRow`` per data row of ``local_path``.
+
+        Whitespace-only optional cells are treated as absent: they become
+        ``None`` (and a blank ``call_id`` falls back to the composite
+        natural key) instead of an empty string, which would otherwise
+        give every such row the same dedup key. ``billed_currency`` is set
+        only when an amount was actually parsed.
+
+        Raises:
+            ValidationError: ``bad_mapping`` for an incomplete profile, a
+                parsing/validation code for a bad row, or
+                ``unparseable_row`` wrapping any other per-row exception.
+        """
+        self._check_mapping()
+        cfg = self.profile_config
+        # ``or`` rather than a .get() default: a key stored as null/empty
+        # must also fall back.
+        encoding = cfg.get("encoding") or "utf-8"
+        delimiter = cfg.get("delimiter") or ","
+        ts_format = cfg.get("ts_format")
+        currency = cfg.get("currency", "USD")
+
+        col = {
+            "start_time":    cfg["start_time"],
+            "caller_number": cfg["caller_number"],
+            "called_number": cfg["called_number"],
+            "duration_sec":  cfg["duration_sec"],
+        }
+        # Optional source column names — None when not mapped
+        opt = {k: cfg.get(k) for k in self.OPTIONAL_MAPPING_KEYS}
+        cell = self._cell
+
+        with open(local_path, "r", encoding=encoding, errors="replace", newline="") as fh:
+            reader = csv.DictReader(fh, delimiter=delimiter)
+            for i, raw_row in enumerate(reader, start=1):
+                try:
+                    start_time = self.parse_ts(raw_row.get(col["start_time"]), ts_format)
+                    duration = self.parse_duration(raw_row.get(col["duration_sec"]))
+                    caller = cell(raw_row, col["caller_number"])
+                    called = cell(raw_row, col["called_number"])
+                    billed_cents = (
+                        self.parse_cents(raw_row.get(opt["billed_amount"]))
+                        if opt.get("billed_amount") else None
+                    )
+
+                    row = CDRRow(
+                        start_time=start_time,
+                        caller_number=caller,
+                        called_number=called,
+                        duration_sec=duration,
+                        billed_amount_cents=billed_cents,
+                        billed_currency=(currency if billed_cents is not None else None),
+                        trunk_group_id=cell(raw_row, opt["trunk_group"]) or None,
+                        customer_account_id=cell(raw_row, opt["account_id"]) or None,
+                        call_direction=cell(raw_row, opt["direction"]).lower() or None,
+                        disposition=cell(raw_row, opt["disposition"]).lower() or None,
+                        customer_type_override=(
+                            cell(raw_row, opt["customer_type_override"]).lower() or None
+                        ),
+                        natural_key=(
+                            cell(raw_row, opt["call_id"])
+                            or f"{caller}|{called}|{start_time.isoformat()}|{duration}"
+                        ),
+                        source_file=local_path,
+                        source_row=i,
+                        raw=dict(raw_row),
+                    )
+                    self.validate_row(row)
+                    yield row
+                except ValidationError:
+                    raise  # let ingester catch + quarantine
+                except Exception as exc:
+                    raise ValidationError("unparseable_row", str(exc)) from exc
--- a/scripts/workers/cdr_adapters/netsapiens.py
+++ b/scripts/workers/cdr_adapters/netsapiens.py
@ -0,0 +1,111 @@
+"""NetSapiens CDRv2 adapter (NDJSON / JSON array).
+
+NetSapiens emits CDRs either as a JSON array (via the ``/cdr`` REST
+endpoint with paginated pages) or as newline-delimited JSON (via
+streaming export). Both shapes use the same record schema; this adapter
+accepts either.
+
+Key fields (NetSapiens CDRv2):
+
+    orig_from_uri, orig_to_uri               -> caller / called SIP URIs
+    orig_callid, term_callid                 -> two call-legs (we stitch on
+                                               the term_callid where present)
+    time_start, time_answer, time_release    -> timestamps (ISO-8601)
+    duration                                 -> seconds on the billed leg
+    charge / cost / rate                     -> per-call revenue
+    orig_sub, term_sub                       -> subscriber identifiers
+    orig_carrier, term_carrier               -> trunk/carrier IDs
+    release_code                             -> disposition
+
+Natural key: NetSapiens distinguishes orig and term call legs.  We use
+``term_callid`` when present (the billed leg), falling back to
+``orig_callid``. That dedups the two-leg SBC emission cleanly.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Iterator
+
+from .base import BaseCDRAdapter, CDRRow, ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+class NetSapiensAdapter(BaseCDRAdapter):
+    """Adapter for NetSapiens CDRv2 exports (JSON array or newline-delimited JSON)."""
+
+    FORMAT_SLUG = "netsapiens"
+
+    @staticmethod
+    def _text(value) -> str:
+        """Return ``value`` as stripped text; ``None`` becomes ``""``.
+
+        JSON may deliver ids and codes as numbers, so values are coerced
+        with ``str()`` instead of assuming they already are strings.
+        """
+        return "" if value is None else str(value).strip()
+
+    @staticmethod
+    def _iter_records(fh):
+        """Yield decoded JSON records from an open text file.
+
+        The file is treated as a JSON array when its first non-whitespace
+        character is ``[``, otherwise as NDJSON (blank lines skipped).
+
+        Raises:
+            ValidationError: ``unparseable_row`` when the array or a line
+                is not valid JSON, so malformed input surfaces with the
+                same exception type as every other bad row.
+        """
+        lead = fh.read(1)
+        while lead and lead.isspace():
+            lead = fh.read(1)
+        fh.seek(0)
+
+        if lead == "[":
+            try:
+                payload = json.load(fh)
+            except ValueError as exc:   # json.JSONDecodeError subclasses ValueError
+                raise ValidationError("unparseable_row", f"invalid JSON array: {exc}") from exc
+            yield from payload
+            return
+
+        for line_no, line in enumerate(fh, start=1):
+            if not line.strip():
+                continue
+            try:
+                record = json.loads(line)
+            except ValueError as exc:
+                raise ValidationError(
+                    "unparseable_row", f"invalid JSON on line {line_no}: {exc}"
+                ) from exc
+            yield record
+
+    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
+        """Yield one validated ``CDRRow`` per record in ``local_path``.
+
+        Raises:
+            ValidationError: for malformed JSON, a non-object record, a
+                parsing/validation failure, or ``unparseable_row`` wrapping
+                any other per-record exception.
+        """
+        # utf-8-sig decodes plain UTF-8 too, and drops a leading BOM that
+        # would otherwise hide the "[" and break JSON decoding.
+        with open(local_path, "r", encoding="utf-8-sig", errors="replace") as fh:
+            text = self._text
+            for i, record in enumerate(self._iter_records(fh), start=1):
+                try:
+                    if not isinstance(record, dict):
+                        raise ValidationError(
+                            "unparseable_row",
+                            f"expected a JSON object, got {type(record).__name__}",
+                        )
+                    start = self.parse_ts(record.get("time_start") or record.get("start_time"))
+                    duration = self.parse_duration(record.get("duration") or 0)
+                    caller = _extract_uri_number(record.get("orig_from_uri") or record.get("from_uri"))
+                    called = _extract_uri_number(record.get("orig_to_uri") or record.get("to_uri"))
+                    billed = None
+                    for col in ("charge", "cost", "total_charge"):
+                        if record.get(col) not in (None, ""):
+                            billed = self.parse_cents(record[col])
+                            if billed is not None:
+                                break
+                    # Prefer term_callid (billed leg) as the natural key —
+                    # collapses ingress+egress legs of a single call. Blank
+                    # ids fall through to the next candidate.
+                    nkey = (
+                        text(record.get("term_callid"))
+                        or text(record.get("orig_callid"))
+                        or f"{caller}|{called}|{start.isoformat()}|{duration}"
+                    )
+
+                    row = CDRRow(
+                        start_time=start,
+                        caller_number=caller,
+                        called_number=called,
+                        duration_sec=duration,
+                        billed_amount_cents=billed,
+                        billed_currency=("USD" if billed is not None else None),
+                        trunk_group_id=(
+                            text(record.get("term_carrier"))
+                            or text(record.get("orig_carrier"))
+                            or None
+                        ),
+                        customer_account_id=(
+                            text(record.get("orig_sub"))
+                            or text(record.get("term_sub"))
+                            or None
+                        ),
+                        disposition=text(record.get("release_code")).lower() or None,
+                        call_direction=text(record.get("direction")).lower() or None,
+                        natural_key=nkey,
+                        source_file=local_path,
+                        source_row=i,
+                        raw=record,
+                    )
+                    self.validate_row(row)
+                    yield row
+                except ValidationError:
+                    raise
+                except Exception as exc:
+                    raise ValidationError("unparseable_row", str(exc)) from exc
+
+
+def _extract_uri_number(uri: str | None) -> str:
+    """Pull the user portion out of a SIP/TEL URI.
+
+    Handles bare numbers, ``sip:``/``sips:``/``tel:`` URIs (scheme matched
+    case-insensitively) and the name-addr form ``"Name" <sip:user@host>``.
+    The host part and any ``;parameter`` suffix are dropped.
+
+    Args:
+        uri: URI or plain number; ``None``/empty yields ``""``.
+
+    Returns:
+        The user/number part, stripped of surrounding whitespace.
+    """
+    if not uri:
+        return ""
+    s = str(uri).strip()
+    # name-addr form: keep only what is inside the angle brackets
+    if "<" in s:
+        s = s.split("<", 1)[1].split(">", 1)[0].strip()
+    # Drop a recognised scheme; any other "x:y" prefix is left untouched.
+    scheme, sep, rest = s.partition(":")
+    if sep and scheme.lower() in ("sip", "sips", "tel"):
+        s = rest
+    if "@" in s:
+        s = s.split("@", 1)[0]
+    # Strip parameters like ";user=phone"
+    if ";" in s:
+        s = s.split(";", 1)[0]
+    return s.strip()