Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
202 lines
7.5 KiB
Python
202 lines
7.5 KiB
Python
"""Base CDR adapter — shared interface + normalized row type + validation.
|
|
|
|
All format adapters inherit from ``BaseCDRAdapter`` and implement
|
|
``iter_rows()``. The ingester sees only this interface, so the classifier
|
|
and quarantine logic are adapter-agnostic.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime
|
|
from typing import Iterator, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ValidationError(Exception):
    """Raised by an adapter when a row fails structural validation.

    The ingester catches this and routes the offending row into
    ``cdr_quarantine`` with ``reason_code``.

    Attributes:
        reason_code: Short machine-readable failure code.
        detail: Free-form explanation of the failure (may be empty).
    """

    def __init__(self, reason_code: str, detail: str = ""):
        # The exception message is always "<reason_code>: <detail>".
        super().__init__(f"{reason_code}: {detail}")
        self.reason_code = reason_code
        self.detail = detail

    def __reduce__(self):
        """Rebuild from the original fields when copied or pickled.

        The default ``Exception`` reduction replays ``self.args`` (the
        already formatted message) into ``__init__``, which would format it
        a second time and yield "<code>: <detail>: ".
        """
        return (type(self), (self.reason_code, self.detail))
|
|
|
|
|
|
@dataclass
class CDRRow:
    """Normalized single-call CDR row, pre-classification."""

    # Required fields (adapters must set)
    start_time: datetime
    caller_number: str
    called_number: str
    duration_sec: int

    # Strongly preferred (revenue-first attribution)
    billed_amount_cents: Optional[int] = None
    billed_currency: Optional[str] = None

    # Optional — improves bucketing / accuracy
    call_direction: Optional[str] = None  # inbound|outbound
    disposition: Optional[str] = None  # answered|no_answer|busy|failed
    trunk_group_id: Optional[str] = None
    customer_account_id: Optional[str] = None
    customer_type_override: Optional[str] = None  # per-row wholesale/retail tag

    # Provenance — set by the adapter
    natural_key: str = ""  # adapter-specific uniqueness key
    source_file: Optional[str] = None
    source_row: Optional[int] = None

    # Raw payload for quarantine re-processing
    raw: dict = field(default_factory=dict)

    def natural_key_hash(self, profile_id: int) -> str:
        """Return the stable SHA-1 hash used as the dedup key in cdr_calls.

        The hash covers ``"<profile_id>|<natural_key>"``. When the adapter
        left ``natural_key`` empty, a key derived from the required call
        fields is used instead; otherwise every such row of a profile would
        share one hash and be treated as duplicates of each other.

        Args:
            profile_id: Ingestion profile the row belongs to.

        Returns:
            40-character lowercase hexadecimal SHA-1 digest.
        """
        key = self.natural_key or self._fallback_natural_key()
        basis = f"{profile_id}|{key}"
        return hashlib.sha1(basis.encode("utf-8")).hexdigest()

    def _fallback_natural_key(self) -> str:
        """Build a deterministic key from the required call fields."""
        start = self.start_time
        # start_time may not be a datetime on rows that have not been validated.
        stamp = start.isoformat() if isinstance(start, datetime) else str(start)
        return "|".join(
            (
                "auto",
                stamp,
                str(self.caller_number),
                str(self.called_number),
                str(self.duration_sec),
            )
        )

    def to_db_tuple(self, profile_id: int) -> dict:
        """Return every field as a dict plus ``profile_id`` and ``natural_key_hash``.

        Despite the name, the result is a ``dict`` keyed by field name.
        """
        return {
            **asdict(self),
            "profile_id": profile_id,
            "natural_key_hash": self.natural_key_hash(profile_id),
        }
|
|
|
|
|
|
class BaseCDRAdapter:
    """Abstract adapter. Subclasses implement ``iter_rows()``."""

    FORMAT_SLUG: str = ""  # matches cdr_ingestion_profiles.format
    REQUIRED_FIELDS: tuple[str, ...] = ("start_time", "caller_number", "called_number", "duration_sec")

    def __init__(self, profile_config: Optional[dict] = None):
        """Store the per-profile configuration (an empty dict when omitted)."""
        self.profile_config = profile_config or {}

    # ------------------------------------------------------------------
    # Abstract
    # ------------------------------------------------------------------

    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
        """Yield one CDRRow per call record in the file."""
        raise NotImplementedError

    # ------------------------------------------------------------------
    # Shared row validation — used by every subclass before yield
    # ------------------------------------------------------------------

    def validate_row(self, row: CDRRow) -> None:
        """Raise ValidationError if the row fails structural checks.

        Checks, in order: ``start_time`` is a datetime, at least one
        endpoint number is present, ``duration_sec`` is a number within
        0..86400 seconds, and the ``start_time`` year lies between 2000 and
        next calendar year (UTC).

        Raises:
            ValidationError: with reason code ``missing_start_time``,
                ``missing_endpoints``, ``missing_duration``,
                ``bad_duration`` or ``bad_start_time``.
        """
        # Local import: the module-level import only brings in ``datetime``.
        from datetime import timezone

        if not isinstance(row.start_time, datetime):
            raise ValidationError("missing_start_time", "start_time absent or unparseable")
        if not row.caller_number and not row.called_number:
            # We need at least one endpoint to classify. Some inbound-only
            # switches omit the caller; that's fine as long as called is set.
            raise ValidationError("missing_endpoints", "neither caller nor called number present")
        if row.duration_sec is None:
            raise ValidationError("missing_duration", "duration_sec absent")
        if not isinstance(row.duration_sec, (int, float)):
            # Without this guard the comparisons below raise TypeError,
            # which is not a ValidationError.
            raise ValidationError("bad_duration", f"non-numeric duration {row.duration_sec!r}")
        if row.duration_sec < 0:
            raise ValidationError("bad_duration", f"negative duration {row.duration_sec}")
        if row.duration_sec > 86400:
            raise ValidationError("bad_duration", f"duration > 24h: {row.duration_sec}")
        # Sanity: start_time within a reasonable window. Only the year is
        # compared, so any tzinfo on start_time does not affect the result.
        current_year = datetime.now(timezone.utc).year
        start_year = row.start_time.year
        if start_year < 2000 or start_year > current_year + 1:
            raise ValidationError(
                "bad_start_time",
                f"start_time out of range: {row.start_time.isoformat()}",
            )

    # ------------------------------------------------------------------
    # Helpers for subclasses
    # ------------------------------------------------------------------

    @staticmethod
    def parse_duration(value) -> int:
        """Normalize duration values (seconds, ms, or H:MM:SS).

        ``None`` and ``""`` yield 0. Numbers are truncated to int. Strings
        containing ``:`` are read as ``[H:]MM:SS`` with integer parts; only
        the first three parts are used. Other strings are parsed as a
        number, and values above 100000 are treated as milliseconds.

        Raises:
            ValidationError: ``bad_duration`` when the value cannot be
                converted to a whole number of seconds.
        """
        if value is None or value == "":
            return 0
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError) as exc:
                # int() rejects NaN and infinity.
                raise ValidationError("bad_duration", f"unparseable duration: {value}") from exc
        s = str(value).strip()
        if ":" in s:
            try:
                parts = [int(p) for p in s.split(":")]
            except ValueError as exc:
                raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc
            # Left-pad so "MM:SS" is read as 0 hours.
            while len(parts) < 3:
                parts.insert(0, 0)
            h, m, sec = parts[:3]
            return h * 3600 + m * 60 + sec
        try:
            v = float(s)
            # Heuristic: > 100k usually means milliseconds
            return int(v / 1000) if v > 100_000 else int(v)
        except (ValueError, OverflowError) as exc:
            raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc

    @staticmethod
    def parse_cents(value, *, currency: str = "USD") -> Optional[int]:
        """Turn a revenue amount string into integer cents. Returns None if unparseable.

        Commas and ``$`` signs are removed before parsing. ``currency`` is
        accepted but not used by this implementation.
        """
        if value is None or value == "":
            return None
        s = str(value).replace(",", "").replace("$", "").strip()
        if not s:
            return None
        try:
            return int(round(float(s) * 100))
        except (ValueError, OverflowError):
            # ValueError: not a number, or NaN; OverflowError: infinity.
            return None

    @staticmethod
    def parse_ts(value, fmt: Optional[str] = None) -> datetime:
        """Parse a timestamp. Accepts ISO-8601, common CDR formats, or Unix epoch.

        Order of attempts: the caller's ``fmt`` (if given), Unix epoch for
        all-digit strings (values above 10**12 are taken as milliseconds),
        a fixed list of common formats, then ``datetime.fromisoformat``.
        Epoch values are returned as naive UTC datetimes.

        Args:
            value: A datetime (returned unchanged) or a value whose string
                form is a timestamp.
            fmt: Optional ``strptime`` format tried before anything else.

        Raises:
            ValidationError: ``missing_start_time`` when the value is empty
                or matches none of the accepted forms.
        """
        # Local import: the module-level import only brings in ``datetime``.
        from datetime import timezone

        if isinstance(value, datetime):
            return value
        if value is None:
            raise ValidationError("missing_start_time", "empty timestamp")
        s = str(value).strip()
        if not s:
            raise ValidationError("missing_start_time", "empty timestamp")
        # An explicit format wins over every heuristic. Otherwise an
        # all-digit value such as "20240101120000" would be misread as an
        # epoch even though the caller told us its layout.
        if fmt:
            try:
                return datetime.strptime(s, fmt)
            except ValueError:
                pass
        # Unix epoch
        if s.isdigit():
            try:
                epoch = int(s)
                if epoch > 10**12:  # ms
                    epoch = epoch // 1000
                aware = datetime.fromtimestamp(epoch, tz=timezone.utc)
                return aware.replace(tzinfo=None)
            except (ValueError, OverflowError, OSError):
                # Out of range for the platform; fall through to the formats.
                pass
        # Try several common formats. Month-first is tried before day-first,
        # so ambiguous slash dates resolve as month/day/year.
        for trial in (
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%Y-%m-%dT%H:%M:%S.%f",
            "%Y-%m-%dT%H:%M:%S.%fZ",
            "%m/%d/%Y %H:%M:%S",
            "%d/%m/%Y %H:%M:%S",
        ):
            try:
                return datetime.strptime(s, trial)
            except ValueError:
                continue
        # Finally, try Python's fromisoformat
        try:
            return datetime.fromisoformat(s.replace("Z", "+00:00"))
        except ValueError as exc:
            raise ValidationError("missing_start_time", f"unparseable timestamp {s!r}") from exc
|