"""Base CDR adapter — shared interface + normalized row type + validation. All format adapters inherit from ``BaseCDRAdapter`` and implement ``iter_rows()``. The ingester sees only this interface, so the classifier and quarantine logic are adapter-agnostic. """ from __future__ import annotations import hashlib import logging from dataclasses import dataclass, field, asdict from datetime import datetime from typing import Iterator, Optional logger = logging.getLogger(__name__) class ValidationError(Exception): """Raised by an adapter when a row fails structural validation. The ingester catches this and routes the offending row into ``cdr_quarantine`` with ``reason_code``. """ def __init__(self, reason_code: str, detail: str = ""): super().__init__(f"{reason_code}: {detail}") self.reason_code = reason_code self.detail = detail @dataclass class CDRRow: """Normalized single-call CDR row, pre-classification.""" # Required fields (adapters must set) start_time: datetime caller_number: str called_number: str duration_sec: int # Strongly preferred (revenue-first attribution) billed_amount_cents: Optional[int] = None billed_currency: Optional[str] = None # Optional — improves bucketing / accuracy call_direction: Optional[str] = None # inbound|outbound disposition: Optional[str] = None # answered|no_answer|busy|failed trunk_group_id: Optional[str] = None customer_account_id: Optional[str] = None customer_type_override: Optional[str] = None # per-row wholesale/retail tag # Provenance — set by the adapter natural_key: str = "" # adapter-specific uniqueness key source_file: Optional[str] = None source_row: Optional[int] = None # Raw payload for quarantine re-processing raw: dict = field(default_factory=dict) def natural_key_hash(self, profile_id: int) -> str: """Stable SHA-1 hash used as the dedup key in cdr_calls.""" basis = f"{profile_id}|{self.natural_key}" return hashlib.sha1(basis.encode("utf-8")).hexdigest() def to_db_tuple(self, profile_id: int) -> dict: return { **asdict(self), "profile_id": profile_id, "natural_key_hash": self.natural_key_hash(profile_id), } class BaseCDRAdapter: """Abstract adapter. Subclasses implement ``iter_rows()``.""" FORMAT_SLUG: str = "" # matches cdr_ingestion_profiles.format REQUIRED_FIELDS: tuple[str, ...] = ("start_time", "caller_number", "called_number", "duration_sec") def __init__(self, profile_config: Optional[dict] = None): self.profile_config = profile_config or {} # ------------------------------------------------------------------ # Abstract # ------------------------------------------------------------------ def iter_rows(self, local_path: str) -> Iterator[CDRRow]: """Yield one CDRRow per call record in the file.""" raise NotImplementedError # ------------------------------------------------------------------ # Shared row validation — used by every subclass before yield # ------------------------------------------------------------------ def validate_row(self, row: CDRRow) -> None: """Raise ValidationError if the row fails structural checks.""" if not isinstance(row.start_time, datetime): raise ValidationError("missing_start_time", "start_time absent or unparseable") if not row.caller_number and not row.called_number: # We need at least one endpoint to classify. Some inbound-only # switches omit the caller; that's fine as long as called is set. raise ValidationError("missing_endpoints", "neither caller nor called number present") if row.duration_sec is None: raise ValidationError("missing_duration", "duration_sec absent") if row.duration_sec < 0: raise ValidationError("bad_duration", f"negative duration {row.duration_sec}") if row.duration_sec > 86400: raise ValidationError("bad_duration", f"duration > 24h: {row.duration_sec}") # Sanity: start_time within a reasonable window now = datetime.utcnow() if row.start_time.tzinfo is not None: # Strip tz for the comparison start_naive = row.start_time.replace(tzinfo=None) else: start_naive = row.start_time if start_naive.year < 2000 or start_naive.year > now.year + 1: raise ValidationError( "bad_start_time", f"start_time out of range: {row.start_time.isoformat()}", ) # ------------------------------------------------------------------ # Helpers for subclasses # ------------------------------------------------------------------ @staticmethod def parse_duration(value) -> int: """Normalize duration values (seconds, ms, or H:MM:SS).""" if value is None or value == "": return 0 if isinstance(value, (int, float)): return int(value) s = str(value).strip() if ":" in s: parts = [int(p) for p in s.split(":")] while len(parts) < 3: parts.insert(0, 0) h, m, sec = parts[:3] return h * 3600 + m * 60 + sec try: v = float(s) except ValueError as exc: raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc # Heuristic: > 100k usually means milliseconds if v > 100_000: return int(v / 1000) return int(v) @staticmethod def parse_cents(value, *, currency: str = "USD") -> Optional[int]: """Turn a revenue amount string into integer cents. Returns None if unparseable.""" if value is None or value == "": return None s = str(value).replace(",", "").replace("$", "").strip() if not s: return None try: f = float(s) except ValueError: return None return int(round(f * 100)) @staticmethod def parse_ts(value, fmt: Optional[str] = None) -> datetime: """Parse a timestamp. Accepts ISO-8601, common CDR formats, or Unix epoch.""" if isinstance(value, datetime): return value s = str(value).strip() if not s: raise ValidationError("missing_start_time", "empty timestamp") # Unix epoch if s.isdigit(): try: epoch = int(s) if epoch > 10**12: # ms epoch = epoch // 1000 return datetime.utcfromtimestamp(epoch) except (ValueError, OverflowError): pass if fmt: try: return datetime.strptime(s, fmt) except ValueError: pass # Try several common formats for trial in ( "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%fZ", "%m/%d/%Y %H:%M:%S", "%d/%m/%Y %H:%M:%S", ): try: return datetime.strptime(s, trial) except ValueError: continue # Finally, try Python's fromisoformat try: return datetime.fromisoformat(s.replace("Z", "+00:00")) except ValueError as exc: raise ValidationError("missing_start_time", f"unparseable timestamp {s!r}") from exc