# new-site/scripts/workers/cdr_adapters/base.py

"""Base CDR adapter — shared interface + normalized row type + validation.

All format adapters inherit from ``BaseCDRAdapter`` and implement
``iter_rows()``. The ingester sees only this interface, so the classifier
and quarantine logic are adapter-agnostic.
"""

from __future__ import annotations

import hashlib
import logging
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import Iterator, Optional

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class ValidationError(Exception):
    """Signal that a single CDR row is structurally unusable.

    Adapters raise this; the ingester is expected to catch it and store the
    row in ``cdr_quarantine`` tagged with ``reason_code``.

    Attributes:
        reason_code: Short machine-readable code for the failure.
        detail: Free-form human-readable explanation (may be empty).
    """

    def __init__(self, reason_code: str, detail: str = ""):
        # Keep the structured parts available to callers as attributes.
        self.reason_code = reason_code
        self.detail = detail
        # The exception message is always "<reason_code>: <detail>".
        message = "{}: {}".format(reason_code, detail)
        super().__init__(message)


@dataclass
class CDRRow:
    """One call record in adapter-independent form, before classification."""

    # --- Mandatory: every adapter has to populate these -----------------
    start_time: datetime
    caller_number: str
    called_number: str
    duration_sec: int

    # --- Revenue data: strongly preferred (revenue-first attribution) ---
    billed_amount_cents: Optional[int] = None
    billed_currency: Optional[str] = None

    # --- Optional extras that improve bucketing / accuracy --------------
    call_direction: Optional[str] = None       # inbound|outbound
    disposition: Optional[str] = None           # answered|no_answer|busy|failed
    trunk_group_id: Optional[str] = None
    customer_account_id: Optional[str] = None
    customer_type_override: Optional[str] = None  # per-row wholesale/retail tag

    # --- Provenance: filled in by the adapter ---------------------------
    natural_key: str = ""                       # adapter-specific uniqueness key
    source_file: Optional[str] = None
    source_row: Optional[int] = None

    # --- Original payload, kept so quarantined rows can be re-processed -
    raw: dict = field(default_factory=dict)

    def natural_key_hash(self, profile_id: int) -> str:
        """Return the hex SHA-1 of ``"<profile_id>|<natural_key>"``.

        The value is stable for a given profile and natural key, and serves
        as the dedup key in cdr_calls.
        """
        hasher = hashlib.sha1()
        hasher.update(f"{profile_id}|{self.natural_key}".encode("utf-8"))
        return hasher.hexdigest()

    def to_db_tuple(self, profile_id: int) -> dict:
        """Return all fields as a dict plus ``profile_id`` and ``natural_key_hash``.

        Despite the name, the result is a dict; the two extra keys are
        appended after the dataclass fields.
        """
        record = asdict(self)
        record["profile_id"] = profile_id
        record["natural_key_hash"] = self.natural_key_hash(profile_id)
        return record


class BaseCDRAdapter:
    """Abstract adapter. Subclasses implement ``iter_rows()``.

    Besides the abstract ``iter_rows()`` hook, this class provides the shared
    structural row check (``validate_row``) and static parsing helpers for
    durations, money amounts and timestamps.
    """

    FORMAT_SLUG: str = ""              # matches cdr_ingestion_profiles.format
    REQUIRED_FIELDS: tuple[str, ...] = ("start_time", "caller_number", "called_number", "duration_sec")

    def __init__(self, profile_config: Optional[dict] = None):
        """Store the per-profile configuration (an empty dict when omitted)."""
        self.profile_config = profile_config or {}

    # ------------------------------------------------------------------
    # Abstract
    # ------------------------------------------------------------------

    def iter_rows(self, local_path: str) -> Iterator[CDRRow]:
        """Yield one CDRRow per call record in the file."""
        raise NotImplementedError

    # ------------------------------------------------------------------
    # Shared row validation — used by every subclass before yield
    # ------------------------------------------------------------------

    def validate_row(self, row: CDRRow) -> None:
        """Raise ValidationError if the row fails structural checks.

        Checks, in order: ``start_time`` is a ``datetime``; at least one of
        caller/called number is present; ``duration_sec`` is present and
        within 0..86400 seconds; the ``start_time`` year lies between 2000
        and next year (UTC clock).

        Raises:
            ValidationError: with reason code ``missing_start_time``,
                ``missing_endpoints``, ``missing_duration``, ``bad_duration``
                or ``bad_start_time``.
        """
        if not isinstance(row.start_time, datetime):
            raise ValidationError("missing_start_time", "start_time absent or unparseable")
        if not row.caller_number and not row.called_number:
            # We need at least one endpoint to classify. Some inbound-only
            # switches omit the caller; that's fine as long as called is set.
            raise ValidationError("missing_endpoints", "neither caller nor called number present")
        if row.duration_sec is None:
            raise ValidationError("missing_duration", "duration_sec absent")
        if row.duration_sec < 0:
            raise ValidationError("bad_duration", f"negative duration {row.duration_sec}")
        if row.duration_sec > 86400:
            raise ValidationError("bad_duration", f"duration > 24h: {row.duration_sec}")
        # Sanity: start_time within a reasonable window. Only the calendar
        # year is compared, so naive and aware datetimes are handled alike
        # (the year attribute ignores tzinfo).
        current_year = datetime.now(timezone.utc).year
        if row.start_time.year < 2000 or row.start_time.year > current_year + 1:
            raise ValidationError(
                "bad_start_time",
                f"start_time out of range: {row.start_time.isoformat()}",
            )

    # ------------------------------------------------------------------
    # Helpers for subclasses
    # ------------------------------------------------------------------

    @staticmethod
    def parse_duration(value) -> int:
        """Normalize duration values (seconds, ms, or H:MM:SS) to whole seconds.

        * ``None`` and empty / whitespace-only strings yield ``0``.
        * ``int`` / ``float`` inputs are truncated to ``int`` as-is.
        * Colon-separated strings are read as ``[H:]MM:SS``; the seconds part
          may carry a fraction, which is truncated.
        * Other strings are parsed as a number; values above 100 000 are
          assumed to be milliseconds and divided by 1000.

        Raises:
            ValidationError: ``bad_duration`` when the value cannot be parsed
                (including NaN / infinity and more than three colon fields).
        """
        if value is None or value == "":
            return 0
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError) as exc:
                # int(nan) raises ValueError, int(inf) raises OverflowError.
                raise ValidationError("bad_duration", f"unparseable duration: {value}") from exc
        s = str(value).strip()
        if not s:
            # Whitespace-only is treated like the empty string above.
            return 0
        if ":" in s:
            pieces = s.split(":")
            if len(pieces) > 3:
                # Refuse rather than silently dropping trailing components.
                raise ValidationError("bad_duration", f"unparseable duration: {s}")
            *leading, last = pieces
            try:
                # Seconds may be fractional ("0:01:30.5"); truncate like the
                # plain-number path does.
                parts = [int(p) for p in leading] + [int(float(last))]
            except (ValueError, OverflowError) as exc:
                raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc
            # Left-pad so "MM:SS" becomes 0:MM:SS.
            parts = [0] * (3 - len(parts)) + parts
            h, m, sec = parts
            return h * 3600 + m * 60 + sec
        try:
            v = float(s)
            # Heuristic: > 100k usually means milliseconds
            if v > 100_000:
                return int(v / 1000)
            return int(v)
        except (ValueError, OverflowError) as exc:
            raise ValidationError("bad_duration", f"unparseable duration: {s}") from exc

    @staticmethod
    def parse_cents(value, *, currency: str = "USD") -> Optional[int]:
        """Turn a revenue amount string into integer cents. Returns None if unparseable.

        Thousands separators (``,``) and ``$`` signs are stripped before
        parsing; the amount is multiplied by 100 and rounded.

        Args:
            value: Amount as a string or number; ``None`` / empty yields ``None``.
            currency: Accepted for interface compatibility; not used by the
                conversion here (every amount is scaled by 100).

        Returns:
            The amount in integer cents, or ``None`` when the value is empty,
            not numeric, or not finite (NaN / infinity).
        """
        if value is None or value == "":
            return None
        s = str(value).replace(",", "").replace("$", "").strip()
        if not s:
            return None
        try:
            # round() raises ValueError for NaN and OverflowError for
            # infinity; both count as "unparseable".
            return int(round(float(s) * 100))
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def parse_ts(value, fmt: Optional[str] = None) -> datetime:
        """Parse a timestamp. Accepts ISO-8601, common CDR formats, or Unix epoch.

        Resolution order:

        1. ``datetime`` instances are returned unchanged.
        2. The caller-supplied ``fmt`` (if any). It is tried first so that
           all-digit layouts such as ``%Y%m%d%H%M%S`` are not mistaken for
           epoch values.
        3. All-digit strings as Unix epoch seconds (or milliseconds when the
           value exceeds 10**12), returned as naive UTC.
        4. A fixed list of common layouts; ``%m/%d/%Y`` is tried before
           ``%d/%m/%Y``, so ambiguous slash dates resolve month-first.
        5. ``datetime.fromisoformat`` (with ``Z`` mapped to ``+00:00``), which
           may return a timezone-aware value when the input has an offset.

        Raises:
            ValidationError: ``missing_start_time`` when the value is empty or
                matches none of the above.
        """
        if isinstance(value, datetime):
            return value
        s = "" if value is None else str(value).strip()
        if not s:
            raise ValidationError("missing_start_time", "empty timestamp")
        # An explicit format is the strongest signal — honour it first.
        if fmt:
            try:
                return datetime.strptime(s, fmt)
            except ValueError:
                pass
        # Unix epoch
        if s.isdigit():
            try:
                epoch = int(s)
                if epoch > 10**12:         # ms
                    epoch = epoch // 1000
                # Aware conversion, then drop tzinfo to keep the naive-UTC
                # result callers previously received.
                return datetime.fromtimestamp(epoch, tz=timezone.utc).replace(tzinfo=None)
            except (ValueError, OverflowError, OSError):
                # Out-of-range epoch: fall through to the other parsers.
                pass
        # Try several common formats
        for trial in (
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%Y-%m-%dT%H:%M:%S.%f",
            "%Y-%m-%dT%H:%M:%S.%fZ",
            "%m/%d/%Y %H:%M:%S",
            "%d/%m/%Y %H:%M:%S",
        ):
            try:
                return datetime.strptime(s, trial)
            except ValueError:
                continue
        # Finally, try Python's fromisoformat
        try:
            return datetime.fromisoformat(s.replace("Z", "+00:00"))
        except ValueError as exc:
            raise ValidationError("missing_start_time", f"unparseable timestamp {s!r}") from exc