Add drift_data_collection/src/drift_data_collection/core.py

2026-01-29 16:23:30 +00:00 · 2026-01-29 16:23:30 +00:00 · 585d109adc
commit 585d109adc
parent e06b7cba46
1 changed files with 105 additions and 0 deletions
--- a/drift_data_collection/src/drift_data_collection/core.py
+++ b/drift_data_collection/src/drift_data_collection/core.py
@ -0,0 +1,105 @@
 import json
 import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 import requests
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 class DataCollectionError(Exception):
    """Custom exception for errors during data collection."""
 class FrozenRun:
    """Represents frozen CI run data for drift analysis."""
    def __init__(self, run_id: str, status: str, timestamp: datetime, pinned_state: bool, metrics: str) -> None:
        self.run_id = run_id
        self.status = status
        self.timestamp = timestamp
        self.pinned_state = pinned_state
        self.metrics = metrics
    def to_dict(self) -> Dict[str, Any]:
        return {
            "run_id": self.run_id,
            "status": self.status,
            "timestamp": self.timestamp.isoformat(),
            "pinned_state": self.pinned_state,
            "metrics": self.metrics,
        }
 def _validate_run_ids(run_ids: List[str]) -> None:
    if not isinstance(run_ids, list):
        raise ValueError("run_ids must be a list of strings.")
    for rid in run_ids:
        if not isinstance(rid, str) or not rid.strip():
            raise ValueError(f"Invalid run_id: {rid!r}")
 def _fetch_run_data(run_id: str) -> Optional[FrozenRun]:
    """Fetch run data; fallback to simulated local info if retrieval fails."""
    try:
        # Placeholder endpoint, could be replaced with local file lookup or API
        response = requests.get(f"https://ci.example.com/api/runs/{run_id}", timeout=5)
        if response.status_code != 200:
            logger.warning("Run %s returned HTTP %s", run_id, response.status_code)
            return None
        data = response.json()
        ts_str = data.get("timestamp")
        try:
            timestamp = datetime.fromisoformat(ts_str) if ts_str else datetime.utcnow()
        except ValueError:
            timestamp = datetime.utcnow()
        return FrozenRun(
            run_id=data.get("run_id", run_id),
            status=data.get("status", "UNKNOWN"),
            timestamp=timestamp,
            pinned_state=bool(data.get("pinned_state", False)),
            metrics=json.dumps(data.get("metrics", {})),
        )
    except requests.RequestException as e:
        logger.error("Network error collecting run %s: %s", run_id, e)
        return None
    except Exception as e:
        logger.exception("Unexpected error fetching run %s: %s", run_id, e)
        return None
 def collect_frozen_runs(run_ids: List[str]) -> List[Dict[str, Any]]:
    """Collect and aggregate frozen run data for given run IDs.
    Args:
        run_ids: List of run IDs to collect.
    Returns:
        List of dictionaries containing frozen run data.
    """
    _validate_run_ids(run_ids)
    collected: List[Dict[str, Any]] = []
    for rid in run_ids:
        run = _fetch_run_data(rid)
        if run:
            collected.append(run.to_dict())
        else:
            # Build default entry if missing
            logger.info("Using fallback data for run %s", rid)
            collected.append(
                FrozenRun(
                    run_id=rid,
                    status="MISSING",
                    timestamp=datetime.utcnow(),
                    pinned_state=False,
                    metrics="{}",
                ).to_dict()
            )
    assert all("run_id" in r for r in collected), "Integrity check failed: missing run_id"
    return collected