# Module: audit_data_processing/src/audit_data_processing/core.py
#
# Provenance: recovered from the patch
#   cbf9a22a8d1321e0a4cc2fdc75b997a694cb4340
#   "Add audit_data_processing/src/audit_data_processing/core.py"
#   (Mika, Thu, 5 Feb 2026 13:42:01 +0000).
#
# Data model for a single audit run plus percentile summaries of its
# warn/unknown rates.
from __future__ import annotations

import logging
import math
import numbers
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

__all__ = ["RunData", "calculate_percentiles"]

logger = logging.getLogger(__name__)

# Metrics summarised by calculate_percentiles, in output order.
_METRICS = ("warn_rate", "unknown_rate")

# String spellings that must be read as False; bool("false") would be True.
_FALSE_TOKENS = frozenset({"", "0", "false", "f", "no", "n", "off"})


def _parse_flag(value: Any) -> bool:
    """Interpret *value* as a boolean, honouring textual false spellings."""
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_TOKENS
    return bool(value)


def _resolve_level(level: Any) -> Tuple[str, float]:
    """Translate one requested level into ``(result key, quantile in [0, 1])``.

    Levels in ``[0, 1]`` are read as fractions, anything above 1 as a
    percentage. The key is always derived from the effective percentile so
    that ``0.5`` and ``50`` both yield ``"p50"`` and distinct levels never
    collapse onto the same key.

    Raises:
        TypeError: If *level* is not a real number.
        ValueError: If *level* is NaN or outside ``[0, 100]``.
    """
    if not isinstance(level, numbers.Real):
        raise TypeError(f"Perzentil-Level muss numerisch sein: {level!r}")

    value = float(level)
    if math.isnan(value) or not 0.0 <= value <= 100.0:
        raise ValueError(
            f"Perzentil-Level außerhalb des gültigen Bereichs [0, 100]: {level!r}"
        )

    if value <= 1.0:
        # Keep the caller's fraction untouched for the quantile itself; the
        # scaled value is only used for the label.
        quantile, percent = value, value * 100.0
    else:
        quantile, percent = value / 100.0, value

    # Rounding removes float noise such as 0.07 * 100 == 7.000000000000001.
    percent = round(percent, 6)
    label = f"p{int(percent)}" if percent.is_integer() else f"p{percent}"
    return label, quantile


@dataclass
class RunData:
    """Data model for a single audit run."""

    run_id: str
    warn_rate: float
    unknown_rate: float
    pinned: bool

    @classmethod
    def from_dict(cls, row: Dict[str, Any]) -> "RunData":
        """Build a ``RunData`` from a mapping such as a parsed CSV/JSON row.

        Args:
            row: Mapping with the keys ``run_id`` (required), ``warn_rate``
                and ``unknown_rate`` (default ``0.0`` when absent) and
                ``pinned`` (default ``False`` when absent). String values of
                ``pinned`` such as ``"false"``, ``"0"`` or ``"no"`` are read
                as ``False``; every other value uses normal truthiness.

        Returns:
            The populated ``RunData`` instance.

        Raises:
            ValueError: If ``run_id`` is missing/``None`` or a rate cannot be
                converted to ``float``.
        """
        run_id = row.get("run_id")
        if run_id is None:
            # str(None) would silently create a run called "None".
            raise ValueError(f"Ungültige RunData-Zeile (run_id fehlt): {row}")

        try:
            warn_rate = float(row.get("warn_rate", 0.0))
            unknown_rate = float(row.get("unknown_rate", 0.0))
        except (ValueError, TypeError) as e:
            raise ValueError(f"Ungültige RunData-Zeile: {row}") from e

        return cls(
            run_id=str(run_id),
            warn_rate=warn_rate,
            unknown_rate=unknown_rate,
            pinned=_parse_flag(row.get("pinned")),
        )


def calculate_percentiles(
    data: List[RunData],
    percentile_levels: List[float],
) -> Dict[str, Dict[str, Optional[float]]]:
    """Compute the requested percentiles of ``warn_rate`` and ``unknown_rate``.

    Args:
        data: Non-empty list of ``RunData`` objects.
        percentile_levels: Non-empty list of levels. Values in ``[0, 1]`` are
            treated as fractions (``0.9`` is the 90th percentile), values in
            ``(1, 100]`` as percentages (``90``).

    Returns:
        A dictionary of the form::

            {
                "warn_rate": {"p50": ..., "p75": ...},
                "unknown_rate": {"p50": ..., ...},
            }

        Keys name the effective percentile (``"p50"``, ``"p99.9"``). Values
        are rounded to six decimals, or ``None`` when a metric has no
        non-NaN observations.

    Raises:
        ValueError: If ``data`` or ``percentile_levels`` is empty, or a level
            is NaN or outside ``[0, 100]``.
        TypeError: If ``data`` contains non-``RunData`` items, a level is not
            numeric, or a metric column holds non-numeric values.
    """
    if not data:
        raise ValueError("Die Eingabedatenliste ist leer.")

    if not all(isinstance(rd, RunData) for rd in data):
        raise TypeError("Alle Elemente in 'data' müssen Instanzen von RunData sein.")

    if not percentile_levels:
        raise ValueError("Keine Perzentil-Level angegeben.")

    # Validate and label every level once, before touching the data, so bad
    # input fails fast with a clear message instead of inside pandas.
    resolved = [_resolve_level(level) for level in percentile_levels]

    frame = pd.DataFrame(
        [{metric: getattr(run, metric) for metric in _METRICS} for run in data]
    )

    # Check both columns up front: a dataclass does not enforce field types,
    # so a RunData built by hand may carry e.g. strings.
    for metric in _METRICS:
        if not pd.api.types.is_numeric_dtype(frame[metric]):
            raise TypeError(f"Spalte {metric} enthält nicht-numerische Werte.")

    summary: Dict[str, Dict[str, Optional[float]]] = {}

    for metric in _METRICS:
        logger.debug("Berechne Perzentile für %s.", metric)
        values = frame[metric].dropna()
        if values.empty:
            # Only NaN observations: report every level as unavailable.
            summary[metric] = {label: None for label, _ in resolved}
            continue
        try:
            summary[metric] = {
                label: round(float(values.quantile(quantile)), 6)
                for label, quantile in resolved
            }
        except Exception as e:
            # Log which metric failed, then let the caller see the error.
            logger.error("Fehler bei Perzentilberechnung für %s: %s", metric, e)
            raise

    return summary