Add audit_data_processing/src/audit_data_processing/core.py
This commit is contained in:
commit
cbf9a22a8d
1 changed file with 91 additions and 0 deletions
91
audit_data_processing/src/audit_data_processing/core.py
Normal file
91
audit_data_processing/src/audit_data_processing/core.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Names exported by a star-import of this module.
__all__ = ["RunData", "calculate_percentiles"]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RunData:
    """Data model for a single audit run.

    The dataclass itself performs no type validation; use ``from_dict`` to
    build an instance from loosely typed row data.

    Attributes:
        run_id: Identifier of the run.
        warn_rate: The run's ``warn_rate`` value.
        unknown_rate: The run's ``unknown_rate`` value.
        pinned: The run's ``pinned`` flag.
    """

    run_id: str
    warn_rate: float
    unknown_rate: float
    pinned: bool

    @staticmethod
    def _parse_pinned(value: Any) -> bool:
        """Interpret a raw ``pinned`` value as a boolean.

        Plain ``bool(value)`` is wrong for textual and missing data: every
        non-empty string (including ``"false"`` or ``"0"``) and a float NaN
        are truthy. Strings are therefore compared against a small set of
        "false" spellings, and NaN is treated like a missing value.
        """
        if isinstance(value, str):
            return value.strip().lower() not in {"", "0", "false", "f", "no", "n", "off"}
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    @classmethod
    def from_dict(cls, row: Dict[str, Any]) -> "RunData":
        """Build an instance from a mapping of column name to raw value.

        Args:
            row: Mapping that may contain the keys ``run_id``, ``warn_rate``,
                ``unknown_rate`` and ``pinned``. Missing rates default to
                ``0.0``; a missing ``pinned`` value yields ``False``.

        Returns:
            The populated instance, with ``run_id`` converted via ``str``,
            both rates converted via ``float`` and ``pinned`` parsed as a
            boolean flag.

        Raises:
            ValueError: If a value cannot be converted to its target type
                (the original ``ValueError``/``TypeError`` is chained).
        """
        try:
            return cls(
                run_id=str(row.get("run_id")),
                warn_rate=float(row.get("warn_rate", 0.0)),
                unknown_rate=float(row.get("unknown_rate", 0.0)),
                pinned=cls._parse_pinned(row.get("pinned")),
            )
        except (ValueError, TypeError) as e:
            raise ValueError(f"Ungültige RunData-Zeile: {row}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_percentile(level: float) -> tuple:
    """Map one requested level to its ``(result key, quantile fraction)`` pair."""
    # Levels within [0, 1] are read as fractions, everything else as percent.
    is_fraction = 0 <= level <= 1
    percent = level * 100 if is_fraction else level
    if not 0 <= percent <= 100:
        raise ValueError(
            f"Perzentil-Level außerhalb des gültigen Bereichs [0, 100]: {level}"
        )
    fraction = level if is_fraction else level / 100.0
    # The key is derived from the effective percent so that fractional inputs
    # (0.5 -> "p50") and non-integer percents (99.5 -> "p99.5") get distinct,
    # accurate keys instead of being truncated to the same integer.
    return f"p{percent:g}", fraction


def calculate_percentiles(data: List[RunData], percentile_levels: List[float]) -> Dict[str, Dict[str, float]]:
    """Compute the requested percentiles of ``warn_rate`` and ``unknown_rate``.

    Args:
        data: Non-empty list of ``RunData`` objects.
        percentile_levels: Non-empty list of levels. Values within ``[0, 1]``
            are interpreted as quantile fractions, larger values as percent
            (e.g. ``[50, 75, 90, 95]``). Note that ``1`` therefore means the
            100th percentile.

    Returns:
        Dictionary of the form::

            {
                "warn_rate": {"p50": ..., "p75": ...},
                "unknown_rate": {"p50": ..., ...},
            }

        Keys are built from the effective percent value (``0.5`` and ``50``
        both yield ``"p50"``). Values are rounded to 6 decimal places; if a
        metric has no non-missing values, all of its entries are ``None``.

    Raises:
        ValueError: If ``data`` or ``percentile_levels`` is empty, or a level
            lies outside ``[0, 100]``.
        TypeError: If an element of ``data`` is not a ``RunData`` instance or
            a metric column holds non-numeric values.
    """
    logger = logging.getLogger(__name__)

    if not data:
        raise ValueError("Die Eingabedatenliste ist leer.")
    if not all(isinstance(rd, RunData) for rd in data):
        raise TypeError("Alle Elemente in 'data' müssen Instanzen von RunData sein.")
    if not percentile_levels:
        raise ValueError("Keine Perzentil-Level angegeben.")

    metrics = ("warn_rate", "unknown_rate")
    # Only the metric columns are needed for the computation.
    frame = pd.DataFrame([{m: getattr(r, m) for m in metrics} for r in data])

    # The dataclass does not enforce field types, so check what pandas inferred.
    for col in metrics:
        if not pd.api.types.is_numeric_dtype(frame[col]):
            raise TypeError(f"Spalte {col} enthält nicht-numerische Werte.")

    # Resolve the levels once; they are identical for every metric.
    resolved = [_resolve_percentile(p) for p in percentile_levels]
    labels = [label for label, _ in resolved]
    fractions = [fraction for _, fraction in resolved]

    percentiles_summary: Dict[str, Dict[str, Any]] = {}
    for metric in metrics:
        logger.debug("Berechne Perzentile für %s.", metric)
        values = frame[metric].dropna()
        if values.empty:
            percentiles_summary[metric] = dict.fromkeys(labels)
            continue
        try:
            # One vectorised call returns the quantiles in the requested order.
            quantiles = values.quantile(fractions)
        except Exception as e:
            logger.error("Fehler bei Perzentilberechnung für %s: %s", metric, e)
            raise
        percentiles_summary[metric] = {
            label: round(float(q), 6) for label, q in zip(labels, quantiles)
        }

    return percentiles_summary
|
||||||
Loading…
Reference in a new issue