Add audit_data_processing/src/audit_data_processing/core.py
This commit is contained in:
commit
cbf9a22a8d
1 changed files with 91 additions and 0 deletions
91
audit_data_processing/src/audit_data_processing/core.py
Normal file
91
audit_data_processing/src/audit_data_processing/core.py
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
from __future__ import annotations
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Any
|
||||
import pandas as pd
|
||||
|
||||
__all__ = ["RunData", "calculate_percentiles"]
|
||||
|
||||
|
||||
@dataclass
class RunData:
    """Data model for a single audit run.

    Attributes:
        run_id: Identifier of the run, always stored as a string.
        warn_rate: Warn rate of the run.
        unknown_rate: Unknown rate of the run.
        pinned: Flag marking the run as pinned.
    """

    run_id: str
    warn_rate: float
    unknown_rate: float
    pinned: bool

    @staticmethod
    def _parse_pinned(value: Any) -> bool:
        """Convert a raw ``pinned`` value to ``bool``.

        Textual flags are interpreted by content, because ``bool("false")``
        is ``True`` for any non-empty string. Non-string values keep plain
        truthiness semantics.

        Raises:
            ValueError: If *value* is a string that is not a known flag.
        """
        if isinstance(value, str):
            token = value.strip().lower()
            if token in ("true", "1", "yes", "y", "t", "on"):
                return True
            if token in ("false", "0", "no", "n", "f", "off", ""):
                return False
            raise ValueError(f"Unbekannter pinned-Wert: {value!r}")
        return bool(value)

    @classmethod
    def from_dict(cls, row: Dict[str, Any]) -> "RunData":
        """Build an instance from a mapping such as a CSV or JSON row.

        Args:
            row: Mapping with the keys ``run_id`` (required), ``warn_rate``
                and ``unknown_rate`` (default ``0.0`` when the key is
                absent) and ``pinned`` (default ``False``).

        Returns:
            A new instance of the class the method was called on.

        Raises:
            ValueError: If ``run_id`` is missing or ``None``, a rate cannot
                be converted to ``float``, or ``pinned`` is an unrecognised
                string.
        """
        try:
            run_id = row.get("run_id")
            if run_id is None:
                # str(None) would silently produce the bogus id "None".
                raise ValueError("run_id fehlt")
            return cls(
                run_id=str(run_id),
                warn_rate=float(row.get("warn_rate", 0.0)),
                unknown_rate=float(row.get("unknown_rate", 0.0)),
                pinned=cls._parse_pinned(row.get("pinned")),
            )
        except (ValueError, TypeError) as e:
            raise ValueError(f"Ungültige RunData-Zeile: {row}") from e
|
||||
|
||||
|
||||
def _resolve_percentile_level(level: float) -> tuple[str, float]:
    """Translate one requested level into its result key and quantile fraction.

    Levels in ``[0, 1]`` are taken as fractions, levels in ``(1, 100]`` as
    percentages. The key is always derived from the percentage actually
    computed, so ``0.5`` and ``50`` both yield ``"p50"`` and ``99.9`` yields
    ``"p99.9"`` instead of being truncated.

    Raises:
        ValueError: If the level lies outside ``[0, 100]`` (NaN included).
    """
    numeric = float(level)
    # The chained comparison is False for NaN, so NaN is rejected here too.
    if not 0.0 <= numeric <= 100.0:
        raise ValueError(f"Perzentil-Level außerhalb von [0, 100]: {level!r}")
    if numeric <= 1.0:
        quantile, percent = numeric, numeric * 100.0
    else:
        quantile, percent = numeric / 100.0, numeric
    # ":g" drops float noise (0.95 * 100 -> "95") and trailing zeros.
    return f"p{percent:g}", quantile


def calculate_percentiles(data: List[RunData], percentile_levels: List[float]) -> Dict[str, Dict[str, float | None]]:
    """Compute the requested percentiles of ``warn_rate`` and ``unknown_rate``.

    Args:
        data: List of RunData objects.
        percentile_levels: Percentiles to compute, either as percentages
            (e.g. ``[50, 75, 90, 95]``) or as fractions in ``[0, 1]``
            (e.g. ``[0.5, 0.95]``). Note that ``1`` is read as the fraction
            1.0, i.e. the 100th percentile.

    Returns:
        Dictionary of the form::

            {
                "warn_rate": {"p50": ..., "p75": ...},
                "unknown_rate": {"p50": ..., ...}
            }

        Values are rounded to 6 decimals. A metric whose values are all
        missing maps every key to ``None``.

    Raises:
        ValueError: If ``data`` or ``percentile_levels`` is empty, or a level
            lies outside ``[0, 100]``.
        TypeError: If an element of ``data`` is not a RunData instance or a
            metric column is not numeric.
    """
    logger = logging.getLogger(__name__)
    if not data:
        raise ValueError("Die Eingabedatenliste ist leer.")

    if not all(isinstance(rd, RunData) for rd in data):
        raise TypeError("Alle Elemente in 'data' müssen Instanzen von RunData sein.")

    if not percentile_levels:
        raise ValueError("Keine Perzentil-Level angegeben.")

    # Normalise and validate the levels once instead of once per metric.
    levels = [_resolve_percentile_level(p) for p in percentile_levels]

    metrics = ("warn_rate", "unknown_rate")
    df = pd.DataFrame([{
        "warn_rate": r.warn_rate,
        "unknown_rate": r.unknown_rate,
    } for r in data])

    # Input validation: the metric columns must have a numeric dtype.
    for col in metrics:
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise TypeError(f"Spalte {col} enthält nicht-numerische Werte.")

    percentiles_summary: Dict[str, Dict[str, float | None]] = {}

    for metric in metrics:
        logger.debug("Berechne Perzentile für %s.", metric)
        values = df[metric].dropna()
        if values.empty:
            percentiles_summary[metric] = {key: None for key, _ in levels}
            continue
        try:
            percentiles_summary[metric] = {
                key: round(float(values.quantile(quantile)), 6)
                for key, quantile in levels
            }
        except Exception as e:
            # Log with the metric name for context, then let the caller decide.
            logger.error("Fehler bei Perzentilberechnung für %s: %s", metric, e)
            raise

    return percentiles_summary
|
||||
Loading…
Reference in a new issue