Add audit_data_processing/src/audit_data_processing/cli.py

This commit is contained in:
Mika 2026-02-05 13:42:01 +00:00
parent cbf9a22a8d
commit 9c87dd9de4

View file

@ -0,0 +1,77 @@
import argparse
import json
import os
from pathlib import Path
from typing import List, Optional

import pandas as pd
def _validate_input_file(path: Path) -> None:
if not path.exists():
raise FileNotFoundError(f"Eingabedatei nicht gefunden: {path}")
if not path.is_file():
raise ValueError(f"Pfad ist keine Datei: {path}")
def _validate_percentiles(levels: List[float]) -> None:
if not levels:
raise ValueError("Es muss mindestens ein Perzentilwert angegeben werden.")
for p in levels:
if not (0 < p < 100):
raise ValueError(f"Ungültiger Perzentilwert: {p}")
def _load_data(input_path: Path) -> pd.DataFrame:
df = pd.read_csv(input_path)
expected_cols = {"run_id", "warn_rate", "unknown_rate", "pinned"}
missing = expected_cols - set(df.columns)
if missing:
raise ValueError(f"Fehlende Spalten in Eingabedatei: {missing}")
# Typprüfung und Konvertierung
df["warn_rate"] = pd.to_numeric(df["warn_rate"], errors="raise")
df["unknown_rate"] = pd.to_numeric(df["unknown_rate"], errors="raise")
df["pinned"] = df["pinned"].astype(bool)
return df
def _calculate_percentiles(df: pd.DataFrame, percentiles: List[float]) -> dict:
summary = {}
for metric in ["warn_rate", "unknown_rate"]:
summary[metric] = {}
for p in percentiles:
summary[metric][f"p{int(p)}"] = float(df[metric].quantile(p / 100.0))
return summary
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: compute percentiles per pinned group.

    Parses ``--input``, ``--output`` and ``--percentiles``, validates them,
    loads the CSV, computes the percentiles separately for rows where
    ``pinned`` is true and where it is false, and writes the result as JSON
    with the top-level keys ``"pinned"`` and ``"unpinned"``. Missing parent
    directories of the output file are created.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, as before; passing a list allows the CLI
            to be called programmatically.

    Raises:
        FileNotFoundError: If the input path does not exist.
        ValueError: If the input path is not a file, the percentile list is
            invalid, or the input data fails validation.
    """
    parser = argparse.ArgumentParser(
        description="Audit-Daten analysieren und Perzentile berechnen."
    )
    parser.add_argument("--input", required=True, help="Pfad zur audit.csv-Datei.")
    parser.add_argument("--output", required=True, help="Pfad zur Ausgabe-JSON-Datei.")
    parser.add_argument(
        "--percentiles",
        nargs="*",
        type=float,
        default=[50, 75, 90, 95],
        help="Liste der Perzentile.",
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input)
    output_path = Path(args.output)

    # Validate everything up front so nothing is written on bad input.
    _validate_input_file(input_path)
    _validate_percentiles(args.percentiles)

    df = _load_data(input_path)
    is_pinned = df["pinned"]
    result = {
        "pinned": _calculate_percentiles(df[is_pinned], args.percentiles),
        "unpinned": _calculate_percentiles(df[~is_pinned], args.percentiles),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Perzentile erfolgreich berechnet und gespeichert unter: {output_path}")


if __name__ == "__main__":
    main()