Add audit_data_processing/src/audit_data_processing/cli.py

2026-02-05 13:42:01 +00:00 · 2026-02-05 13:42:01 +00:00 · 9c87dd9de4
commit 9c87dd9de4
parent cbf9a22a8d
1 changed files with 77 additions and 0 deletions
--- a/audit_data_processing/src/audit_data_processing/cli.py
+++ b/audit_data_processing/src/audit_data_processing/cli.py
@@ -0,0 +1,77 @@
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+
+
+def _validate_input_file(path: Path) -> None:
+    """Ensure that *path* refers to an existing regular file.
+
+    Args:
+        path: Location of the input file to check.
+
+    Raises:
+        FileNotFoundError: If nothing exists at *path*.
+        ValueError: If *path* exists but is not a file (e.g. a directory).
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {path}")
+    if path.is_file():
+        return
+    raise ValueError(f"Pfad ist keine Datei: {path}")
+
+
+def _validate_percentiles(levels: List[float]) -> None:
+    """Check that at least one percentile is given and all lie strictly in (0, 100).
+
+    Args:
+        levels: Percentile levels expressed on a 0-100 scale.
+
+    Raises:
+        ValueError: If *levels* is empty, or if any level is not strictly
+            between 0 and 100. The first offending value is reported.
+    """
+    if not levels:
+        raise ValueError("Es muss mindestens ein Perzentilwert angegeben werden.")
+
+    # Lazily pick the first out-of-range entry. The negated chained comparison
+    # is kept on purpose so that NaN (which fails every comparison) is rejected.
+    nothing_wrong = object()
+    offender = next(
+        (value for value in levels if not (0 < value < 100)),
+        nothing_wrong,
+    )
+    if offender is not nothing_wrong:
+        raise ValueError(f"Ungültiger Perzentilwert: {offender}")
+
+
+def _load_data(input_path: Path) -> pd.DataFrame:
+    """Read the audit CSV and normalise the column types used downstream.
+
+    The file must contain the columns ``run_id``, ``warn_rate``,
+    ``unknown_rate`` and ``pinned``. The two rate columns are converted to
+    numeric values and ``pinned`` is converted to a real boolean column.
+
+    Args:
+        input_path: Path of the CSV file to read.
+
+    Returns:
+        The loaded data frame with numeric rate columns and a boolean
+        ``pinned`` column.
+
+    Raises:
+        ValueError: If a required column is missing, a rate value is not
+            numeric, or ``pinned`` contains missing or unrecognised values.
+    """
+    df = pd.read_csv(input_path)
+    expected_cols = {"run_id", "warn_rate", "unknown_rate", "pinned"}
+    missing = expected_cols - set(df.columns)
+    if missing:
+        raise ValueError(f"Fehlende Spalten in Eingabedatei: {missing}")
+
+    # Type checking and conversion
+    df["warn_rate"] = pd.to_numeric(df["warn_rate"], errors="raise")
+    df["unknown_rate"] = pd.to_numeric(df["unknown_rate"], errors="raise")
+
+    pinned = df["pinned"]
+    # A plain ``astype(bool)`` would turn missing values into True, so reject
+    # them explicitly instead of silently counting the row as pinned.
+    if pinned.isna().any():
+        raise ValueError("Spalte 'pinned' enthält fehlende Werte.")
+
+    if pd.api.types.is_bool_dtype(pinned) or pd.api.types.is_numeric_dtype(pinned):
+        # Already boolean, or numeric flags where non-zero means pinned.
+        df["pinned"] = pinned.astype(bool)
+    else:
+        # Textual flags: ``astype(bool)`` would map every non-empty string
+        # (including "no" or "false") to True, so parse the tokens explicitly.
+        flag_by_token = {
+            "true": True, "t": True, "yes": True, "y": True, "1": True,
+            "false": False, "f": False, "no": False, "n": False, "0": False,
+        }
+        tokens = pinned.astype(str).str.strip().str.lower()
+        flags = tokens.map(flag_by_token)
+        unknown = flags.isna()
+        if unknown.any():
+            bad_values = sorted(set(tokens[unknown]))
+            raise ValueError(f"Ungültige Werte in Spalte 'pinned': {bad_values}")
+        df["pinned"] = flags.astype(bool)
+    return df
+
+
+def _calculate_percentiles(df: pd.DataFrame, percentiles: List[float]) -> dict:
+    """Compute the requested percentiles of ``warn_rate`` and ``unknown_rate``.
+
+    Args:
+        df: Data frame containing numeric ``warn_rate`` and ``unknown_rate``
+            columns.
+        percentiles: Percentile levels on a 0-100 scale.
+
+    Returns:
+        A mapping ``{metric: {label: value}}``. Labels are ``"p50"`` for
+        whole-number levels and e.g. ``"p99.5"`` for fractional ones. A value
+        is ``None`` when the percentile is undefined (for example when *df*
+        has no rows), so the result serialises to valid JSON (``null``)
+        instead of the non-standard ``NaN`` token.
+    """
+    summary = {}
+    for metric in ("warn_rate", "unknown_rate"):
+        column = df[metric]
+        metric_summary = {}
+        for p in percentiles:
+            # Keep the fractional part in the label; truncating with int()
+            # would make 99.5 and 99.9 collide on the same "p99" key.
+            label = f"p{int(p)}" if float(p).is_integer() else f"p{p}"
+            value = column.quantile(p / 100.0)
+            metric_summary[label] = None if pd.isna(value) else float(value)
+        summary[metric] = metric_summary
+    return summary
+
+
+def main() -> None:
+    """Command-line entry point: read the audit CSV and write percentile JSON.
+
+    Parses ``--input``, ``--output`` and ``--percentiles``, validates the
+    input path and the percentile levels, loads the data, computes the
+    percentile summary separately for pinned and unpinned rows, and writes
+    the result as indented UTF-8 JSON. The output directory is created if it
+    does not exist yet. A confirmation message is printed at the end.
+    """
+    cli = argparse.ArgumentParser(description="Audit-Daten analysieren und Perzentile berechnen.")
+    cli.add_argument("--input", required=True, help="Pfad zur audit.csv-Datei.")
+    cli.add_argument("--output", required=True, help="Pfad zur Ausgabe-JSON-Datei.")
+    cli.add_argument(
+        "--percentiles",
+        nargs="*",
+        type=float,
+        default=[50, 75, 90, 95],
+        help="Liste der Perzentile.",
+    )
+    options = cli.parse_args()
+
+    levels = options.percentiles
+    source = Path(options.input)
+    target = Path(options.output)
+
+    # Fail early on bad arguments before touching the data.
+    _validate_input_file(source)
+    _validate_percentiles(levels)
+
+    frame = _load_data(source)
+    is_pinned = frame["pinned"]
+    pinned_summary = _calculate_percentiles(frame[is_pinned], levels)
+    unpinned_summary = _calculate_percentiles(frame[~is_pinned], levels)
+    result = {"pinned": pinned_summary, "unpinned": unpinned_summary}
+
+    target.parent.mkdir(parents=True, exist_ok=True)
+    with target.open("w", encoding="utf-8") as handle:
+        json.dump(result, handle, indent=2, ensure_ascii=False)
+
+    print(f"Perzentile erfolgreich berechnet und gespeichert unter: {target}")
+
+
+# Run the CLI only when this module is executed as a script, not on import.
+if __name__ == "__main__":
+    main()