Add data_analysis_script/src/data_analysis_script/core.py
This commit is contained in:
parent
17e9f9abf6
commit
d65baef617
1 changed files with 81 additions and 0 deletions
81
data_analysis_script/src/data_analysis_script/core.py
Normal file
81
data_analysis_script/src/data_analysis_script/core.py
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import statistics
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger; attaching a NullHandler prevents "No handler found"
# warnings when the host application has not configured logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
|
||||||
|
|
||||||
|
|
||||||
|
class LogAnalysisError(Exception):
    """Raised for any failure during log analysis (I/O, parsing, validation)."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AnalysisResults:
    """Container for the metrics produced by a log-analysis run.

    Attributes:
        max_outlier: Largest observed value of the analyzed metric.
        band_center: Central value (mean) of the analyzed metric.
        band_width: Spread (standard deviation) of the analyzed metric.
    """

    max_outlier: float
    band_center: float
    band_width: float

    def to_json(self) -> Dict[str, Any]:
        """Return the results as a JSON-serializable dictionary."""
        # Flat dataclass: an explicit literal is equivalent to asdict(self).
        return {
            "max_outlier": self.max_outlier,
            "band_center": self.band_center,
            "band_width": self.band_width,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_log_dataframe(df: pd.DataFrame) -> None:
|
||||||
|
required_cols = {"worker_start_offset", "expires_at_dist_hours", "retry_total_overhead_ms"}
|
||||||
|
missing = required_cols - set(df.columns)
|
||||||
|
if missing:
|
||||||
|
raise LogAnalysisError(f"Missing required columns: {', '.join(missing)}")
|
||||||
|
if df.empty:
|
||||||
|
raise LogAnalysisError("Input log file is empty.")
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_logs(log_file: str) -> AnalysisResults:
    """Analyze a CSV log file and return structured latency metrics.

    Args:
        log_file: Path to a CSV file that must contain the columns
            ``worker_start_offset``, ``expires_at_dist_hours`` and
            ``retry_total_overhead_ms``.

    Returns:
        AnalysisResults with ``max_outlier`` (maximum of
        ``expires_at_dist_hours``), ``band_center`` (its mean) and
        ``band_width`` (its sample standard deviation; 0.0 when fewer
        than two rows are present).

    Raises:
        LogAnalysisError: If the file does not exist, cannot be parsed,
            fails column/emptiness validation, or produces a negative
            metric.
    """
    path = Path(log_file)
    if not path.exists():
        raise LogAnalysisError(f"Log file not found: {log_file}")

    try:
        df = pd.read_csv(path)
    except Exception as exc:
        logger.exception("Fehler beim Einlesen der CSV-Datei")
        # Chain the cause so the underlying parse error stays visible.
        raise LogAnalysisError(f"Fehler beim Einlesen der CSV-Datei: {exc}") from exc

    _validate_log_dataframe(df)

    # Metric computations on the latency column.
    # (The original also extracted "worker_start_offset" but never used it.)
    latencies = df["expires_at_dist_hours"].astype(float)

    max_outlier = float(latencies.max())
    band_center = float(latencies.mean())
    try:
        # statistics.stdev is the sample standard deviation (ddof=1).
        band_width = float(statistics.stdev(latencies))
    except statistics.StatisticsError:
        # Fewer than two data points: no spread can be estimated.
        band_width = 0.0

    result = AnalysisResults(
        max_outlier=max_outlier,
        band_center=band_center,
        band_width=band_width,
    )

    # Explicit sanity checks instead of `assert`, which is stripped
    # when Python runs with -O and would silently skip validation.
    if result.max_outlier < 0:
        raise LogAnalysisError("max_outlier muss nicht-negativ sein")
    if result.band_width < 0:
        raise LogAnalysisError("band_width muss nicht-negativ sein")

    logger.debug("Analyse abgeschlossen: %s", result)
    return result
|
||||||
Loading…
Reference in a new issue