Add outlier_analysis/src/outlier_analysis/core.py
This commit is contained in:
parent
4b052da98a
commit
85fd92bba5
1 changed files with 76 additions and 0 deletions
76
outlier_analysis/src/outlier_analysis/core.py
Normal file
76
outlier_analysis/src/outlier_analysis/core.py
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import statistics
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class InputValidationError(ValueError):
|
||||||
|
"""Raised when input validation fails for log entries."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_log_entries(log_entries: List[Dict[str, Any]]) -> None:
|
||||||
|
if not isinstance(log_entries, list):
|
||||||
|
raise InputValidationError("log_entries must be a list of dicts.")
|
||||||
|
for entry in log_entries:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
raise InputValidationError("Each log entry must be a dict.")
|
||||||
|
required_fields = [
|
||||||
|
"corr_id",
|
||||||
|
"stratum",
|
||||||
|
"retry_total_overhead_ms",
|
||||||
|
]
|
||||||
|
for field in required_fields:
|
||||||
|
if field not in entry:
|
||||||
|
raise InputValidationError(f"Missing required field '{field}' in log entry.")
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_outliers(log_entries: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||||
|
"""Analysiert Log-Einträge und erstellt statistische Kennzahlen zu Outliern.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
log_entries (List[Dict[str, Any]]): Liste von Log-Einträgen.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: Statistik- und Clusterinformationen im JSON-kompatiblen Format.
|
||||||
|
"""
|
||||||
|
_validate_log_entries(log_entries)
|
||||||
|
|
||||||
|
df = pd.DataFrame(log_entries)
|
||||||
|
if df.empty or 'retry_total_overhead_ms' not in df.columns:
|
||||||
|
raise InputValidationError("No retry_total_overhead_ms data found.")
|
||||||
|
|
||||||
|
values = df['retry_total_overhead_ms'].dropna().astype(float).tolist()
|
||||||
|
if not values:
|
||||||
|
raise InputValidationError("No valid numerical values for retry_total_overhead_ms.")
|
||||||
|
|
||||||
|
report: Dict[str, Any] = {
|
||||||
|
"mean": float(statistics.fmean(values)),
|
||||||
|
"median": float(statistics.median(values)),
|
||||||
|
"p90": float(df['retry_total_overhead_ms'].quantile(0.90)),
|
||||||
|
"p95": float(df['retry_total_overhead_ms'].quantile(0.95)),
|
||||||
|
"p99": float(df['retry_total_overhead_ms'].quantile(0.99)),
|
||||||
|
"max": float(df['retry_total_overhead_ms'].max()),
|
||||||
|
"clusters": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
high_threshold = report["p95"]
|
||||||
|
clusters = []
|
||||||
|
high_outliers = df[df['retry_total_overhead_ms'] >= high_threshold]
|
||||||
|
if not high_outliers.empty:
|
||||||
|
grouped = high_outliers.groupby('stratum')
|
||||||
|
for name, group in grouped:
|
||||||
|
clusters.append({
|
||||||
|
"stratum": name,
|
||||||
|
"count": int(len(group)),
|
||||||
|
"mean_overhead": float(group['retry_total_overhead_ms'].mean()),
|
||||||
|
"max_overhead": float(group['retry_total_overhead_ms'].max()),
|
||||||
|
})
|
||||||
|
report["clusters"] = clusters
|
||||||
|
|
||||||
|
# CI assertions for data integrity
|
||||||
|
assert set(report.keys()) == {"mean", "median", "p90", "p95", "p99", "max", "clusters"}, "Unexpected report format"
|
||||||
|
|
||||||
|
return report
|
||||||
Loading…
Reference in a new issue