Add outlier_analysis/src/outlier_analysis/core.py
This commit is contained in:
commit
7c883811f2
1 changed files with 106 additions and 0 deletions
106
outlier_analysis/src/outlier_analysis/core.py
Normal file
106
outlier_analysis/src/outlier_analysis/core.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
from __future__ import annotations
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
import pandas as pd
|
||||
from dataclasses import dataclass, asdict
|
||||
import statistics
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class LogRecord:
    """One validated log entry describing a single measured request of a run."""

    run_id: str                     # identifier of the run this entry belongs to
    latency_ms: float               # observed request latency in milliseconds
    stratum: str                    # sampling stratum label
    job_parallelism: int            # configured parallelism of the job
    retry_total_overhead_ms: float  # cumulative retry overhead in milliseconds

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'LogRecord':
        """Construct a LogRecord from a raw mapping, validating keys and types.

        Raises:
            ValueError: when a required key is absent, or a value cannot be
                coerced to its declared field type.
        """
        # Every declared dataclass field must be present in the input mapping.
        required_fields = set(cls.__dataclass_fields__)
        missing = required_fields - data.keys()
        if missing:
            raise ValueError(f"Missing required fields: {missing}")
        try:
            return cls(
                run_id=str(data['run_id']),
                latency_ms=float(data['latency_ms']),
                stratum=str(data['stratum']),
                job_parallelism=int(data['job_parallelism']),
                retry_total_overhead_ms=float(data['retry_total_overhead_ms']),
            )
        except (ValueError, TypeError) as exc:
            # Re-raise as ValueError with the offending payload, keeping the cause.
            raise ValueError(f"Invalid data in LogRecord: {data}") from exc
|
||||
|
||||
|
||||
@dataclass
class OutlierSummary:
    """Per-run aggregate: outlier count plus a latency percentile snapshot."""

    run_id: str                              # run this summary describes
    outlier_count: int                       # number of latencies above the p99 cut
    latency_distribution: Dict[str, float]   # p50/p95/p99/max latency values

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this summary into a plain dictionary."""
        return asdict(self)
|
||||
|
||||
|
||||
class OutlierAnalysisError(Exception):
    """Raised when outlier analysis receives invalid or malformed input."""
|
||||
|
||||
|
||||
def analyze_outliers(log_data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyze log entries for latency outliers and build per-run summaries.

    Args:
        log_data: List of log entries as dicts; each entry must contain
            every field declared on ``LogRecord``.

    Returns:
        Dict keyed by ``run_id``, each value being an
        ``OutlierSummary.to_dict()`` with the outlier count and a
        p50/p95/p99/max latency snapshot (rounded to 3 decimals).

    Raises:
        OutlierAnalysisError: if ``log_data`` is not a list, or any entry
            fails ``LogRecord`` validation.
    """
    if not isinstance(log_data, list):
        raise OutlierAnalysisError("Input log_data must be a list of dicts.")

    try:
        records = [LogRecord.from_dict(item) for item in log_data]
    except ValueError as e:
        logger.error("Data validation failed: %s", e)
        # Chain the cause (`from e`) so the original validation traceback
        # stays attached instead of being silently replaced.
        raise OutlierAnalysisError(str(e)) from e

    df = pd.DataFrame([asdict(r) for r in records])
    if df.empty:
        # No records at all -> nothing to summarize.
        return {}

    summaries: Dict[str, Any] = {}

    for run_id, group in df.groupby('run_id'):
        latencies = group['latency_ms']
        p50 = float(latencies.quantile(0.5))
        p95 = float(latencies.quantile(0.95))
        p99 = float(latencies.quantile(0.99))
        max_latency = float(latencies.max())

        # Define outliers as latency strictly above the (interpolated) p99;
        # only the extreme tail of each run counts.
        outlier_mask = latencies > p99
        outlier_count = int(outlier_mask.sum())

        latency_distribution = {
            'p50': round(p50, 3),
            'p95': round(p95, 3),
            'p99': round(p99, 3),
            'max': round(max_latency, 3),
        }

        summary = OutlierSummary(
            run_id=str(run_id),
            outlier_count=outlier_count,
            latency_distribution=latency_distribution,
        )
        summaries[str(run_id)] = summary.to_dict()

        logger.debug("Processed run_id=%s: %s", run_id, summary)

    # NOTE: the former `assert all('run_id' in s ...)` check was removed:
    # `assert` is stripped under `python -O` and must not guard data, and
    # the condition is tautological anyway — 'run_id' is a dataclass field
    # of OutlierSummary, so asdict() always emits it.
    return summaries
|
||||
Loading…
Reference in a new issue