Add max_outlier_analysis_script/src/max_outlier_analysis_script/core.py
This commit is contained in:
parent
cf75341bdb
commit
bd3d545b1b
1 changed files with 110 additions and 0 deletions
|
|
@ -0,0 +1,110 @@
|
|||
from __future__ import annotations

import logging
import statistics
from dataclasses import asdict, dataclass, fields
from typing import List, Dict, Any

import pandas as pd
|
||||
|
||||
# Public API: the analysis entry point plus the exception it raises, so that
# callers relying on ``from ... import *`` can still catch validation errors.
__all__ = ["DataValidationError", "analyze_max_outliers"]
|
||||
|
||||
|
||||
# Module-level logger used by the analysis functions below. Its level is
# pinned to INFO at import time so start/finish messages show up in CI logs.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class DataValidationError(Exception):
    """Signal that the input handed to the analysis is structurally invalid.

    Raised by ``_validate_data`` when the input is not a list, when an
    element is not a dict, or when a record lacks a required key.
    """
|
||||
|
||||
|
||||
@dataclass
class OutlierRecord:
    """Schema of a single input record.

    Within this module the class is used as the source of the required key
    names checked by ``_validate_data``; it is not instantiated here.
    """

    # Identifier of the record (presumably a correlation id -- confirm).
    corr_id: str
    # Group label; compared against "near-expiry-unpinned" during analysis.
    stratum: str
    # NOTE(review): looks like the parallelism level of the job -- confirm.
    job_parallelism: int
    # NOTE(review): name suggests a distance to expiry in hours -- confirm.
    expires_at_dist_hours: float
    # Input to the retry-overhead variance (name suggests milliseconds).
    retry_total_overhead_ms: float
    # Value the 99th-percentile outlier count is computed over.
    latency_max: float
|
||||
|
||||
|
||||
@dataclass
class AnalysisResults:
    """Summary figures produced by ``analyze_max_outliers``."""

    # Number of records whose ``latency_max`` exceeds the 99th percentile.
    max_above_p99_count: int
    # Share of records (0-100) whose stratum is "near-expiry-unpinned".
    near_expiry_cluster_percentage: float
    # Sample variance of ``retry_total_overhead_ms`` across all records.
    retry_overhead_variance: float
|
||||
|
||||
|
||||
# Value of the "stratum" field that analyze_max_outliers counts when it
# computes near_expiry_cluster_percentage.
_DEF_NEAR_EXPIRY_KEY = "near-expiry-unpinned"
|
||||
|
||||
|
||||
def _validate_data(data: List[Dict[str, Any]]) -> None:
    """Check that ``data`` is a list of dicts carrying every required key.

    The required keys are the field names of ``OutlierRecord``. Only the
    container types and the presence of keys are verified; the values
    themselves are not type-checked.

    Args:
        data: Candidate input for the analysis.

    Raises:
        DataValidationError: If ``data`` is not a list, if an element is not
            a dict, or if a record lacks one of the required keys.
    """
    if not isinstance(data, list):
        raise DataValidationError("Input data must be a list of dictionaries.")

    # dataclasses.fields() is the public accessor and, unlike the raw
    # __dataclass_fields__ mapping, leaves out ClassVar/InitVar pseudo-fields.
    required_fields = {field.name for field in fields(OutlierRecord)}

    for i, record in enumerate(data):
        if not isinstance(record, dict):
            raise DataValidationError(f"Element {i} is not a dict.")
        missing = required_fields - record.keys()
        if missing:
            # Sort the names so the message is identical from run to run;
            # interpolating the raw set yields an arbitrary ordering.
            raise DataValidationError(
                f"Record {i} missing fields: {sorted(missing)}"
            )
|
||||
|
||||
|
||||
|
||||
def analyze_max_outliers(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyse max-outlier data and compute summary figures.

    Args:
        data: List of measurement dictionaries from CI load-test exports.
            Every record must contain all ``OutlierRecord`` field names.

    Returns:
        A dict with three keys:

        * ``max_above_p99_count``: number of records whose ``latency_max``
          is strictly greater than the 99th percentile of ``latency_max``.
        * ``near_expiry_cluster_percentage``: percentage (0-100) of records
          whose ``stratum`` equals ``"near-expiry-unpinned"``.
        * ``retry_overhead_variance``: sample variance of
          ``retry_total_overhead_ms``; ``0.0`` when it cannot be computed
          (fewer than two records).

        For an empty list all three values are zero.

    Raises:
        DataValidationError: If ``data`` is not a list of dicts containing
            the required keys.
    """
    # Validate before touching ``data``: calling len() first would turn a
    # non-list input such as None into a TypeError instead of the documented
    # DataValidationError.
    _validate_data(data)
    logger.info("Starting max outlier analysis on %d records", len(data))

    df = pd.DataFrame(data)
    if df.empty:
        return asdict(
            AnalysisResults(
                max_above_p99_count=0,
                near_expiry_cluster_percentage=0.0,
                retry_overhead_variance=0.0,
            )
        )

    # Records strictly above the 99th percentile of the max latency.
    latency = df["latency_max"]
    p99 = latency.quantile(0.99)
    max_above_p99_count = int((latency > p99).sum())

    # Share of records in the near-expiry stratum. The frame is known to be
    # non-empty here, so the division needs no zero guard.
    near_expiry_count = int((df["stratum"] == _DEF_NEAR_EXPIRY_KEY).sum())
    near_expiry_cluster_percentage = (near_expiry_count / len(df)) * 100.0

    # statistics.variance needs at least two data points; report 0.0 for a
    # single record rather than failing.
    try:
        retry_overhead_variance = statistics.variance(
            df["retry_total_overhead_ms"].tolist()
        )
    except statistics.StatisticsError:
        retry_overhead_variance = 0.0

    result = AnalysisResults(
        max_above_p99_count=max_above_p99_count,
        near_expiry_cluster_percentage=near_expiry_cluster_percentage,
        retry_overhead_variance=retry_overhead_variance,
    )

    logger.info(
        "Analysis complete: max_above_p99_count=%d, near_expiry_cluster_percentage=%.2f, retry_overhead_variance=%.3f",
        result.max_above_p99_count,
        result.near_expiry_cluster_percentage,
        result.retry_overhead_variance,
    )

    # The dataclass field names are exactly the keys of the returned dict.
    return asdict(result)
|
||||
Loading…
Reference in a new issue