Add statistical_analysis/src/statistical_analysis/core.py
This commit is contained in:
parent
268db267bc
commit
207086773d
1 changed files with 102 additions and 0 deletions
102
statistical_analysis/src/statistical_analysis/core.py
Normal file
102
statistical_analysis/src/statistical_analysis/core.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class OutlierDetectionError(Exception):
    """Signal that input validation or outlier detection could not succeed."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OutlierAnalysis:
    """Result record produced by an outlier analysis run."""

    # Label of the column the analysis singled out.
    column_name: str
    # Score reported for that column.
    outlier_value: float
    # Drift label attached to the analysed data.
    drift_signature: str
    # Total number of timeouts reported alongside the result.
    timeout_counts: int

    def to_dict(self) -> Dict[str, Any]:
        """Return the four fields as a plain dictionary keyed by field name."""
        keys = (
            "column_name",
            "outlier_value",
            "drift_signature",
            "timeout_counts",
        )
        return {key: getattr(self, key) for key in keys}
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_input_data(log_data: List[Dict[str, Any]]) -> pd.DataFrame:
|
||||||
|
if not isinstance(log_data, list) or not all(isinstance(entry, dict) for entry in log_data):
|
||||||
|
raise OutlierDetectionError("Input log_data must be a list of dictionaries.")
|
||||||
|
if len(log_data) == 0:
|
||||||
|
raise OutlierDetectionError("Input log_data is empty.")
|
||||||
|
|
||||||
|
df = pd.DataFrame(log_data)
|
||||||
|
if df.empty:
|
||||||
|
raise OutlierDetectionError("Converted DataFrame is empty.")
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_outliers(log_data: List[Dict[str, Any]]) -> OutlierAnalysis:
    """Analyze log records and report the numeric column with the highest p99.

    The records are validated and loaded into a DataFrame, the 99th
    percentile of every numeric column is computed (missing values are
    ignored), and the column with the largest p99 is reported.

    Args:
        log_data: List of dictionaries, one per log record.

    Returns:
        An ``OutlierAnalysis`` holding the label of the selected column (as a
        string), its p99 value, the most frequent value of the
        ``drift_signature`` column (``"stable"`` if that column is absent or
        has no values) and the sum of the ``timeout_counts`` column (``0`` if
        that column is absent).

    Raises:
        OutlierDetectionError: If the input fails validation, has no numeric
            columns, or every numeric column contains only missing values.
    """
    logger.debug("Starting outlier analysis.")
    df = _validate_input_data(log_data)

    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    if not numeric_cols:
        raise OutlierDetectionError("No numeric columns found for outlier analysis.")

    # Compute p99 per column.
    # NOTE(review): every numeric column takes part in the ranking, including
    # ``timeout_counts`` when it is numeric -- confirm this is intended.
    outlier_scores = {}
    for col in numeric_cols:
        series = df[col].dropna()
        if series.empty:
            continue
        p99_value = float(np.percentile(series, 99))
        outlier_scores[col] = p99_value
        logger.debug("Computed p99 for %s: %f", col, p99_value)

    if not outlier_scores:
        raise OutlierDetectionError("No valid numeric data available for outlier computation.")

    outlier_col = max(outlier_scores, key=outlier_scores.get)
    outlier_value = outlier_scores[outlier_col]

    # Simple drift signature heuristic: the most frequent label wins.
    drift_signature = "stable"
    if "drift_signature" in df.columns:
        sig_counts = df["drift_signature"].value_counts()
        if not sig_counts.empty:
            drift_signature = sig_counts.idxmax()

    # Coerce to numbers before summing: summing an object column of strings
    # would concatenate them (or make int() fail) instead of adding counts.
    # Values that cannot be parsed become NaN and are skipped by sum().
    timeout_counts = 0
    if "timeout_counts" in df.columns:
        numeric_timeouts = pd.to_numeric(df["timeout_counts"], errors="coerce")
        timeout_counts = int(numeric_timeouts.sum())

    # Every field is coerced to its declared type here rather than checked
    # with ``assert`` afterwards: asserts are stripped under ``python -O`` and
    # a non-string column label (e.g. integer dict keys) used to surface as a
    # bare AssertionError.
    result = OutlierAnalysis(
        column_name=str(outlier_col),
        outlier_value=float(outlier_value),
        drift_signature=str(drift_signature),
        timeout_counts=timeout_counts,
    )

    logger.info(
        "Outlier analysis completed: column=%s, value=%f, drift=%s, timeouts=%d",
        result.column_name,
        result.outlier_value,
        result.drift_signature,
        result.timeout_counts,
    )
    return result
|
||||||
Loading…
Reference in a new issue