Add unknown_analysis/src/unknown_analysis/core.py
This commit is contained in:
commit
5dab9594dd
1 changed files with 84 additions and 0 deletions
84
unknown_analysis/src/unknown_analysis/core.py
Normal file
84
unknown_analysis/src/unknown_analysis/core.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
class InputValidationError(ValueError):
|
||||
"""Exception raised when input log_data is invalid."""
|
||||
pass
|
||||
|
||||
|
||||
def _validate_log_data(log_data: list[dict]) -> None:
|
||||
if not isinstance(log_data, list):
|
||||
raise InputValidationError("log_data must be a list of dictionaries")
|
||||
required_fields = {"artifact_key", "status", "cause", "path", "error"}
|
||||
for i, entry in enumerate(log_data):
|
||||
if not isinstance(entry, dict):
|
||||
raise InputValidationError(f"Entry {i} is not a dictionary")
|
||||
missing = required_fields - set(entry.keys())
|
||||
if missing:
|
||||
raise InputValidationError(f"Entry {i} missing required keys: {missing}")
|
||||
|
||||
|
||||
def calculate_unknown_rates(log_data: List[Dict]) -> Dict[str, float]:
    """Compute the unknown-artifact and unknown-schema rates from log data.

    Args:
        log_data: List of dictionaries with artifact information.

    Returns:
        Dictionary with the rates of the unknown categories, rounded to
        four decimal places.
    """
    _validate_log_data(log_data)
    if not log_data:
        return {"unknown_artifact_missing_rate": 0.0, "unknown_schema_rate": 0.0}

    frame = pd.DataFrame(log_data)
    # Empty input already returned above, so this guard is purely defensive.
    denominator = max(len(frame), 1)

    # Status comparison is case-insensitive: "unknown" and "UNKNOWN" both match.
    unknown_rows = frame[frame["status"].str.upper() == "UNKNOWN"]

    # Substring match on the cause, case-insensitive; NaN causes never match.
    missing_hits = unknown_rows["cause"].str.contains("missing", case=False, na=False).sum()
    schema_hits = unknown_rows["cause"].str.contains("schema", case=False, na=False).sum()

    return {
        "unknown_artifact_missing_rate": round(missing_hits / denominator, 4),
        "unknown_schema_rate": round(schema_hits / denominator, 4),
    }
|
||||
|
||||
|
||||
def get_top_pass_unknown_switches(log_data: List[Dict]) -> List[Dict]:
    """Analyze the most frequent PASS→Unknown switches.

    Args:
        log_data: List of log entries with status changes and errors.

    Returns:
        List of the top switches (at most 10), each a dict with 'cause',
        'path' and 'error'.
    """
    _validate_log_data(log_data)
    if not log_data:
        return []

    frame = pd.DataFrame(log_data)
    frame = frame.sort_values(by=["artifact_key"])  # Ensure grouping order is stable

    upper_status = frame["status"].str.upper()
    unknown_rows = frame[upper_status == "UNKNOWN"]
    # For simplicity, treat any UNKNOWN with a previous PASS artifact as a switch
    pass_keys = set(frame.loc[upper_status == "PASS", "artifact_key"])
    switch_rows = unknown_rows[unknown_rows["artifact_key"].isin(pass_keys)]

    if switch_rows.empty:
        return []

    # Count identical (cause, path, error) triples and rank by frequency.
    counts = switch_rows.groupby(["cause", "path", "error"]).size()
    ranked = counts.reset_index(name="count").sort_values(by="count", ascending=False)

    return ranked.head(10)[["cause", "path", "error"]].to_dict(orient="records")
|
||||
Loading…
Reference in a new issue