Add data_analysis/src/data_analysis/core.py
This commit is contained in:
parent
b2c3d2719d
commit
d1e357fa3c
1 changed files with 71 additions and 0 deletions
71
data_analysis/src/data_analysis/core.py
Normal file
71
data_analysis/src/data_analysis/core.py
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): configuring the root logger at import time is a module-level
# side effect; applications importing this library inherit it. Consider moving
# basicConfig() to the application entry point — confirm no caller depends on it.
logging.basicConfig(level=logging.INFO)

# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DataValidationError(Exception):
    """Signals that the supplied input data failed structural validation."""
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_input_data(data: List[Dict[str, Any]]) -> None:
|
||||||
|
"""Validiert, ob die Eingabedaten die erforderlichen Felder enthalten."""
|
||||||
|
required_fields = {"context", "pinned_status", "delta_t", "warn"}
|
||||||
|
if not isinstance(data, list):
|
||||||
|
raise DataValidationError("Input data must be a list of dicts.")
|
||||||
|
for i, record in enumerate(data):
|
||||||
|
if not isinstance(record, dict):
|
||||||
|
raise DataValidationError(f"Each entry must be a dict (index {i}).")
|
||||||
|
if not required_fields.issubset(record.keys()):
|
||||||
|
missing = required_fields - record.keys()
|
||||||
|
raise DataValidationError(f"Missing fields {missing} in record at index {i}.")
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_data(data: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyze records, counting delta_t < 0 cases and computing aggregates.

    Only records that are ``unpinned`` AND have a negative ``delta_t`` are
    considered; counts are broken down by ``context`` and a warn rate is
    computed over that filtered subset.

    Parameters
    ----------
    data : list[dict]
        Records with fields: context, pinned_status, delta_t, warn.

    Returns
    -------
    dict
        Analysis result with keys ``near_expiry_count``, ``fresh_count``
        and ``warn_rate`` (rounded to 4 decimal places).

    Raises
    ------
    DataValidationError
        If the input is not a list of dicts with the required fields.
    """
    # Validate before touching the data: len()/DataFrame() on a non-list
    # would raise the wrong error type.
    _validate_input_data(data)
    logger.debug("Starting analyze_data with %d records", len(data))

    # An empty list is valid input with nothing to aggregate. Building a
    # DataFrame from [] yields no columns at all, so short-circuit here
    # instead of failing a column check on legitimately empty data.
    if not data:
        return {"near_expiry_count": 0, "fresh_count": 0, "warn_rate": 0.0}

    df = pd.DataFrame(data)

    # Keep only unpinned records with negative delta_t.
    unpinned_df = df[(df["pinned_status"] == "unpinned") & (df["delta_t"] < 0)]
    logger.debug("Filtered unpinned negative delta_t count: %d", len(unpinned_df))

    near_expiry_count = int((unpinned_df["context"] == "near_expiry").sum())
    fresh_count = int((unpinned_df["context"] == "fresh").sum())

    warn_total = int(unpinned_df["warn"].sum())
    # Guard against division by zero when no record survives the filter.
    warn_rate = float(warn_total / len(unpinned_df)) if len(unpinned_df) > 0 else 0.0

    result = {
        "near_expiry_count": near_expiry_count,
        "fresh_count": fresh_count,
        "warn_rate": round(warn_rate, 4),
    }
    logger.info("Analysis complete: %s", result)
    return result
|
||||||
Loading…
Reference in a new issue