from __future__ import annotations

import logging
import math
from dataclasses import dataclass
from typing import List

import pandas as pd


logger = logging.getLogger(__name__)


@dataclass
class SensorData:
    """One sensor reading: a timestamp plus three numeric measurements."""

    # Stored as given; this module never parses or validates the timestamp.
    timestamp: str
    # Units are implied by the field names only; no range checks are applied.
    voltage_mv: float
    temperature_c: float
    humidity_percent: float

    @staticmethod
    def validate(data: List[SensorData]) -> None:
        """Check that *data* is a list of SensorData with numeric measurements.

        Args:
            data: The object to validate.

        Raises:
            TypeError: If *data* is not a list, if an item is not a
                SensorData instance, or if one of its measurement fields is
                not an int or float (bool is rejected as well).
        """
        if not isinstance(data, list):
            raise TypeError("data must be a list of SensorData instances")

        numeric_fields = ("voltage_mv", "temperature_c", "humidity_percent")
        for i, d in enumerate(data):
            if not isinstance(d, SensorData):
                raise TypeError(f"Item {i} is not a SensorData instance")
            for name in numeric_fields:
                value = getattr(d, name)
                # bool is a subclass of int, so a plain isinstance check
                # would let True/False through as measurements.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    raise TypeError(f"Invalid {name} type in item {i}")


@dataclass
class SummaryReport:
    """Aggregate statistics produced by analyze_data."""

    max_voltage: float
    min_voltage: float
    average_voltage: float
    # Mean of the voltage/temperature and voltage/humidity correlations,
    # or 0.0 when neither correlation is defined.
    correlation_with_weather: float


def analyze_data(data: List[SensorData]) -> SummaryReport:
    """Analyze sensor data and compute summary statistics and correlations.

    Args:
        data: Non-empty list of SensorData readings.

    Returns:
        A SummaryReport with the maximum, minimum and mean voltage, plus the
        average of the voltage/temperature and voltage/humidity correlations.
        Correlations that come out as NaN are left out of the average; if
        none remain, the reported correlation is 0.0.

    Raises:
        TypeError: If *data* fails SensorData.validate.
        ValueError: If *data* is an empty list.
    """
    SensorData.validate(data)
    if not data:
        raise ValueError("No sensor data provided for analysis.")

    logger.debug("Starting analysis of %d sensor data entries", len(data))

    # Only the numeric columns are needed; the timestamp is not used here.
    df = pd.DataFrame({
        "voltage_mv": [d.voltage_mv for d in data],
        "temperature_c": [d.temperature_c for d in data],
        "humidity_percent": [d.humidity_percent for d in data],
    })

    voltage = df["voltage_mv"]
    max_voltage = float(voltage.max())
    min_voltage = float(voltage.min())
    average_voltage = float(voltage.mean())

    # Correlation between voltage and the weather parameters (temperature
    # and humidity). A correlation needs at least two observations, so
    # min_periods=2 makes pandas return NaN directly for fewer.
    corr_temp = voltage.corr(df["temperature_c"], min_periods=2)
    corr_hum = voltage.corr(df["humidity_percent"], min_periods=2)

    # Combine into one figure as the plain average; NaN values are ignored.
    correlations = [c for c in (corr_temp, corr_hum) if not math.isnan(c)]
    correlation_with_weather = (
        float(sum(correlations) / len(correlations)) if correlations else 0.0
    )

    report = SummaryReport(
        max_voltage=max_voltage,
        min_voltage=min_voltage,
        average_voltage=average_voltage,
        correlation_with_weather=correlation_with_weather,
    )

    logger.debug(
        "Finished analysis: max=%.3f, min=%.3f, avg=%.3f, corr=%.3f",
        report.max_voltage, report.min_voltage, report.average_voltage, report.correlation_with_weather,
    )

    return report