Add rerun_evaluator/src/rerun_evaluator/core.py

This commit is contained in:
Mika 2026-01-31 13:07:41 +00:00
parent f7af088970
commit fa99a0b2bf


@@ -0,0 +1,102 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import List, Dict, Any
import statistics
import logging

# Configure basic logging for CI-ready environments
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class InvalidRunDataError(ValueError):
    """Raised when a RunData instance is invalid or has missing/incorrect fields."""

    pass
@dataclass
class RunData:
    """Represents a single CI run entry with metadata for rerun evaluation."""

    run_id: str
    label_triggered: bool
    flappy: bool
    pinned: bool
    unknown_rate: float

    def __post_init__(self) -> None:
        if not isinstance(self.run_id, str) or not self.run_id:
            raise InvalidRunDataError("run_id must be a non-empty string")
        if not isinstance(self.label_triggered, bool):
            raise InvalidRunDataError("label_triggered must be a bool")
        if not isinstance(self.flappy, bool):
            raise InvalidRunDataError("flappy must be a bool")
        if not isinstance(self.pinned, bool):
            raise InvalidRunDataError("pinned must be a bool")
        # bool is a subclass of int, so reject it explicitly before the numeric range check
        if (
            isinstance(self.unknown_rate, bool)
            or not isinstance(self.unknown_rate, (float, int))
            or not (0.0 <= self.unknown_rate <= 1.0)
        ):
            raise InvalidRunDataError("unknown_rate must be a float between 0.0 and 1.0")
def evaluate_rerun_needs(runs_data: List[RunData]) -> Dict[str, Any]:
    """Evaluate whether reruns are needed based on CI run historical data.

    Args:
        runs_data: List of RunData entries.

    Returns:
        dict: Evaluation metrics for rerun necessity and stability.
    """
    assert isinstance(runs_data, list), "runs_data must be a list of RunData instances"

    if not runs_data:
        return {
            "total_runs": 0,
            "rerun_recommended": False,
            "metrics": {
                "label_trigger_rate": 0.0,
                "flappy_rate": 0.0,
                "unknown_rate_avg": 0.0,
                "pinned_rate": 0.0,
            },
        }

    # Validate all elements
    for item in runs_data:
        if not isinstance(item, RunData):
            raise InvalidRunDataError("All elements in runs_data must be RunData instances")

    # Compute basic statistics
    label_triggered_values = [r.label_triggered for r in runs_data]
    flappy_values = [r.flappy for r in runs_data]
    pinned_values = [r.pinned for r in runs_data]
    unknown_rates = [r.unknown_rate for r in runs_data]

    metrics = {
        "label_trigger_rate": sum(label_triggered_values) / len(runs_data),
        "flappy_rate": sum(flappy_values) / len(runs_data),
        "unknown_rate_avg": statistics.fmean(unknown_rates) if unknown_rates else 0.0,
        "pinned_rate": sum(pinned_values) / len(runs_data),
    }

    # Simple heuristic for rerun recommendation
    rerun_recommended = (
        metrics["flappy_rate"] > 0.2 or
        metrics["unknown_rate_avg"] > 0.15 or
        metrics["label_trigger_rate"] < 0.3
    )

    result = {
        "total_runs": len(runs_data),
        "rerun_recommended": bool(rerun_recommended),
        "metrics": metrics,
    }

    logger.info(
        "Rerun evaluation completed: total_runs=%d, rerun_recommended=%s",
        result["total_runs"],
        result["rerun_recommended"],
    )
    return result
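
For reference, a minimal usage sketch of the new module. It is not part of this commit; it assumes the src layout makes the module importable as rerun_evaluator.core, and the run values are made up for illustration.

    from rerun_evaluator.core import RunData, evaluate_rerun_needs

    # Two illustrative CI runs (hypothetical data)
    runs = [
        RunData(run_id="ci-001", label_triggered=True, flappy=False, pinned=True, unknown_rate=0.05),
        RunData(run_id="ci-002", label_triggered=False, flappy=True, pinned=False, unknown_rate=0.20),
    ]

    result = evaluate_rerun_needs(runs)
    print(result["rerun_recommended"])            # True: flappy_rate is 0.5, above the 0.2 threshold
    print(result["metrics"]["unknown_rate_avg"])  # 0.125

With these inputs the flappy_rate trigger alone is enough to recommend a rerun; label_trigger_rate (0.5) and unknown_rate_avg (0.125) stay below their thresholds.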