import pytest
import math
import numpy as np

from metrics_analysis_script import main


@pytest.fixture
def sample_data():
    return [
        {
            'metric_name': 'mix_window_p95',
            'values': [10.0, 12.0, 11.0, 9.0, 13.0, 10.5],
            'pinned_flag': True
        },
        {
            'metric_name': 'mix_window_p95',
            'values': [20.0, 21.0, 19.0, 22.0, 23.0],
            'pinned_flag': False
        }
    ]


def test_analyze_metrics_structure(sample_data):
    results = main.analyze_metrics(sample_data)
    assert isinstance(results, list)
    assert all(isinstance(r, dict) for r in results)
    required_fields = {'metric_name', 'p50', 'p95', 'max', 'retry_free_rate', 'bootstrap_ci'}
    for r in results:
        assert required_fields.issubset(r.keys())
        assert isinstance(r['bootstrap_ci'], list)
        assert len(r['bootstrap_ci']) == 2


def test_statistical_consistency(sample_data):
    results = main.analyze_metrics(sample_data)
    for r in results:
        p50 = r['p50']
        p95 = r['p95']
        maximum = r['max']
        # Percentiles must be ordered: p50 <= p95 <= max
        assert p50 <= p95 <= maximum
        # Bootstrap CI bounds must bracket the median
        low, high = r['bootstrap_ci']
        assert low <= p50 <= high


def test_invalid_input_rejected():
    invalid_data = [
        {'metric_name': 'bad_metric', 'values': 'not_a_list', 'pinned_flag': True}
    ]
    with pytest.raises((AssertionError, ValueError, TypeError)):
        main.analyze_metrics(invalid_data)


def test_bootstrap_variability(sample_data):
    results1 = main.analyze_metrics(sample_data)
    results2 = main.analyze_metrics(sample_data)
    # The same input should yield statistically consistent, though not bit-identical,
    # bootstrap CI boundaries, so we check for approximate stability.
    for r1, r2 in zip(results1, results2):
        assert math.isclose(r1['p50'], r2['p50'], rel_tol=0.05)
        ci_diff = np.abs(np.array(r1['bootstrap_ci']) - np.array(r2['bootstrap_ci']))
        assert ci_diff.mean() < max(0.1 * np.mean(r1['bootstrap_ci']), 1.0)
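

# ---------------------------------------------------------------------------
# Reference sketch (not imported or called by the tests above): the suite
# assumes that main.analyze_metrics() returns, for each input metric, a dict
# with 'metric_name', 'p50', 'p95', 'max', 'retry_free_rate', and a
# two-element 'bootstrap_ci'. The function below is a hypothetical
# implementation that would satisfy this contract; the percentile and
# bootstrap choices here are assumptions, not the project's actual logic.
# ---------------------------------------------------------------------------
def _analyze_metrics_sketch(metrics, n_boot=1000, rng=None):
    rng = rng or np.random.default_rng()
    results = []
    for m in metrics:
        # Rejects non-numeric 'values' (e.g. a string) with a ValueError,
        # matching the expectation in test_invalid_input_rejected.
        values = np.asarray(m['values'], dtype=float)
        if values.ndim != 1 or values.size == 0:
            raise ValueError(f"'values' must be a non-empty 1-D sequence for {m['metric_name']}")
        # Bootstrap the median: resample with replacement, take 2.5/97.5 percentiles.
        boot_medians = [
            float(np.median(rng.choice(values, size=values.size, replace=True)))
            for _ in range(n_boot)
        ]
        results.append({
            'metric_name': m['metric_name'],
            'p50': float(np.percentile(values, 50)),
            'p95': float(np.percentile(values, 95)),
            'max': float(values.max()),
            # Assumed definition: share of samples at or below the p95 threshold.
            'retry_free_rate': float(np.mean(values <= np.percentile(values, 95))),
            'bootstrap_ci': [float(np.percentile(boot_medians, 2.5)),
                             float(np.percentile(boot_medians, 97.5))],
        })
    return results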