/*
 * bootstrap_analysis_tool/main.c
 *
 * Provenance: added by commit d6e7ff219418ff1deebaaf8cd2ffa52efeb6a782
 * ("Add bootstrap_analysis_tool/main.c", Mika, Sat Dec 6 13:10:44 2025 +0000).
 */

/*
 * NOTE(review): the header names of the five #include directives were lost
 * in the available copy of this file; the list below is reconstructed from
 * the library facilities the code actually uses.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * bootstrap_analysis_tool
 * -----------------------
 * Simple Linux CLI program for a bootstrap evaluation of outlier rates.
 *
 * Input format (ASCII, whitespace separated), one number per line:
 *   <value>
 *
 * A value counts as an outlier when it lies outside
 * [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The outlier rate is estimated from the
 * resulting 0/1 outlier indicator variable.
 *
 * Processing steps:
 *   1. Read the data from stdin.
 *   2. Determine the outliers with the median/IQR rule.
 *   3. Draw B bootstrap resamples (with replacement) of the indicators.
 *   4. Compute the outlier rate of every resample.
 *   5. Derive the 95% confidence interval of the outlier rate with the
 *      percentile method.
 *   6. Print the result as a JSON object on stdout, compatible with
 *      BootstrapResult.
 *
 * CLI:
 *   ./bootstrap_tool <n_resamples> <seed> < input_data
 *
 *   - n_resamples : number of bootstrap resamples (e.g. 10000)
 *   - seed        : random seed (e.g. 42), for reproducibility
 *
 * Output (JSON):
 *   {
 *     "mean": <float>,
 *     "ci_lower": <float>,
 *     "ci_upper": <float>,
 *     "outliers": <int>
 *   }
 *
 * Notes:
 *   - mean     : mean outlier rate over all resamples
 *   - ci_lower : 2.5% percentile of the resampled outlier rates
 *   - ci_upper : 97.5% percentile of the resampled outlier rates
 *   - outliers : number of outliers in the original sample
 */

/*
 * Read whitespace-separated doubles from stdin into a growable heap buffer.
 *
 * n_out : must not be NULL; receives the number of values read and is set
 *         to 0 on every failure path.
 *
 * Returns a malloc'ed array the caller must free(), or NULL when no value
 * could be read or an allocation failed. Reading stops at end of input or
 * at the first token that is not a number; in the latter case a warning is
 * printed to stderr and the values read so far are returned.
 */
static double *read_data(size_t *n_out) {
    size_t cap = 1024;
    size_t n = 0;
    double *data;

    *n_out = 0;

    data = malloc(cap * sizeof *data);
    if (!data) {
        fprintf(stderr, "Memory allocation failed\n");
        return NULL;
    }

    for (;;) {
        double v;
        int r = scanf("%lf", &v);

        if (r == EOF) {
            break;
        }
        if (r != 1) {
            /* Matching failure: do not truncate the data set silently. */
            fprintf(stderr,
                    "Warning: non-numeric input after %zu value(s); remaining input ignored\n",
                    n);
            break;
        }

        if (n >= cap) {
            double *tmp = NULL;

            /* Only grow when the doubled byte count still fits in size_t. */
            if (cap <= SIZE_MAX / 2 / sizeof *data) {
                cap *= 2;
                tmp = realloc(data, cap * sizeof *data);
            }
            if (!tmp) {
                fprintf(stderr, "Memory reallocation failed\n");
                free(data);
                return NULL;
            }
            data = tmp;
        }
        data[n++] = v;
    }

    if (n == 0) {
        free(data);
        return NULL;
    }

    *n_out = n;
    return data;
}

/*
 * qsort comparison function for doubles (ascending order).
 *
 * NaN values compare equal to each other and greater than every number, so
 * the ordering stays consistent even when the input contains NaN.
 */
static int cmp_double(const void *a, const void *b) {
    double da = *(const double *)a;
    double db = *(const double *)b;
    int a_nan = (da != da);
    int b_nan = (db != db);

    if (a_nan || b_nan) {
        return a_nan - b_nan;
    }
    return (da > db) - (da < db);
}

/* Median of an ascending-sorted array; requires n > 0. */
static double sorted_median(const double *v, size_t n) {
    if (n % 2 == 0) {
        return 0.5 * (v[n / 2 - 1] + v[n / 2]);
    }
    return v[n / 2];
}

/*
 * Compute median, Q1 and Q3 of x[0..n-1] from a sorted copy (Tukey-like
 * definition): Q1 and Q3 are the medians of the lower and upper half, and
 * for odd n the middle element belongs to neither half.
 *
 * x                : input values; not modified.
 * n                : number of values.
 * median, q1, q3   : outputs; must not be NULL.
 *
 * For n == 0 all three outputs are NaN. For n == 1 both quartiles equal the
 * median. Terminates the process with EXIT_FAILURE when the temporary copy
 * cannot be allocated.
 */
static void compute_quartiles(const double *x, size_t n, double *median, double *q1, double *q3) {
    if (n == 0) {
        *median = NAN;
        *q1 = NAN;
        *q3 = NAN;
        return;
    }

    double *tmp = malloc(n * sizeof *tmp);
    if (!tmp) {
        fprintf(stderr, "Memory allocation failed in compute_quartiles\n");
        exit(EXIT_FAILURE);
    }
    memcpy(tmp, x, n * sizeof *tmp);
    qsort(tmp, n, sizeof *tmp, cmp_double);

    *median = sorted_median(tmp, n);

    /* Both halves hold n/2 elements; the upper one starts after the middle
     * element when n is odd. */
    size_t half = n / 2;
    const double *high = tmp + (n - half);

    if (half == 0) {
        *q1 = *median;
        *q3 = *median;
    } else {
        *q1 = sorted_median(tmp, half);
        *q3 = sorted_median(high, half);
    }

    free(tmp);
}
*/ +static int *compute_outlier_flags(const double *x, size_t n, int *n_outliers, double *lower, double *upper) { + double median, q1, q3; + compute_quartiles(x, n, &median, &q1, &q3); + double iqr = q3 - q1; + double lo = q1 - 1.5 * iqr; + double hi = q3 + 1.5 * iqr; + + if (lower) *lower = lo; + if (upper) *upper = hi; + + int *flags = (int *)malloc(n * sizeof(int)); + if (!flags) { + fprintf(stderr, "Memory allocation failed for flags\n"); + exit(EXIT_FAILURE); + } + + int cnt = 0; + for (size_t i = 0; i < n; ++i) { + if (x[i] < lo || x[i] > hi) { + flags[i] = 1; + cnt++; + } else { + flags[i] = 0; + } + } + if (n_outliers) *n_outliers = cnt; + return flags; +} + +/* Zufällige Ganzzahl im Bereich [0, n-1] */ +static inline size_t rand_index(size_t n) { + return (size_t)((double)rand() / ((double)RAND_MAX + 1.0) * n); +} + +/* Percentile (0..1) eines sortierten double-Arrays (lineare Interpolation). */ +static double percentile(const double *x_sorted, size_t n, double p) { + if (n == 0) return NAN; + if (p <= 0.0) return x_sorted[0]; + if (p >= 1.0) return x_sorted[n-1]; + + double idx = p * (n - 1); + size_t i = (size_t)floor(idx); + double frac = idx - (double)i; + if (i + 1 >= n) return x_sorted[n-1]; + return x_sorted[i] * (1.0 - frac) + x_sorted[i+1] * frac; +} + +int main(int argc, char **argv) { + if (argc < 3) { + fprintf(stderr, "Usage: %s < input_data\n", argv[0]); + return EXIT_FAILURE; + } + + long n_resamples = strtol(argv[1], NULL, 10); + if (n_resamples <= 0) { + fprintf(stderr, "n_resamples must be positive\n"); + return EXIT_FAILURE; + } + + long seed = strtol(argv[2], NULL, 10); + if (seed <= 0) seed = 1; + srand((unsigned int)seed); + + size_t n = 0; + double *data = read_data(&n); + if (!data || n == 0) { + fprintf(stderr, "No input data read from stdin\n"); + free(data); + return EXIT_FAILURE; + } + + int n_outliers = 0; + double lo, hi; + int *flags = compute_outlier_flags(data, n, &n_outliers, &lo, &hi); + + double *boot_props = (double 
*)malloc((size_t)n_resamples * sizeof(double)); + if (!boot_props) { + fprintf(stderr, "Memory allocation failed for bootstrap results\n"); + free(data); + free(flags); + return EXIT_FAILURE; + } + + /* Bootstrap über die 0/1-Outlier-Indikatoren */ + for (long b = 0; b < n_resamples; ++b) { + int sum = 0; + for (size_t i = 0; i < n; ++i) { + size_t idx = rand_index(n); + sum += flags[idx]; + } + boot_props[b] = (double)sum / (double)n; + } + + /* Kennzahlen aus Bootstrap-Proportionen */ + double mean = 0.0; + for (long b = 0; b < n_resamples; ++b) { + mean += boot_props[b]; + } + mean /= (double)n_resamples; + + qsort(boot_props, (size_t)n_resamples, sizeof(double), cmp_double); + double ci_lower = percentile(boot_props, (size_t)n_resamples, 0.025); + double ci_upper = percentile(boot_props, (size_t)n_resamples, 0.975); + + /* JSON-Ausgabe passend zu BootstrapResult */ + printf("{\n"); + printf(" \"mean\": %.10f,\n", mean); + printf(" \"ci_lower\": %.10f,\n", ci_lower); + printf(" \"ci_upper\": %.10f,\n", ci_upper); + printf(" \"outliers\": %d\n", n_outliers); + printf("}\n"); + + free(data); + free(flags); + free(boot_props); + return EXIT_SUCCESS; +}