/* bootstrap_analysis/bootstrap_analysis_tool/main.c */

#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
* bootstrap_analysis_tool
* ------------------------
* Einfaches Linux-CLI-Programm zur Bootstrap-Auswertung von Outlier-Raten.
*
* Eingabeformat (ASCII, whitespace-separiert), eine Zahl pro Zeile:
* <value>
*
* Ein Wert gilt als Outlier, wenn er außerhalb [Q1 - 1.5*IQR, Q3 + 1.5*IQR] liegt.
* Auf Basis der Outlier-Indikatorvariable (0/1) wird die Outlier-Rate geschätzt.
*
* Ablauf:
* 1. Daten von stdin einlesen.
* 2. Outlier nach Median/IQR-Regel bestimmen.
* 3. B-Proben (Bootstrap-Resamples mit Zurücklegen) der Outlier-Indikatoren ziehen.
* 4. Für jede Probe die Outlier-Rate berechnen.
* 5. 95%-Konfidenzintervall der Outlier-Rate per Percentile-Methode ausgeben.
* 6. Ergebnis als JSON-Objekt auf stdout ausgeben, kompatibel zu BootstrapResult.
*
* CLI:
* ./bootstrap_tool <n_resamples> <seed>
*
* - n_resamples : Anzahl der Bootstrap-Resamples (z. B. 10000)
* - seed : Zufalls-Seed (z. B. 42), zur Reproduzierbarkeit
*
* Output (JSON object, pretty-printed over several lines):
* {
* "mean": <double>,
* "ci_lower": <double>,
* "ci_upper": <double>,
* "outliers": <int>
* }
*
* Hinweis:
* - mean : mittlere Outlier-Rate über alle Resamples
* - ci_lower : untere 2.5%-Perzentile der Outlier-Rate
* - ci_upper : obere 97.5%-Perzentile der Outlier-Rate
* - outliers : Anzahl Outlier in der Originalstichprobe
*/
/*
 * Read whitespace-separated floating-point values from stdin into a
 * heap-allocated array that grows by doubling.
 *
 * n_out: receives the number of values stored in the returned array; it is
 *        set to 0 on every path that returns NULL.
 *
 * Returns the array (the caller releases it with free()), or NULL when no
 * value could be read or an allocation failed.
 *
 * Reading stops at end of input or at the first token scanf() cannot convert.
 * In the latter case the values read so far are still returned, and a warning
 * is written to stderr so that a truncated data set does not go unnoticed.
 */
static double *read_data(size_t *n_out) {
    size_t cap = 1024;
    size_t n = 0;
    double *data = malloc(cap * sizeof *data);

    *n_out = 0;
    if (!data) {
        fprintf(stderr, "Memory allocation failed\n");
        return NULL;
    }

    for (;;) {
        double v;
        int r = scanf("%lf", &v);

        if (r == EOF) {
            break;
        }
        if (r != 1) {
            /* Matching failure: the next token is not a number. */
            fprintf(stderr,
                    "Warning: unparsable input after %zu value(s); remaining input ignored\n",
                    n);
            break;
        }

        if (n >= cap) {
            double *tmp = NULL;

            /* Only double the capacity if the byte count cannot wrap around. */
            if (cap <= (size_t)-1 / 2 / sizeof *data) {
                cap *= 2;
                tmp = realloc(data, cap * sizeof *data);
            }
            if (!tmp) {
                fprintf(stderr, "Memory reallocation failed\n");
                free(data);
                return NULL;
            }
            data = tmp;
        }
        data[n++] = v;
    }

    if (n == 0) {
        free(data);
        return NULL;
    }

    *n_out = n;
    return data;
}
/*
 * qsort() comparator for doubles, ascending order.
 *
 * NaN is neither less than nor greater than any value, so a plain </> test
 * would report it as "equal" to everything and give qsort() an inconsistent
 * ordering. NaNs are therefore placed after all numbers and compare equal to
 * each other.
 *
 * Returns a negative value, zero, or a positive value when *a sorts before,
 * together with, or after *b.
 */
static int cmp_double(const void *a, const void *b) {
    double da = *(const double *)a;
    double db = *(const double *)b;
    int a_nan = isnan(da) ? 1 : 0;
    int b_nan = isnan(db) ? 1 : 0;

    if (a_nan || b_nan) {
        return a_nan - b_nan;
    }
    return (da > db) - (da < db);
}
/* Median of the already sorted range s[0..n-1]; requires n >= 1. */
static double median_of_sorted(const double *s, size_t n) {
    if (n % 2 == 0) {
        return 0.5 * (s[n / 2 - 1] + s[n / 2]);
    }
    return s[n / 2];
}

/*
 * Compute the median and the quartiles Q1 and Q3 of x[0..n-1].
 *
 * The input array is not modified; the work is done on a sorted copy.
 * Q1 and Q3 are the medians of the lower and upper halves of that copy. For
 * odd n the middle element belongs to neither half.
 *
 * Edge cases:
 *   n == 1: both halves are empty, so Q1 and Q3 are set to the median.
 *   n == 0: there is nothing to summarise; all three outputs are set to NAN
 *           and no memory is allocated.
 *
 * Terminates the process with EXIT_FAILURE if the copy cannot be allocated.
 */
static void compute_quartiles(const double *x, size_t n, double *median, double *q1, double *q3) {
    if (n == 0) {
        *median = NAN;
        *q1 = NAN;
        *q3 = NAN;
        return;
    }

    double *tmp = malloc(n * sizeof *tmp);
    if (!tmp) {
        fprintf(stderr, "Memory allocation failed in compute_quartiles\n");
        exit(EXIT_FAILURE);
    }
    memcpy(tmp, x, n * sizeof *tmp);
    qsort(tmp, n, sizeof *tmp, cmp_double);

    *median = median_of_sorted(tmp, n);

    /*
     * Both halves hold n/2 elements. The upper half starts at n - n/2, which
     * is n/2 for even n and n/2 + 1 (skipping the middle element) for odd n.
     */
    size_t half = n / 2;
    if (half == 0) {
        *q1 = *median;
        *q3 = *median;
    } else {
        *q1 = median_of_sorted(tmp, half);
        *q3 = median_of_sorted(tmp + (n - half), half);
    }

    free(tmp);
}
/*
 * Build a 0/1 indicator array marking which elements of x[0..n-1] fall
 * outside the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
 *
 * n_outliers: if non-NULL, receives the number of flagged elements.
 * lower/upper: if non-NULL, receive the lower and upper fence.
 *
 * Returns a malloc()ed array of n ints (1 = outlier, 0 = inlier) that the
 * caller releases with free(). Terminates the process with EXIT_FAILURE if
 * the array cannot be allocated.
 */
static int *compute_outlier_flags(const double *x, size_t n, int *n_outliers, double *lower, double *upper) {
    double mid, first_q, third_q;
    compute_quartiles(x, n, &mid, &first_q, &third_q);

    const double spread = third_q - first_q;
    const double fence_lo = first_q - 1.5 * spread;
    const double fence_hi = third_q + 1.5 * spread;

    if (lower != NULL) {
        *lower = fence_lo;
    }
    if (upper != NULL) {
        *upper = fence_hi;
    }

    int *marks = (int *)malloc(n * sizeof(int));
    if (marks == NULL) {
        fprintf(stderr, "Memory allocation failed for flags\n");
        exit(EXIT_FAILURE);
    }

    int total = 0;
    for (size_t k = 0; k < n; ++k) {
        const int outside = (x[k] < fence_lo || x[k] > fence_hi) ? 1 : 0;
        marks[k] = outside;
        total += outside;
    }

    if (n_outliers != NULL) {
        *n_outliers = total;
    }
    return marks;
}
/*
 * Draw an index from rand(): the raw value is scaled into [0, 1) by dividing
 * by RAND_MAX + 1, multiplied by n, and truncated towards zero.
 */
static inline size_t rand_index(size_t n) {
    const double span = (double)RAND_MAX + 1.0;
    const double unit = (double)rand() / span;
    return (size_t)(unit * n);
}
/*
 * Percentile of an ascending-sorted array using linear interpolation between
 * the two neighbouring elements.
 *
 * x_sorted: sorted values, n elements.
 * p:        requested fraction; values <= 0 yield the first element and
 *           values >= 1 yield the last element.
 *
 * Returns NAN when n is 0.
 */
static double percentile(const double *x_sorted, size_t n, double p) {
    if (n == 0) {
        return NAN;
    }
    if (p <= 0.0) {
        return x_sorted[0];
    }
    if (p >= 1.0) {
        return x_sorted[n - 1];
    }

    const double pos = p * (n - 1);
    /* pos is non-negative here, so truncation equals rounding down. */
    const size_t below = (size_t)pos;
    const double weight = pos - (double)below;

    if (below + 1 >= n) {
        return x_sorted[n - 1];
    }

    const double left = x_sorted[below];
    const double right = x_sorted[below + 1];
    return left * (1.0 - weight) + right * weight;
}
/*
 * Parse a complete base-10 integer from a command-line argument.
 *
 * Returns 1 and stores the value in *out on success. Returns 0, leaving *out
 * untouched, if the text contains no digits, has trailing characters, or the
 * value does not fit in a long.
 */
static int parse_long_arg(const char *text, long *out) {
    char *end = NULL;

    errno = 0;
    long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE) {
        return 0;
    }
    *out = value;
    return 1;
}

/*
 * Entry point.
 *
 * Usage: <program> <n_resamples> <seed> < input_data
 *
 * Reads the sample from stdin, flags outliers, draws n_resamples bootstrap
 * resamples (with replacement) of the 0/1 outlier indicators, and prints a
 * JSON object with the mean resampled outlier rate, the 2.5% and 97.5%
 * percentiles of that rate, and the outlier count of the original sample.
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE on a usage error, an
 * invalid n_resamples, empty input, or an allocation failure.
 */
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <n_resamples> <seed> < input_data\n", argv[0]);
        return EXIT_FAILURE;
    }

    long n_resamples = 0;
    if (!parse_long_arg(argv[1], &n_resamples)) {
        fprintf(stderr, "n_resamples is not a valid integer: '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }
    if (n_resamples <= 0) {
        fprintf(stderr, "n_resamples must be positive\n");
        return EXIT_FAILURE;
    }

    /*
     * An unparsable seed keeps the historical fallback to 1, but the
     * substitution is now reported instead of happening silently.
     */
    long seed = 0;
    if (!parse_long_arg(argv[2], &seed)) {
        fprintf(stderr, "Warning: invalid seed '%s', using 1\n", argv[2]);
        seed = 1;
    }
    if (seed <= 0) {
        seed = 1;
    }
    srand((unsigned int)seed);

    size_t n = 0;
    double *data = read_data(&n);
    if (!data || n == 0) {
        fprintf(stderr, "No input data read from stdin\n");
        free(data);
        return EXIT_FAILURE;
    }

    /* The fence values are not needed here, so no outputs are requested. */
    int n_outliers = 0;
    int *flags = compute_outlier_flags(data, n, &n_outliers, NULL, NULL);

    /*
     * calloc() fails cleanly when count * size would overflow size_t, whereas
     * a hand-written multiplication could wrap to a tiny allocation.
     */
    double *boot_props = calloc((size_t)n_resamples, sizeof *boot_props);
    if (!boot_props) {
        fprintf(stderr, "Memory allocation failed for bootstrap results\n");
        free(data);
        free(flags);
        return EXIT_FAILURE;
    }

    /* Bootstrap over the 0/1 outlier indicators; accumulate the mean as we go. */
    double mean = 0.0;
    for (long b = 0; b < n_resamples; ++b) {
        size_t hits = 0;
        for (size_t i = 0; i < n; ++i) {
            hits += (size_t)flags[rand_index(n)];
        }
        boot_props[b] = (double)hits / (double)n;
        mean += boot_props[b];
    }
    mean /= (double)n_resamples;

    /* The percentile helper expects ascending order. */
    qsort(boot_props, (size_t)n_resamples, sizeof *boot_props, cmp_double);
    double ci_lower = percentile(boot_props, (size_t)n_resamples, 0.025);
    double ci_upper = percentile(boot_props, (size_t)n_resamples, 0.975);

    /* JSON output */
    printf("{\n");
    printf(" \"mean\": %.10f,\n", mean);
    printf(" \"ci_lower\": %.10f,\n", ci_lower);
    printf(" \"ci_upper\": %.10f,\n", ci_upper);
    printf(" \"outliers\": %d\n", n_outliers);
    printf("}\n");

    free(data);
    free(flags);
    free(boot_props);
    return EXIT_SUCCESS;
}