263 lines
7.3 KiB
C
263 lines
7.3 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
#include <math.h>
|
|
|
|
/*
 * bootstrap_analysis_tool
 * ------------------------
 * Simple Linux CLI program for the bootstrap analysis of outlier rates.
 *
 * Input format (ASCII, whitespace-separated), one number per line:
 *   <value>
 *
 * A value counts as an outlier if it lies outside
 * [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
 * The outlier rate is estimated from the outlier indicator variable (0/1).
 *
 * Procedure:
 *   1. Read the data from stdin.
 *   2. Determine the outliers with the quartile/IQR rule.
 *   3. Draw B samples (bootstrap resamples with replacement) of the outlier
 *      indicators.
 *   4. Compute the outlier rate of every sample.
 *   5. Report the 95% confidence interval of the outlier rate using the
 *      percentile method.
 *   6. Print the result as a JSON object on stdout, compatible with
 *      BootstrapResult.
 *
 * CLI:
 *   ./bootstrap_tool <n_resamples> <seed>
 *
 *   - n_resamples : number of bootstrap resamples (e.g. 10000)
 *   - seed        : random seed (e.g. 42), for reproducibility
 *
 * Output (one JSON object, printed over several lines):
 *   {
 *     "mean": <double>,
 *     "ci_lower": <double>,
 *     "ci_upper": <double>,
 *     "outliers": <int>
 *   }
 *
 * Notes:
 *   - mean     : mean outlier rate over all resamples
 *   - ci_lower : lower 2.5% percentile of the outlier rate
 *   - ci_upper : upper 97.5% percentile of the outlier rate
 *   - outliers : number of outliers in the original sample
 */
|
|
|
|
/*
 * Read whitespace-separated floating-point values from stdin into a growable
 * heap buffer.
 *
 * n_out  receives the number of values read; it is 0 whenever NULL is
 *        returned.
 *
 * Returns the array of values (owned by the caller, release with free()), or
 * NULL if no value could be read or an allocation failed.
 *
 * Reading stops at end of input or at the first token that is not a number.
 * In the second case the values read so far are kept, and a warning is
 * written to stderr so that a truncated data set does not go unnoticed.
 */
static double *read_data(size_t *n_out) {
    size_t cap = 1024;
    size_t n = 0;

    /* Give the caller a defined count on every failure path. */
    *n_out = 0;

    double *data = malloc(cap * sizeof *data);
    if (!data) {
        fprintf(stderr, "Memory allocation failed\n");
        return NULL;
    }

    for (;;) {
        double v;
        int r = scanf("%lf", &v);
        if (r == EOF) {
            break;
        }
        if (r != 1) {
            fprintf(stderr,
                    "Warning: non-numeric input after %zu value(s); remaining input ignored\n",
                    n);
            break;
        }

        if (n == cap) {
            /* Refuse to grow once doubling would overflow the byte count. */
            if (cap > (size_t)-1 / (2 * sizeof *data)) {
                fprintf(stderr, "Memory reallocation failed\n");
                free(data);
                return NULL;
            }
            cap *= 2;

            /* Keep the old pointer until realloc has succeeded. */
            double *tmp = realloc(data, cap * sizeof *data);
            if (!tmp) {
                fprintf(stderr, "Memory reallocation failed\n");
                free(data);
                return NULL;
            }
            data = tmp;
        }
        data[n++] = v;
    }

    if (n == 0) {
        free(data);
        return NULL;
    }

    *n_out = n;
    return data;
}
|
|
|
|
/*
 * qsort() comparison function for double values, ascending order.
 *
 * NaN values compare equal to each other and greater than every number, so
 * they are collected at the end of a sorted array. Without this rule a NaN
 * would compare "equal" to every other value, which is not a consistent
 * ordering and leaves the result of qsort() unspecified. NaN can appear in
 * the data because scanf("%lf") accepts the token "nan".
 *
 * Returns a negative value, zero, or a positive value if *a sorts before,
 * together with, or after *b.
 */
static int cmp_double(const void *a, const void *b) {
    double da = *(const double *)a;
    double db = *(const double *)b;
    int a_nan = isnan(da) != 0;
    int b_nan = isnan(db) != 0;

    if (a_nan || b_nan) {
        return a_nan - b_nan;
    }
    return (da > db) - (da < db);
}
|
|
|
|
/* Berechnung von Median, Q1, Q3 per sortierter Kopie (Tukey-ähnliche Definition). */
|
|
static void compute_quartiles(const double *x, size_t n, double *median, double *q1, double *q3) {
|
|
double *tmp = (double *)malloc(n * sizeof(double));
|
|
if (!tmp) {
|
|
fprintf(stderr, "Memory allocation failed in compute_quartiles\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
memcpy(tmp, x, n * sizeof(double));
|
|
qsort(tmp, n, sizeof(double), cmp_double);
|
|
|
|
/* Median */
|
|
if (n % 2 == 0) {
|
|
*median = 0.5 * (tmp[n/2 - 1] + tmp[n/2]);
|
|
} else {
|
|
*median = tmp[n/2];
|
|
}
|
|
|
|
/* Untere und obere Hälfte für Q1/Q3 */
|
|
size_t n_low, n_high;
|
|
const double *low, *high;
|
|
|
|
if (n % 2 == 0) {
|
|
n_low = n / 2;
|
|
n_high = n / 2;
|
|
low = tmp;
|
|
high = tmp + n/2;
|
|
} else {
|
|
n_low = n / 2;
|
|
n_high = n / 2;
|
|
low = tmp;
|
|
high = tmp + n/2 + 1;
|
|
}
|
|
|
|
/* Median einer Hälfte als Quartil */
|
|
if (n_low == 0 || n_high == 0) {
|
|
*q1 = *median;
|
|
*q3 = *median;
|
|
} else {
|
|
if (n_low % 2 == 0)
|
|
*q1 = 0.5 * (low[n_low/2 - 1] + low[n_low/2]);
|
|
else
|
|
*q1 = low[n_low/2];
|
|
|
|
if (n_high % 2 == 0)
|
|
*q3 = 0.5 * (high[n_high/2 - 1] + high[n_high/2]);
|
|
else
|
|
*q3 = high[n_high/2];
|
|
}
|
|
|
|
free(tmp);
|
|
}
|
|
|
|
/*
 * Build the 0/1 outlier indicator array for x[0..n-1] using the IQR rule.
 *
 * A value is flagged (1) if it lies strictly below Q1 - 1.5*IQR or strictly
 * above Q3 + 1.5*IQR, where IQR = Q3 - Q1. NaN values are never flagged
 * because both comparisons are false for them.
 *
 * x           input values
 * n           number of input values
 * n_outliers  if non-NULL, receives the number of flagged values
 * lower       if non-NULL, receives the lower fence Q1 - 1.5*IQR
 * upper       if non-NULL, receives the upper fence Q3 + 1.5*IQR
 *
 * Returns a heap-allocated array of n flags owned by the caller (release
 * with free()). For n == 0 there is nothing to classify: NULL is returned,
 * the count is 0 and the fences are NaN.
 *
 * Terminates the program with EXIT_FAILURE if the flag array cannot be
 * allocated.
 */
static int *compute_outlier_flags(const double *x, size_t n, int *n_outliers, double *lower, double *upper) {
    if (n == 0) {
        if (lower) *lower = NAN;
        if (upper) *upper = NAN;
        if (n_outliers) *n_outliers = 0;
        return NULL;
    }

    double median, q1, q3;
    compute_quartiles(x, n, &median, &q1, &q3);

    double iqr = q3 - q1;
    double lo = q1 - 1.5 * iqr;
    double hi = q3 + 1.5 * iqr;

    if (lower) *lower = lo;
    if (upper) *upper = hi;

    int *flags = malloc(n * sizeof *flags);
    if (!flags) {
        fprintf(stderr, "Memory allocation failed for flags\n");
        exit(EXIT_FAILURE);
    }

    int cnt = 0;
    for (size_t i = 0; i < n; ++i) {
        flags[i] = (x[i] < lo || x[i] > hi) ? 1 : 0;
        cnt += flags[i];
    }

    if (n_outliers) *n_outliers = cnt;
    return flags;
}
|
|
|
|
/*
 * Pseudo-random index in the range [0, n-1].
 *
 * One rand() draw is mapped to a fraction in [0, 1) and scaled by n; the
 * conversion to size_t drops the fractional part.
 */
static inline size_t rand_index(size_t n) {
    const double span = (double)RAND_MAX + 1.0;
    const double unit = (double)rand() / span;

    return (size_t)(unit * (double)n);
}
|
|
|
|
/*
 * Percentile of an ascending-sorted array, with linear interpolation between
 * the two neighbouring elements.
 *
 * x_sorted  values in ascending order
 * n         number of values
 * p         requested percentile as a fraction; values <= 0 select the first
 *           element, values >= 1 the last one
 *
 * Returns the interpolated value, or NaN when n == 0.
 */
static double percentile(const double *x_sorted, size_t n, double p) {
    if (n == 0) {
        return NAN;
    }

    const size_t last = n - 1;
    if (p <= 0.0) {
        return x_sorted[0];
    }
    if (p >= 1.0) {
        return x_sorted[last];
    }

    /* Fractional position inside [0, n-1]; it is non-negative here, so the
     * truncating conversion yields its floor. */
    const double pos = p * last;
    const size_t below = (size_t)pos;
    const double weight = pos - (double)below;

    if (below + 1 >= n) {
        return x_sorted[last];
    }
    return x_sorted[below] * (1.0 - weight) + x_sorted[below + 1] * weight;
}
|
|
|
|
int main(int argc, char **argv) {
|
|
if (argc < 3) {
|
|
fprintf(stderr, "Usage: %s <n_resamples> <seed> < input_data\n", argv[0]);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
long n_resamples = strtol(argv[1], NULL, 10);
|
|
if (n_resamples <= 0) {
|
|
fprintf(stderr, "n_resamples must be positive\n");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
long seed = strtol(argv[2], NULL, 10);
|
|
if (seed <= 0) seed = 1;
|
|
srand((unsigned int)seed);
|
|
|
|
size_t n = 0;
|
|
double *data = read_data(&n);
|
|
if (!data || n == 0) {
|
|
fprintf(stderr, "No input data read from stdin\n");
|
|
free(data);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
int n_outliers = 0;
|
|
double lo, hi;
|
|
int *flags = compute_outlier_flags(data, n, &n_outliers, &lo, &hi);
|
|
|
|
double *boot_props = (double *)malloc((size_t)n_resamples * sizeof(double));
|
|
if (!boot_props) {
|
|
fprintf(stderr, "Memory allocation failed for bootstrap results\n");
|
|
free(data);
|
|
free(flags);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
/* Bootstrap über die 0/1-Outlier-Indikatoren */
|
|
for (long b = 0; b < n_resamples; ++b) {
|
|
int sum = 0;
|
|
for (size_t i = 0; i < n; ++i) {
|
|
size_t idx = rand_index(n);
|
|
sum += flags[idx];
|
|
}
|
|
boot_props[b] = (double)sum / (double)n;
|
|
}
|
|
|
|
/* Kennzahlen aus Bootstrap-Proportionen */
|
|
double mean = 0.0;
|
|
for (long b = 0; b < n_resamples; ++b) {
|
|
mean += boot_props[b];
|
|
}
|
|
mean /= (double)n_resamples;
|
|
|
|
qsort(boot_props, (size_t)n_resamples, sizeof(double), cmp_double);
|
|
double ci_lower = percentile(boot_props, (size_t)n_resamples, 0.025);
|
|
double ci_upper = percentile(boot_props, (size_t)n_resamples, 0.975);
|
|
|
|
/* JSON-Ausgabe passend zu BootstrapResult */
|
|
printf("{\n");
|
|
printf(" \"mean\": %.10f,\n", mean);
|
|
printf(" \"ci_lower\": %.10f,\n", ci_lower);
|
|
printf(" \"ci_upper\": %.10f,\n", ci_upper);
|
|
printf(" \"outliers\": %d\n", n_outliers);
|
|
printf("}\n");
|
|
|
|
free(data);
|
|
free(flags);
|
|
free(boot_props);
|
|
return EXIT_SUCCESS;
|
|
}
|