Add bootstrap_resampling/main.c
This commit is contained in:
commit
ebd28ec51d
1 changed files with 286 additions and 0 deletions
286
bootstrap_resampling/main.c
Normal file
286
bootstrap_resampling/main.c
Normal file
|
|
@ -0,0 +1,286 @@
|
||||||
|
#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* bootstrap_resampling CLI
|
||||||
|
*
|
||||||
|
* Zweck:
|
||||||
|
* Einfaches Linux-Kommandozeilenprogramm, das Bootstrap-Resampling
|
||||||
|
* auf eindimensionalen Leistungsdaten durchführt.
|
||||||
|
*
|
||||||
|
* - Liest Werte (double) aus einer Textdatei, ein Wert pro Zeile.
|
||||||
|
* - Führt Bootstrap-Resampling mit einer konfigurierbaren Anzahl
|
||||||
|
* an Iterationen durch (Standard: 10000).
|
||||||
|
* - Berechnet den Bootstrap-Mittelwert, 95-%-Konfidenzintervall
|
||||||
|
* sowie Outlier anhand einer einfachen z-Score-Heuristik.
|
||||||
|
* - Ausgabe erfolgt als JSON im Format "bootstrap_results".
|
||||||
|
*
|
||||||
|
* Nutzung:
|
||||||
|
* ./bootstrap_resampling <input_file> [iterations]
|
||||||
|
*
|
||||||
|
* Beispiel:
|
||||||
|
* ./bootstrap_resampling data.txt 10000
|
||||||
|
*
|
||||||
|
* Einschränkungen / Annahmen:
|
||||||
|
* - Maximal 1e6 Datenpunkte (anpassbar über MAX_POINTS).
|
||||||
|
* - Es wird ein sehr einfacher Bootstrap-Ansatz mit fester
|
||||||
|
* Konfidenz von 95 % (2.5- und 97.5-Perzentil) verwendet.
|
||||||
|
* - Outlier-Erkennung: |z| > 3 relativ zu Stichprobenmittelwert
|
||||||
|
* und -standardabweichung der Originaldaten.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Upper bound on the number of input values: main() allocates a buffer of
 * this many doubles and passes it to read_data() as the read limit. */
#define MAX_POINTS 1000000
|
||||||
|
|
||||||
|
/*
 * Result record mirroring the emitted "bootstrap_results" JSON object:
 *   {
 *     "mean":     <double>,
 *     "ci_lower": <double>,
 *     "ci_upper": <double>,
 *     "outliers": [<double>, ...]
 *   }
 */
typedef struct bootstrap_results {
    double mean;          /* mean over all bootstrap sample means */
    double ci_lower;      /* lower bound of the confidence interval */
    double ci_upper;      /* upper bound of the confidence interval */
    double *outliers;     /* heap-allocated outlier values, or NULL when none */
    size_t outlier_count; /* number of entries in outliers */
} bootstrap_results;
|
||||||
|
|
||||||
|
/*
 * Reads whitespace-separated double values from the text file `path` into
 * `buffer`, storing at most `max_points` values.
 *
 * Returns the number of values stored. Returns 0 when `path` or `buffer` is
 * NULL, when the file cannot be opened (an error is printed to stderr), or
 * when the file does not start with a numeric value.
 *
 * Reading stops at the first token that cannot be parsed as a number. In that
 * case, and when the file holds more than `max_points` values or a read error
 * occurs, a warning is printed to stderr so that results computed from a
 * partial data set are not mistaken for results over the whole file. The
 * values read up to that point are still returned.
 */
static size_t read_data(const char *path, double *buffer, size_t max_points) {
    if (!path || !buffer) {
        return 0;
    }

    FILE *f = fopen(path, "r");
    if (!f) {
        fprintf(stderr, "Error: cannot open input file '%s'\n", path);
        return 0;
    }

    size_t n = 0;
    int status = 1; /* last fscanf() result: 1 = value read, 0 = bad token, EOF = end/error */
    while (n < max_points && (status = fscanf(f, "%lf", &buffer[n])) == 1) {
        n++;
    }

    if (status == 0) {
        fprintf(stderr,
                "Warning: non-numeric input in '%s' after %zu value(s); remaining input ignored.\n",
                path, n);
    } else if (status == 1) {
        /* Loop ended because the buffer is full: probe for further data. */
        double extra;
        if (fscanf(f, "%lf", &extra) == 1) {
            fprintf(stderr,
                    "Warning: '%s' holds more than %zu values; remaining input ignored.\n",
                    path, max_points);
        }
    } else if (ferror(f)) {
        fprintf(stderr, "Warning: read error in '%s' after %zu value(s).\n", path, n);
    }

    fclose(f);
    return n;
}
|
||||||
|
|
||||||
|
/* Berechnung von Mittelwert und Standardabweichung einer Stichprobe. */
|
||||||
|
static void mean_std(const double *data, size_t n, double *mean, double *stddev) {
|
||||||
|
if (n == 0) {
|
||||||
|
*mean = 0.0;
|
||||||
|
*stddev = 0.0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
double sum = 0.0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
sum += data[i];
|
||||||
|
}
|
||||||
|
double m = sum / (double)n;
|
||||||
|
|
||||||
|
double var = 0.0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
double d = data[i] - m;
|
||||||
|
var += d * d;
|
||||||
|
}
|
||||||
|
if (n > 1) {
|
||||||
|
var /= (double)(n - 1);
|
||||||
|
} else {
|
||||||
|
var = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
*mean = m;
|
||||||
|
*stddev = var > 0.0 ? sqrt(var) : 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * qsort() comparator ordering double values ascending.
 * Yields -1, 0 or 1; operands that compare neither less nor greater
 * (equal values, or any NaN operand) yield 0.
 */
static int cmp_double(const void *a, const void *b) {
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    /* Each comparison evaluates to 0 or 1, so the difference is the sign. */
    return (lhs > rhs) - (lhs < rhs);
}
|
||||||
|
|
||||||
|
/* Draws a pseudo-random double from the half-open interval [0, 1) via rand(). */
static double urand(void) {
    /* One more than the largest rand() value, so the quotient stays below 1.0. */
    const double span = (double)RAND_MAX + 1.0;
    const double draw = (double)rand();

    return draw / span;
}
|
||||||
|
|
||||||
|
/* Führt das eigentliche Bootstrap-Resampling aus. */
|
||||||
|
static int run_bootstrap(const double *data, size_t n, long iterations, bootstrap_results *res) {
|
||||||
|
if (n == 0 || iterations <= 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
double *boot_means = (double *)malloc((size_t)iterations * sizeof(double));
|
||||||
|
if (!boot_means) {
|
||||||
|
fprintf(stderr, "Error: memory allocation failed for bootstrap means.\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Bootstrap: mit Zurücklegen aus Originaldaten ziehen und Mittelwert berechnen. */
|
||||||
|
for (long it = 0; it < iterations; ++it) {
|
||||||
|
double sum = 0.0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
size_t idx = (size_t)(urand() * (double)n);
|
||||||
|
if (idx >= n) {
|
||||||
|
idx = n - 1; /* Sicherheitskappe bei Randfällen */
|
||||||
|
}
|
||||||
|
sum += data[idx];
|
||||||
|
}
|
||||||
|
boot_means[it] = sum / (double)n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sortieren für Perzentile. */
|
||||||
|
qsort(boot_means, (size_t)iterations, sizeof(double), cmp_double);
|
||||||
|
|
||||||
|
/* Perzentil-Indices für 95-%-CI. */
|
||||||
|
double lower_p = 0.025;
|
||||||
|
double upper_p = 0.975;
|
||||||
|
|
||||||
|
long lower_idx = (long)(lower_p * (double)iterations);
|
||||||
|
long upper_idx = (long)(upper_p * (double)iterations);
|
||||||
|
|
||||||
|
if (lower_idx < 0) lower_idx = 0;
|
||||||
|
if (upper_idx >= iterations) upper_idx = iterations - 1;
|
||||||
|
|
||||||
|
/* Schätzer für den Bootstrap-Mittelwert: Mittel über alle Bootstrap-Mittelwerte. */
|
||||||
|
double mean_boot = 0.0;
|
||||||
|
for (long it = 0; it < iterations; ++it) {
|
||||||
|
mean_boot += boot_means[it];
|
||||||
|
}
|
||||||
|
mean_boot /= (double)iterations;
|
||||||
|
|
||||||
|
res->mean = mean_boot;
|
||||||
|
res->ci_lower = boot_means[lower_idx];
|
||||||
|
res->ci_upper = boot_means[upper_idx];
|
||||||
|
|
||||||
|
free(boot_means);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Einfache Outlier-Erkennung via z-Score (|z| > 3). */
|
||||||
|
static int detect_outliers(const double *data, size_t n, bootstrap_results *res) {
|
||||||
|
double mean, stddev;
|
||||||
|
mean_std(data, n, &mean, &stddev);
|
||||||
|
|
||||||
|
if (stddev == 0.0 || n == 0) {
|
||||||
|
/* Keine Ausreißer, wenn alle Werte identisch sind oder keine Daten vorliegen. */
|
||||||
|
res->outliers = NULL;
|
||||||
|
res->outlier_count = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Zuerst Anzahl bestimmen. */
|
||||||
|
size_t count = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
double z = (data[i] - mean) / stddev;
|
||||||
|
if (z > 3.0 || z < -3.0) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count == 0) {
|
||||||
|
res->outliers = NULL;
|
||||||
|
res->outlier_count = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double *outs = (double *)malloc(count * sizeof(double));
|
||||||
|
if (!outs) {
|
||||||
|
fprintf(stderr, "Error: memory allocation failed for outliers.\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t idx = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
double z = (data[i] - mean) / stddev;
|
||||||
|
if (z > 3.0 || z < -3.0) {
|
||||||
|
outs[idx++] = data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res->outliers = outs;
|
||||||
|
res->outlier_count = count;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Gibt die Ergebnisse als JSON-Objekt auf stdout aus. */
|
||||||
|
static void print_results_json(const bootstrap_results *res) {
|
||||||
|
printf("{\n");
|
||||||
|
printf(" \"mean\": %.17g,\n", res->mean);
|
||||||
|
printf(" \"ci_lower\": %.17g,\n", res->ci_lower);
|
||||||
|
printf(" \"ci_upper\": %.17g,\n", res->ci_upper);
|
||||||
|
printf(" \"outliers\": [");
|
||||||
|
for (size_t i = 0; i < res->outlier_count; ++i) {
|
||||||
|
printf("%.17g", res->outliers[i]);
|
||||||
|
if (i + 1 < res->outlier_count) {
|
||||||
|
printf(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("]\n");
|
||||||
|
printf("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: %s <input_file> [iterations]\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *input_path = argv[1];
|
||||||
|
long iterations = 10000; /* Standardwert analog zur Beschreibung. */
|
||||||
|
|
||||||
|
if (argc >= 3) {
|
||||||
|
iterations = strtol(argv[2], NULL, 10);
|
||||||
|
if (iterations <= 0) {
|
||||||
|
fprintf(stderr, "Error: iterations must be positive.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double *data = (double *)malloc(MAX_POINTS * sizeof(double));
|
||||||
|
if (!data) {
|
||||||
|
fprintf(stderr, "Error: memory allocation failed for input data.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t n = read_data(input_path, data, MAX_POINTS);
|
||||||
|
if (n == 0) {
|
||||||
|
fprintf(stderr, "Error: no data read from '%s'.\n", input_path);
|
||||||
|
free(data);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* RNG initialisieren: einfache, reproduzierbare Basis mit Zeitstempel. */
|
||||||
|
srand((unsigned int)time(NULL));
|
||||||
|
|
||||||
|
bootstrap_results res;
|
||||||
|
res.mean = 0.0;
|
||||||
|
res.ci_lower = 0.0;
|
||||||
|
res.ci_upper = 0.0;
|
||||||
|
res.outliers = NULL;
|
||||||
|
res.outlier_count = 0;
|
||||||
|
|
||||||
|
if (run_bootstrap(data, n, iterations, &res) != 0) {
|
||||||
|
free(data);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (detect_outliers(data, n, &res) != 0) {
|
||||||
|
free(data);
|
||||||
|
if (res.outliers) free(res.outliers);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ausgabe strikt als JSON im Format bootstrap_results. */
|
||||||
|
print_results_json(&res);
|
||||||
|
|
||||||
|
free(data);
|
||||||
|
if (res.outliers) free(res.outliers);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
Reference in a new issue