Add dataset_exporter/src/dataset_exporter/core.py

2026-01-31 13:07:40 +00:00 · 2026-01-31 13:07:40 +00:00 · 9449d4a70e
commit 9449d4a70e
parent 1de3654dab
1 changed files with 93 additions and 0 deletions
--- a/dataset_exporter/src/dataset_exporter/core.py
+++ b/dataset_exporter/src/dataset_exporter/core.py
@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import json
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Dict
+from datetime import datetime
+
+
+class ExportError(Exception):
+    """Raised when a dataset cannot be exported.
+
+    Used in this module for unsupported output formats and for wrapping
+    ``OSError`` failures that occur while writing the output file.
+    """
+
+
+@dataclass
+class ExportOptions:
+    """Validated configuration for a dataset export.
+
+    Attributes:
+        output_format: Target file format; one of ``"csv"`` or ``"jsonl"``.
+        output_path: Destination file path as a non-empty string.
+
+    Raises:
+        ValueError: If ``output_format`` is not a supported format, or if
+            ``output_path`` is not a non-empty string.
+    """
+
+    output_format: str
+    output_path: str
+
+    def __post_init__(self) -> None:
+        # A sorted tuple (rather than a set) keeps the error message stable
+        # across interpreter runs and lets the membership test work on
+        # unhashable values, which are then rejected with ValueError too.
+        valid_formats = ("csv", "jsonl")
+        if self.output_format not in valid_formats:
+            rendered = "{" + ", ".join(repr(fmt) for fmt in valid_formats) + "}"
+            raise ValueError(f"Invalid format: {self.output_format}. Must be one of {rendered}.")
+        if not isinstance(self.output_path, str) or not self.output_path:
+            raise ValueError("output_path must be a non-empty string.")
+
+
+@dataclass
+class DriftRunRecord:
+    """One record describing a single CI analysis run.
+
+    This is a plain data container; the export helpers visible in this file
+    operate on dictionaries and do not reference it directly.
+    """
+
+    # NOTE(review): field semantics are not defined in this module; the
+    # comments below are inferred from the names only -- confirm with callers.
+    timestamp: datetime  # presumably when the run happened
+    decision: str  # presumably the overall verdict of the run
+    warn_rate: float  # presumably a fraction of warnings; range unverified
+    fail_count: int
+    unknown_count: int
+    pinned: bool
+
+
+def _validate_dataset(dataset: List[Dict]) -> None:
+    """Ensure ``dataset`` is a list whose items are all dictionaries.
+
+    Raises:
+        ValueError: If ``dataset`` is not a list, or if any item in it is
+            not a dictionary.
+    """
+    if not isinstance(dataset, list):
+        raise ValueError("dataset must be a list of dictionaries.")
+    if not all(isinstance(item, dict) for item in dataset):
+        raise ValueError("Each item in dataset must be a dictionary.")
+
+
+def export_dataset(dataset: List[Dict], output_format: str, output_path: str) -> None:
+    """Write an in-memory dataset to ``output_path`` as JSONL or CSV.
+
+    Args:
+        dataset: List of dictionaries to export.
+        output_format: Either ``"jsonl"`` or ``"csv"``.
+        output_path: Destination file path; missing parent directories are
+            created.
+
+    Raises:
+        ValueError: If ``dataset`` is not a list of dictionaries, or if the
+            format/path are rejected by ``ExportOptions``.
+        ExportError: If no writer exists for the format; the format-specific
+            writers may also raise it.
+    """
+    _validate_dataset(dataset)
+
+    # Options are validated before any filesystem work is attempted.
+    opts = ExportOptions(output_format=output_format, output_path=output_path)
+    target = Path(opts.output_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+
+    writers = {"jsonl": _export_to_jsonl, "csv": _export_to_csv}
+    write = writers.get(opts.output_format)
+    if write is None:
+        raise ExportError(f"Unsupported output format: {opts.output_format}")
+    write(dataset, target)
+
+
+def _export_to_jsonl(dataset: List[Dict], output_file: Path) -> None:
+    """Write ``dataset`` to ``output_file`` as one JSON document per line.
+
+    Values the ``json`` module cannot encode natively are converted with
+    ``str`` (``default=str``).
+
+    Args:
+        dataset: Records to write, one per output line.
+        output_file: Destination path; opened for writing as UTF-8 and
+            truncated if it already exists.
+
+    Raises:
+        ExportError: If the file cannot be opened or written (wraps
+            ``OSError``).
+        TypeError, ValueError: Propagated unchanged from ``json.dumps`` when
+            a record cannot be serialized (for example, unsupported key
+            types).
+    """
+    try:
+        with output_file.open('w', encoding='utf-8') as f:
+            for record in dataset:
+                # Serialize the whole record first: json.dump() streams
+                # chunks to the file, so a record failing part-way would
+                # leave a truncated, unparsable line behind.
+                line = json.dumps(record, default=str)
+                f.write(line + '\n')
+    except OSError as e:
+        raise ExportError(f"Failed to write JSONL file: {e}") from e
+
+
+def _export_to_csv(dataset: List[Dict], output_file: Path) -> None:
+    """Write ``dataset`` to ``output_file`` as CSV with a header row.
+
+    The header is the union of keys across all records, in first-seen
+    order, so records that introduce additional keys are exported instead
+    of aborting the write. Keys missing from a record are written as empty
+    cells. An empty dataset produces an empty file.
+
+    Args:
+        dataset: Records to write, one per CSV row.
+        output_file: Destination path; opened for writing as UTF-8 and
+            truncated if it already exists.
+
+    Raises:
+        ExportError: If the file cannot be opened or written (wraps
+            ``OSError``), including when the dataset is empty.
+    """
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    fieldnames = list(dict.fromkeys(key for record in dataset for key in record))
+
+    try:
+        with output_file.open('w', newline='', encoding='utf-8') as f:
+            # Only a non-empty dataset gets a header and rows; an empty one
+            # leaves the freshly truncated file empty.
+            if dataset:
+                writer = csv.DictWriter(f, fieldnames=fieldnames)
+                writer.writeheader()
+                writer.writerows(dataset)
+    except OSError as e:
+        raise ExportError(f"Failed to write CSV file: {e}") from e
+
+
+# Public API. ExportError is listed so that callers relying on the declared
+# public names can catch the exception export_dataset raises.
+__all__ = ["DriftRunRecord", "ExportError", "ExportOptions", "export_dataset"]