# dataset_exporter/src/dataset_exporter/core.py
#
# Export an in-memory dataset (a list of dictionaries) to a JSONL or CSV file.

from __future__ import annotations

import json
import csv
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict
from datetime import datetime


class ExportError(Exception):
    """Custom exception for export-related errors."""


@dataclass
class ExportOptions:
    """Configuration options for dataset export.

    Attributes:
        output_format: Target file format; must be ``"jsonl"`` or ``"csv"``.
        output_path: Destination file path as a non-empty string.

    Raises:
        ValueError: If ``output_format`` is not supported, or ``output_path``
            is not a non-empty string.
    """

    output_format: str
    output_path: str

    def __post_init__(self) -> None:
        valid_formats = {"jsonl", "csv"}
        if self.output_format not in valid_formats:
            raise ValueError(f"Invalid format: {self.output_format}. Must be one of {valid_formats}.")
        if not isinstance(self.output_path, str) or not self.output_path:
            raise ValueError("output_path must be a non-empty string.")


@dataclass
class DriftRunRecord:
    """Represents a single CI analysis run record."""

    # NOTE(review): nothing in this module constructs or consumes this class,
    # so the field meanings below are only what the names/types suggest.
    timestamp: datetime
    decision: str
    warn_rate: float
    fail_count: int
    unknown_count: int
    pinned: bool


def _validate_dataset(dataset: List[Dict]) -> None:
    """Ensure *dataset* is a list whose items are all dictionaries.

    Raises:
        ValueError: If *dataset* is not a list, or any item is not a dict.
    """
    if not isinstance(dataset, list):
        raise ValueError("dataset must be a list of dictionaries.")
    for entry in dataset:
        if not isinstance(entry, dict):
            raise ValueError("Each item in dataset must be a dictionary.")


def export_dataset(dataset: List[Dict], output_format: str, output_path: str) -> None:
    """Exports an in-memory dataset into a structured file (JSONL or CSV).

    Missing parent directories of *output_path* are created. An existing file
    at *output_path* is overwritten.

    Args:
        dataset: List of dictionaries, one per record.
        output_format: ``"jsonl"`` or ``"csv"``.
        output_path: Destination file path (non-empty string).

    Raises:
        ValueError: If *dataset*, *output_format* or *output_path* is invalid.
        ExportError: If the output file cannot be written.
        OSError: If the parent directory cannot be created.
    """
    _validate_dataset(dataset)

    options = ExportOptions(output_format=output_format, output_path=output_path)
    output_file = Path(options.output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if options.output_format == "jsonl":
        _export_to_jsonl(dataset, output_file)
    elif options.output_format == "csv":
        _export_to_csv(dataset, output_file)
    else:
        # Defensive only: ExportOptions has already rejected unknown formats,
        # so this is reached only if the two lists drift apart.
        raise ExportError(f"Unsupported output format: {options.output_format}")


def _export_to_jsonl(dataset: List[Dict], output_file: Path) -> None:
    """Write one JSON object per line to *output_file* (UTF-8).

    Values the ``json`` module cannot encode natively are converted with
    ``str`` (``default=str``).

    Raises:
        ExportError: If the file cannot be opened or written.
    """
    try:
        with output_file.open('w', encoding='utf-8') as f:
            for record in dataset:
                json.dump(record, f, default=str)
                f.write('\n')
    except OSError as e:
        raise ExportError(f"Failed to write JSONL file: {e}") from e


def _export_to_csv(dataset: List[Dict], output_file: Path) -> None:
    """Write *dataset* to *output_file* as CSV with a header row (UTF-8).

    The header is the union of keys across all records, in first-seen order,
    so records with differing keys are written without error; a key missing
    from a record yields an empty cell. An empty dataset produces an empty
    file with no header.

    Raises:
        ExportError: If the file cannot be opened or written.
    """
    # dict.fromkeys de-duplicates while preserving first-seen key order.
    fieldnames = list(dict.fromkeys(key for record in dataset for key in record))

    try:
        with output_file.open('w', newline='', encoding='utf-8') as f:
            if dataset:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(dataset)
    except OSError as e:
        raise ExportError(f"Failed to write CSV file: {e}") from e


__all__ = ["DriftRunRecord", "ExportError", "ExportOptions", "export_dataset"]