Source code for hyrax.datasets.result_factories

"""Factory functions for creating result dataset writers and readers.

These factories handle the selection between Lance and .npy formats.
"""

import logging
from pathlib import Path
from typing import Union

# Module-level logger, named after this module so log output can be filtered per module.
logger = logging.getLogger(__name__)

# Name of the subdirectory, inside a results directory, whose presence is used
# by load_results_dataset to recognise Lance-format results.
LANCE_DB_DIR = "lance_db"
def create_results_writer(result_dir: Union[str, Path]):
    """Build the writer used to persist results in Lance format.

    All new result writes go through Lance, so this factory always hands
    back a ``ResultDatasetWriter`` bound to ``result_dir``.

    Parameters
    ----------
    result_dir : Union[str, Path]
        Directory where results should be saved

    Returns
    -------
    ResultDatasetWriter
        Writer instance for Lance storage
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid an import cycle or a heavy import - confirm.
    from hyrax.datasets.result_dataset import ResultDatasetWriter

    writer = ResultDatasetWriter(result_dir)
    return writer
def load_results_dataset(
    config: dict, results_dir: Union[Path, str, None] = None, verb: Union[str, None] = None
):
    """Load a results dataset, auto-detecting format.

    The results directory is first resolved via ``resolve_results_dir``. If the
    resolved directory contains a ``LANCE_DB_DIR`` entry the results are treated
    as Lance format; otherwise the function falls back to the .npy reader.

    Parameters
    ----------
    config : dict
        The hyrax config dictionary
    results_dir : Union[Path, str, None], optional
        The results subdirectory to load from
    verb : Union[str, None], optional
        The name of the verb that generated the results (for auto-discovery)

    Returns
    -------
    Union[ResultDataset, InferenceDataset]
        ``ResultDataset`` when a Lance database directory is present in the
        resolved results directory, ``InferenceDataset`` otherwise.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid import cycles between dataset modules - confirm.
    from hyrax.config_utils import resolve_results_dir
    from hyrax.datasets.inference_dataset import InferenceDataset
    from hyrax.datasets.result_dataset import ResultDataset

    # Resolve results directory
    resolved_dir = resolve_results_dir(config, results_dir, verb)

    # Lance format is recognised solely by the presence of its database directory.
    lance_dir = resolved_dir / LANCE_DB_DIR
    if lance_dir.exists():
        # Log the resolved path, not the raw ``results_dir`` argument, which is
        # None whenever the directory was auto-discovered.
        logger.debug("Detected Lance format in %s", resolved_dir)
        return ResultDataset(config, resolved_dir)

    # No Lance database found: fall back to the .npy reader.
    logger.debug("Detected .npy format in %s", resolved_dir)
    return InferenceDataset(config, resolved_dir, verb)