# Source code for hyrax.verbs.reduce_dimensions

import gc
import logging
import warnings
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Union

import numpy as np

from .verb_registry import Verb, hyrax_verb


# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)



@hyrax_verb
class ReduceDimensions(Verb):
    """Verb to reduce the dimensionality of a dataset.

    The verb loads the results of a previous ``infer`` run, fits (or loads) a
    reducer for the selected algorithm, transforms the whole dataset in batches
    and writes the reduced representations to a new results directory.
    """

    # Use an attribute-friendly name so `hyrax.reduce_dimensions` resolves.
    cli_name = "reduce_dimensions"

    # No extra keyword arguments are supplied for this verb's parser.
    # NOTE(review): presumably consumed by the verb registry when the
    # sub-parser is created -- confirm against `verb_registry`.
    add_parser_kwargs = {}

    description = "Reduce the dimensionality of a dataset using provided or default reduction algorithm."

    @staticmethod
    def setup_parser(parser: ArgumentParser):
        """Register the command line options of the reduce-dimensions verb.

        Parameters
        ----------
        parser : ArgumentParser
            Parser that receives the optional ``--algorithm``, ``--input-dir``
            and ``--model-path`` string arguments.
        """
        parser.add_argument(
            "-a",
            "--algorithm",
            type=str,
            required=False,
            help="Dimensionality reduction algorithm to use (default: umap).",
        )
        parser.add_argument(
            "-i",
            "--input-dir",
            type=str,
            required=False,
            help="Directory containing the dataset to reduce dimensions for.",
        )
        parser.add_argument(
            "-m",
            "--model-path",
            type=str,
            required=False,
            help="Path to a previously saved reducer model.",
        )

    def run_cli(self, args: Namespace | None = None):
        """Run the verb from the command line.

        Parameters
        ----------
        args : Namespace, Optional
            Parsed command line arguments; must provide ``algorithm``,
            ``input_dir`` and ``model_path`` attributes.

        Returns
        -------
        object
            Whatever :meth:`run` returns.

        Raises
        ------
        RuntimeError
            If ``args`` is None.
        """
        logger.info("`reduce-dimensions` run from CLI.")

        if args is None:
            raise RuntimeError("Run CLI called with no arguments.")

        return self.run(algorithm=args.algorithm, input_dir=args.input_dir, model_path=args.model_path)

    def run(
        self,
        algorithm: str | None = None,
        input_dir: Union[Path, str] | None = None,
        model_path: Union[Path, str] | None = None,
    ):
        """
        Run dimensionality reduction on a dataset

        This method loads the latent space representations from an inference run and applies
        the selected dimensionality reduction algorithm.

        Algorithms that support reusable fitted models may either:

        - fit a new model using a sampled subset of the data, or
        - load an existing model if a model path is provided.

        Algorithms without a separate fitting stage do not support model loading and
        directly transform the input data.

        The full dataset is then transformed into the target lower-dimensional space,
        and the resulting embeddings are saved.

        ``FutureWarning`` warnings raised while the reduction runs are suppressed.

        Parameters
        ----------
        algorithm : str, Optional
            The dimensionality reduction algorithm to use.
            If not specified, the method will look in the config for a default algorithm.

        input_dir : str or Path, Optional
            Directory containing the dataset to reduce dimensions for.

        model_path : str or Path, Optional
            Path to a previously saved reducer model.

        Returns
        -------
        object
            The results dataset loaded back from the results directory the reduced
            representations were written to (the return value of ``load_results_dataset``).

        Raises
        ------
        ValueError
            If the inference results dataset is empty.
        """
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            return self._run(algorithm, input_dir, model_path)

    def _run(
        self, algorithm: str | None, input_dir: Union[Path, str] | None, model_path: Union[Path, str] | None
    ):
        """Implementation of :meth:`run`; see there for parameters and return value."""
        # These imports are deliberately function-local, as in the original code.
        # NOTE(review): presumably to keep verb registration cheap or to avoid
        # import cycles -- confirm before moving them to module level.
        from hyrax.config_utils import create_results_dir
        from hyrax.datasets.result_factories import create_results_writer, load_results_dataset
        from hyrax.verbs.reduction_algorithms.algorithm_registry import fetch_reducer_class

        # Get reducer class; an explicit argument wins over the configured algorithm.
        algorithm_name = algorithm or self.config["reduce"]["algorithm"]
        reducer_cls = fetch_reducer_class(algorithm_name)

        results_dir = create_results_dir(self.config, f"{algorithm_name}")
        logger.info("Saving reduction results using %s to %s", algorithm_name, results_dir)
        reduction_results = create_results_writer(results_dir)

        algo_reducer = reducer_cls(self.config, reduction_results)

        inference_results = load_results_dataset(self.config, results_dir=input_dir, verb="infer")
        total_length = len(inference_results)
        if total_length == 0:
            # Without this guard the reshape/array_split calls below fail with an
            # opaque numpy ValueError; fail early with the same exception type.
            raise ValueError("Cannot reduce dimensions: the inference results dataset is empty.")

        # Prepare data sample for either fitting a new model or validating a pre-trained model loaded.
        algorithm_config = self.config["reduce"][algorithm_name]
        config_sample_size = algorithm_config.get("fit_sample_size", None)
        # A missing or falsy `fit_sample_size` means "use every row"; otherwise cap it at the dataset size.
        sample_size = int(min(config_sample_size, total_length)) if config_sample_size else total_length
        rng = np.random.default_rng()
        # Passing an int samples from np.arange(total_length) without materializing it.
        sample_indexes = rng.choice(total_length, size=sample_size, replace=False)
        # Flatten every sampled item to one row.
        data_sample = np.asarray(inference_results[sample_indexes]).reshape((sample_size, -1))

        # Load model if path provided, otherwise fit new model.
        # The explicit argument wins over the per-algorithm config entry.
        if model_path is None:
            model_path = algorithm_config.get("model_path", None)

        if model_path:
            logger.info("Loading pre-existing reducer model from %s", model_path)
            # The flattened feature count of the sample is handed to the loader.
            algo_reducer.load_model(data_sample.shape[1], model_path)
        else:
            logger.info("No model_path specified. A new model will be fitted.")
            algo_reducer.fit(data_sample)

            if self.config["reduce"].get("save_fit_model", False):
                logger.info("Saving fitted %s reducer to result directory", algorithm_name)
                algo_reducer.save_model(results_dir)

        # Drop the sample before the full transform to keep peak memory down.
        del data_sample
        gc.collect()

        # Transform dataset
        batch_size = self.config["reduce"]["batch_size"]
        num_batches = int(np.ceil(total_length / batch_size))

        all_indexes = np.arange(0, total_length)
        all_ids = np.array(inference_results.ids())

        # Lazy generator of (ids, flattened data) pairs, one per batch.
        # np.array_split produces `num_batches` nearly equal chunks.
        batches = (
            (
                all_ids[batch_indexes],
                inference_results[batch_indexes].reshape(len(batch_indexes), -1),
            )
            for batch_indexes in np.array_split(all_indexes, num_batches)
        )
        algo_reducer.transform(batches, num_batches)

        logger.info("Finished transforming all data with %s", algorithm_name)

        return load_results_dataset(self.config, results_dir)