Source code for hyrax.verbs.train

import logging
from pathlib import Path

from colorama import Back, Fore, Style

from .verb_registry import Verb, hyrax_verb

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@hyrax_verb
class Train(Verb):
    """Train verb"""

    # Name of this verb (presumably the CLI sub-command name -- confirm in verb_registry).
    cli_name = "train"

    # This verb supplies no extra parser keyword arguments.
    add_parser_kwargs = {}

    @staticmethod
    def setup_parser(parser):
        """We don't need any parser setup for CLI opts"""

    def run_cli(self, args=None):
        """CLI stub for Train verb"""
        logger.info("train run from CLI.")
        self.run()

    def run(self):
        """
        Run the training process for the configured model and data loader.

        The run creates a results directory, trains the model with a
        pytorch-ignite trainer while logging to tensorboardX and MLflow,
        saves the model weights and the source of ``model.to_tensor``, and
        finally exports the model to ONNX.

        Returns
        -------
        The trained model.
        """
        import contextlib
        import inspect

        import mlflow
        from tensorboardX import SummaryWriter

        from hyrax.config_utils import create_results_dir, log_runtime_config
        from hyrax.gpu_monitor import GpuMonitor
        from hyrax.model_exporters import export_to_onnx
        from hyrax.pytorch_ignite import (
            create_trainer,
            create_validator,
            dist_data_loader,
            setup_dataset,
            setup_model,
        )

        config = self.config

        # Create a results directory
        results_dir = create_results_dir(config, "train")
        log_runtime_config(config, results_dir)

        # Create a tensorboardX logger
        tensorboardx_logger = SummaryWriter(log_dir=results_dir)

        # Cleanup callbacks run in reverse registration order when the block
        # exits, including when setup, training, or saving raises. This keeps
        # the GPU monitor from running on and the writer from staying open
        # after a failed run.
        with contextlib.ExitStack() as cleanup:
            cleanup.callback(tensorboardx_logger.close)

            # Instantiate the model and dataset
            dataset = setup_dataset(config, tensorboardx_logger)
            logger.info(
                f"{Style.BRIGHT}{Fore.BLACK}{Back.GREEN}Training dataset(s):{Style.RESET_ALL}\n{dataset}"
            )

            model = setup_model(config, dataset)
            logger.info(f"{Style.BRIGHT}{Fore.BLACK}{Back.GREEN}Training model:{Style.RESET_ALL}\n{model}")

            # Create a data loader for the training set (and validation split if configured)
            data_loaders = dist_data_loader(dataset, config, ["train", "validate"])
            train_data_loader, _ = data_loaders["train"]
            validation_data_loader, _ = data_loaders.get("validate", (None, None))

            # Create trainer, a pytorch-ignite `Engine` object
            trainer = create_trainer(model, config, results_dir, tensorboardx_logger)

            # Create a validator if a validation data loader is available
            if validation_data_loader is not None:
                create_validator(
                    model, config, results_dir, tensorboardx_logger, validation_data_loader, trainer
                )

            monitor = GpuMonitor(tensorboard_logger=tensorboardx_logger)
            # Registered after the writer, so the monitor is stopped first.
            cleanup.callback(monitor.stop)

            results_root_dir = Path(config["general"]["results_dir"]).expanduser().resolve()
            # `as_uri()` builds a well-formed, percent-encoded file URI on every
            # platform; "file://" + str(path) is malformed on Windows.
            mlflow.set_tracking_uri((results_root_dir / "mlflow").as_uri())

            # Get experiment_name and cast to string (it's a tomlkit.string by default)
            experiment_name = str(config["train"]["experiment_name"])

            # This will create the experiment if it doesn't exist
            mlflow.set_experiment(experiment_name)

            # If run_name is not `false` in the config, use it as the MLFlow run name in
            # this experiment. Otherwise use the name of the results directory
            configured_run_name = config["train"]["run_name"]
            run_name = str(configured_run_name) if configured_run_name else results_dir.name

            with mlflow.start_run(log_system_metrics=True, run_name=run_name):
                Train._log_params(config, results_dir)

                # Run the training process
                trainer.run(train_data_loader, max_epochs=config["train"]["epochs"])

            # Save the trained model
            model.save(results_dir / config["train"]["weights_filename"])

            # Keep a copy of the model's `to_tensor` source next to the weights.
            # Only the source lookup is guarded: a failure to write the file
            # should surface rather than be reported as a missing source.
            try:
                to_tensor_source = inspect.getsource(model.to_tensor)
            except (OSError, TypeError) as e:
                logger.warning(f"Could not retrieve source for model.to_tensor: {e}")
                to_tensor_source = "# Source code for model.to_tensor could not be retrieved.\n"

            # Python source is UTF-8 by default, so do not depend on the locale encoding.
            with open(results_dir / "to_tensor.py", "w", encoding="utf-8") as f:
                f.write(to_tensor_source)

        logger.info("Finished Training")

        context = {
            "ml_framework": "pytorch",
            "results_dir": results_dir,
        }

        # Get a sample of input data. If the data is labeled, only return the input data.
        batch_sample = next(iter(train_data_loader))
        if isinstance(batch_sample, dict):
            batch_sample = model.to_tensor(batch_sample)
        sample = batch_sample[0] if isinstance(batch_sample, (list, tuple)) else batch_sample
        export_to_onnx(model, sample, config, context)

        return model

    @staticmethod
    def _log_params(config, results_dir):
        """Log the various parameters to mlflow from the config file.

        Parameters
        ----------
        config : dict
            The main configuration dictionary
        results_dir : str or pathlib.Path
            The full path to the results sub-directory
        """
        import mlflow

        # Log full path to results subdirectory
        mlflow.log_param("Results Directory", results_dir)

        # Log all model params
        mlflow.log_params(config["model"])

        # Log some training and data loader params
        mlflow.log_param("epochs", config["train"]["epochs"])
        mlflow.log_param("batch_size", config["data_loader"]["batch_size"])

        # Log the criterion and optimizer params. Their detailed settings live
        # in a config table keyed by the configured name, when one exists.
        criterion_name = config["criterion"]["name"]
        mlflow.log_param("criterion", criterion_name)
        if criterion_name in config:
            mlflow.log_params(config[criterion_name])

        optimizer_name = config["optimizer"]["name"]
        mlflow.log_param("optimizer", optimizer_name)
        if optimizer_name in config:
            mlflow.log_params(config[optimizer_name])