Source code for hyrax.config_utils

import base64
import copy
import datetime
import importlib
import logging
import random
import re
from importlib import util as importlib_util
from pathlib import Path
from typing import Any, Union

import tomlkit
from tomlkit.toml_document import TOMLDocument

# Default configuration file that ships in the same directory as this module.
DEFAULT_CONFIG_FILEPATH = Path(__file__).parent.resolve().joinpath("hyrax_default_config.toml")

# User configuration file looked for in the working directory (evaluated once, at import time).
DEFAULT_USER_CONFIG_FILEPATH = Path.cwd().joinpath("hyrax_config.toml")

# There are only a couple of configuration keys where we would expect to find an
# external library string, so we specify those here.
KEYS_WITH_EXTERNAL_LIBS = ["name", "dataset_class"]

# Module-level logger.
logger = logging.getLogger(__name__)
def config_help(config: TOMLDocument, *args):
    """Print help for the whole config, one table, or the tables containing a key.

    It's a bit difficult to parse through the Tomlkit Table to print just one
    item such that it would include the comments preceding it. For now, we
    support the following cases, and generally print out the entire table for
    the given key.

    Cases:

    - if no args, prints the whole config.
    - if args[0] is a top-level name (normally a table), print that whole table.
    - if args[0] is not a top-level name, assume it's a key and search for it;
      print each top-level table that contains it (at any depth) exactly once.
    - if two args are given (table name, key name), print the table when the
      key exists directly in it.

    Parameters
    ----------
    config : TOMLDocument
        A configuration dictionary that will be used to search for specified
        tables and keys.
    args : str
        A variable number of string arguments that specify the table name or key
        to search for in the configuration dictionary.
    """
    # Plain-dict view of the document, used for membership tests and key search.
    config_dict = config.value

    # No tables provided, print the whole config
    if not args:
        print(config.as_string())
        return

    if len(args) > 2:
        print("Too many arguments provided. Expecting 0, 1, or 2 arguments.")
        print("Usage: config.help(['table_name'|'key_name']), config.help('table_name', 'key_name')")
        return

    if len(args) == 2:
        table_name, key_name = args
        section = config_dict.get(table_name)
        # Only a table can contain a key; a scalar top-level value cannot.
        if isinstance(section, dict) and key_name in section:
            print(f"[{table_name}]")
            print(config[table_name].as_string())
        else:
            print(f"Cannot find ['{table_name}']['{key_name}'] in the current configuration.")
        return

    # Exactly one argument from here on.
    name = args[0]

    # Table name provided, print that config table
    if name in config_dict:
        print(f"[{name}]")
        print(config[name].as_string())
        return

    # Not a table name: assume it's a config key and search each top-level table
    # for it. Searching table-by-table (instead of splitting a dotted path)
    # keeps table names that themselves contain dots intact and lists every
    # table once even when the key occurs at several depths inside it.
    tables = [
        table_name
        for table_name, section in config_dict.items()
        if isinstance(section, dict) and find_keys(section, name)
    ]

    if not tables:
        print(f"Could not find '{name}' in the config")
        return

    joined = "], [".join(tables)
    print(f"Found '{name}' in the following config sections: [{joined}]")
    for table_name in tables:
        config_help(config, table_name)
def parse_dotted_key(key: str) -> list[str]:
    """Split a dotted key into its components, keeping quoted sections whole.

    A section wrapped in single or double quotes is treated as one component
    even when it contains dots, which allows names such as 'torch.optim.Adam'
    to be used as a single table name in TOML configuration files.

    Parameters
    ----------
    key : str
        The dotted key to parse. Examples:

        - "model.name" -> ['model', 'name']
        - "'torch.optim.Adam'.lr" -> ['torch.optim.Adam', 'lr']
        - '"torch.optim.Adam".lr' -> ['torch.optim.Adam', 'lr']
        - "optimizer.'torch.optim.Adam'.lr" -> ['optimizer', 'torch.optim.Adam', 'lr']

    Returns
    -------
    list[str]
        A list of key components
    """
    # Alternative 1: a quoted run (group 2 holds the text between the quotes).
    # Alternative 2: a run of characters that are neither dots nor quotes (group 3).
    token_pattern = r"""(['"])(.*?)\1|([^.'"]+)"""

    # For key = '"torch.optim.Adam".lr' the matches carry
    # group(2) == 'torch.optim.Adam' and then group(3) == 'lr'.
    # Empty quoted sections yield no component.
    return [
        component
        for found in re.finditer(token_pattern, key)
        if (component := found.group(2) or found.group(3))
    ]
def find_keys(config: dict[str, Any], key_name: str):
    """Collect the dotted paths of every key named ``key_name`` in a nested dict.

    Parameters
    ----------
    config : dict
        The nested dictionary to search.
    key_name : str
        The name of the key to find.

    Returns
    -------
    list
        A list of matching keys, each written as a dot-joined path from the top
        of ``config``, in depth-first traversal order.
    """

    def _walk(node, prefix):
        # Only dictionaries have keys to inspect; anything else ends the descent.
        if not isinstance(node, dict):
            return
        for name, child in node.items():
            dotted = prefix + "." + name
            if name == key_name:
                # The top-level prefix is empty, so drop the resulting leading dot.
                yield dotted.strip(".")
            yield from _walk(child, dotted)

    return list(_walk(config, ""))
# Attach config_help to TOMLDocument as a `help` attribute so that a loaded
# config can be queried as `config.help(...)`. TOMLDocument does not declare
# this attribute, hence the type-checker suppression.
TOMLDocument.help = config_help # type: ignore
class ConfigManager:
    """A class to manage the runtime configuration for a Hyrax object.

    This class will contain all the logic and methods for reading, merging, and
    validating the runtime configuration.
    """

    # True when we are called from a test, so we maintain isolation from any
    # user config that may be in the cwd of the running test process
    _called_from_test = False

    # Hardcoded set of config keys which we know to contain paths, and we resolve to
    # global paths during initialization in ConfigManager._resolve_config_paths().
    PATH_CONFIG_KEYS = [
        # TODO: external library config defaults
        # However we define config defaults from external libraries ought allow them to designate config keys
        # which contain relative paths. ultimately these should end up on the list and be resolved.
        ["data_set", "filter_catalog"],
        ["general", "data_dir"],
    ]

    def __init__(
        self,
        runtime_config_filepath: Union[Path, str] | None = None,
        default_config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH,
    ):
        """Read the default and user configs and render the merged runtime config.

        Parameters
        ----------
        runtime_config_filepath : Union[Path, str], optional
            Location of a user config file, by default None. See
            ``resolve_runtime_config`` for how None is handled.
        default_config_filepath : Union[Path, str], optional
            Location of the default config file, by default DEFAULT_CONFIG_FILEPATH
        """
        self.hyrax_default_config: TOMLDocument = ConfigManager.read_runtime_config(default_config_filepath)
        self.runtime_config_filepath = ConfigManager.resolve_runtime_config(runtime_config_filepath)

        # resolve_runtime_config() hands back the DEFAULT_CONFIG_FILEPATH object
        # itself when it falls back to the packaged defaults, so an identity check
        # distinguishes "no user config" from a user-supplied path.
        if self.runtime_config_filepath is DEFAULT_CONFIG_FILEPATH:
            self.user_specific_config = TOMLDocument()
        else:
            self.user_specific_config = ConfigManager.read_runtime_config(self.runtime_config_filepath)

        self.config = self._render_config(self.user_specific_config, self.hyrax_default_config)
        self.original_config = copy.deepcopy(self.config)

    @staticmethod
    def _render_config(
        user_specific_config: TOMLDocument | None = None,
        hyrax_default_config: TOMLDocument | None = None,
    ):
        """Merge defaults, external library defaults and the user config.

        Parameters
        ----------
        user_specific_config : TOMLDocument, optional
            The user's configuration. Treated as empty when None.
        hyrax_default_config : TOMLDocument, optional
            The hyrax default configuration. Treated as empty when None.

        Returns
        -------
        dict
            The merged configuration with the keys listed in PATH_CONFIG_KEYS
            resolved. Unless ``config["general"]["dev_mode"]`` is truthy, the
            result is also checked against the merged defaults.
        """
        user_specific_config = user_specific_config if user_specific_config is not None else TOMLDocument()
        hyrax_default_config = hyrax_default_config if hyrax_default_config is not None else TOMLDocument()

        external_library_config_paths = set()
        external_library_config_paths |= ConfigManager._find_external_library_default_config_paths(
            user_specific_config
        )

        # 1) merge all the external library config dictionaries together
        external_default_config = ConfigManager.merge_external_default_configs(external_library_config_paths)

        # 2) merge the external library configs on top of the hyrax defaults
        overall_default_config = ConfigManager.merge_default_configs(
            hyrax_default_config, external_default_config
        )

        # 3) merge the user config on top of the overall defaults
        config = ConfigManager.merge_configs(overall_default_config, user_specific_config)

        ConfigManager._resolve_config_paths(config)
        if not config["general"]["dev_mode"]:
            ConfigManager._validate_runtime_config(config, overall_default_config)

        return config

    def set_config(self, key: str, value: Any):
        """Set a config value at runtime.

        This modifies the in-memory config object. Once the configuration is
        updated, the entire config is re-rendered to ensure that any requested
        external library default configs are incorporated.

        Parameters
        ----------
        key : str
            The dotted key to set, e.g. "model.name" or "'torch.optim.Adam'.lr"
            Quoted sections (using single or double quotes) are treated as single
            key components, allowing for table names like 'torch.optim.Adam'.
        value : Any
            The value to set the key to.
        """
        keys = parse_dotted_key(key)

        # Walk down to the table that holds the final key component.
        d = self.config
        for k in keys[:-1]:
            d = d[k]
        d[keys[-1]] = value

        # Re-render with the modified config acting as the user config and the
        # previously rendered config acting as the defaults.
        self.config = self._render_config(self.config, self.original_config)
        self.original_config = copy.deepcopy(self.config)

    @staticmethod
    def read_runtime_config(config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH) -> TOMLDocument:
        """Read a single toml file and return a TOMLDocument

        Parameters
        ----------
        config_filepath : Union[Path, str], optional
            The path to the config file, by default DEFAULT_CONFIG_FILEPATH

        Returns
        -------
        TOMLDocument
            The contents of the toml file as a tomlkit.TOMLDocument. An empty
            document is returned when the file does not exist.
        """
        config_filepath = Path(config_filepath)
        parsed_dict: TOMLDocument = TOMLDocument()
        if config_filepath.exists():
            # TOML documents are UTF-8; do not depend on the platform's locale encoding.
            with open(config_filepath, "r", encoding="utf-8") as f:
                parsed_dict = tomlkit.load(f)

        return parsed_dict

    @staticmethod
    def _find_external_library_default_config_paths(runtime_config: dict) -> set:
        """Search for external libraries in the runtime configuration and gather the
        libpath specifications so that we can load the default configs for the libraries.

        Parameters
        ----------
        runtime_config : dict
            The runtime configuration as a tomlkit.TOMLDocument.

        Returns
        -------
        set
            A set containing the default configuration Paths for the external
            libraries that are requested in the users configuration file.

        Raises
        ------
        ModuleNotFoundError
            If a requested external library cannot be found or imported.
        RuntimeError
            If an imported external library has no file location from which a
            default config path can be derived.
        """
        default_config_paths = set()

        for key, value in runtime_config.items():
            if isinstance(value, dict):
                default_config_paths |= ConfigManager._find_external_library_default_config_paths(value)
                continue

            # We expect that values we are interested in will be of type string.
            if not (key in KEYS_WITH_EXTERNAL_LIBS and isinstance(value, str) and "." in value):
                continue

            external_library = value.split(".")[0]
            if importlib_util.find_spec(external_library) is None:
                raise ModuleNotFoundError(
                    f"External library {external_library} not found. Check installation."
                )

            try:
                lib = importlib.import_module(external_library)
            except ModuleNotFoundError:
                # Report the library *name*: the module object is not bound when
                # the import itself is what failed.
                logger.error(
                    f"External library {external_library} not found. Please install it before running."
                )
                raise

            lib_file = getattr(lib, "__file__", None)
            if lib_file is None:
                raise RuntimeError(
                    f"External library {external_library} has no file location; "
                    "cannot look for its default_config.toml."
                )

            lib_default_config_path = Path(lib_file).parent / "default_config.toml"
            if lib_default_config_path.exists():
                default_config_paths.add(lib_default_config_path)
            else:
                logger.warning(f"Cannot find default_config.toml for {value}.")

        return default_config_paths

    @staticmethod
    def merge_external_default_configs(external_default_config_paths):
        """Merge the default configurations from external libraries into the overall
        default configuration.

        Parameters
        ----------
        external_default_config_paths : set
            A set containing the default configuration Paths for the external
            libraries that are requested in the users configuration file.

        Returns
        -------
        dict
            The merged overall default configuration including the external
            library defaults.
        """
        overall_default_config = TOMLDocument()

        # Merge all external library default configurations first. Iterate in a
        # stable order so overlapping keys resolve the same way on every run
        # (set iteration order is not stable across processes).
        for path in sorted(external_default_config_paths, key=str):
            logger.info(f"Merging external default config from {path}")
            external_library_config = ConfigManager.read_runtime_config(path)
            overall_default_config = ConfigManager.merge_configs(
                overall_default_config, external_library_config
            )

        return overall_default_config

    @staticmethod
    def merge_default_configs(hyrax_defaults, external_defaults):
        """Merge the default configurations of external libraries on top of the
        Hyrax default configuration.

        Parameters
        ----------
        hyrax_defaults : dict
            The default configuration from hyrax.
        external_defaults : dict
            The default configuration from external libraries.

        Returns
        -------
        dict
            The merged overall default configuration including the external
            library defaults.
        """
        return ConfigManager.merge_configs(hyrax_defaults, external_defaults)

    @staticmethod
    def merge_configs(base_config: dict, overriding_config: dict) -> dict:
        """Merge two config dictionaries with the overriding_config values overriding
        the base_config values.

        Parameters
        ----------
        base_config : dict
            The base configuration with keys that may be overridden by the
            overriding_config.
        overriding_config : dict
            The new configuration values that will override the values in
            base_config.

        Returns
        -------
        dict
            The merged configuration. It starts from ``base_config.copy()``, so
            values that are not merged recursively are the same objects as in
            the inputs.
        """
        final_config = base_config.copy()
        for k, v in overriding_config.items():
            # Sections present on both sides are merged recursively; anything
            # else is taken from the overriding config as-is.
            if k in final_config and isinstance(final_config[k], dict) and isinstance(v, dict):
                final_config[k] = ConfigManager.merge_configs(base_config[k], v)
            else:
                final_config[k] = v

        return final_config

    @staticmethod
    def _validate_runtime_config(runtime_config: dict, default_config: dict):
        """Recursive helper to check that all keys in runtime_config have a default
        in the merged default_config.

        The two arguments passed in must represent the same nesting level of the
        runtime config and all default config parameters respectively.

        Problems are reported with ``logger.warning``; this function does not raise
        for an unknown key or a section/value name clash.

        Parameters
        ----------
        runtime_config : dict
            Nested config dictionary representing the runtime config.
        default_config : dict
            Nested config dictionary representing the defaults
        """
        for key in runtime_config:
            if key not in default_config:
                msg = (
                    f"Runtime config contains key or section '{key}' which has no default defined. "
                    f"All configuration keys and sections must be defined in {DEFAULT_CONFIG_FILEPATH}"
                )
                logger.warning(msg)
                continue

            if isinstance(runtime_config[key], dict):
                if not isinstance(default_config[key], dict):
                    msg = (
                        f"Runtime config contains a section named '{key}' which is the name of a "
                        "value in the default config. Please choose another name for this section."
                    )
                    logger.warning(msg)
                    continue
                ConfigManager._validate_runtime_config(runtime_config[key], default_config[key])

    @staticmethod
    def _resolve_config_paths(runtime_config: dict) -> None:
        """Convert all paths in a runtime config to global paths in the current
        environment.

        Uses the hardcoded list of paths in ConfigManager.PATH_CONFIG_KEYS

        This mutates the config dictionary passed.

        Parameters
        ----------
        runtime_config : dict
            Current runtime config nested dictionary
        """
        for key_spec in ConfigManager.PATH_CONFIG_KEYS:
            # Recursively look up a list of keys.
            current_dict = runtime_config
            current_key = None
            current_val = None
            # At the end of each loop current_* are always the dict, key, and value of the
            # last lookup.
            for key in key_spec:
                current_key = key
                try:
                    current_val = current_dict[key]
                except KeyError:
                    # The key is not configured; nothing to resolve for this spec.
                    break
                if isinstance(current_val, dict):
                    current_dict = current_val
            # On the non-break end of the loop we do path resolution, preserving falsy values
            # as false.
            else:
                new_val = str(Path(current_val).expanduser().resolve()) if current_val else False
                current_dict[current_key] = new_val

    @staticmethod
    def resolve_runtime_config(runtime_config_filepath: Union[Path, str, None] = None) -> Path:
        """Resolve a user-supplied runtime config to where we will actually pull config from.

        #. If a runtime config file is specified, we will use that file.
        #. If no file is specified and there is a file named "hyrax_config.toml" in
           the cwd we will use it.
        #. If no file is specified and there is no file named "hyrax_config.toml" in
           the cwd we will exclusively work off the configuration defaults in the
           packaged "hyrax_default_config.toml" file.

        Parameters
        ----------
        runtime_config_filepath : Union[Path, str, None], optional
            Location of the supplied config file, by default None

        Returns
        -------
        Path
            Path to the configuration file ultimately used for config resolution.
            When we fall back to the package supplied default config file, the Path
            to that file is returned.

        Raises
        ------
        FileNotFoundError
            If a runtime config file is specified but does not exist.
        """
        if isinstance(runtime_config_filepath, str):
            runtime_config_filepath = Path(runtime_config_filepath)

        # If a runtime config file is explicitly specified, validate it exists
        if isinstance(runtime_config_filepath, Path) and not runtime_config_filepath.exists():
            raise FileNotFoundError(f"Cannot find config file {runtime_config_filepath}")

        # If a named config exists in cwd, and no config specified on cmdline, use cwd.
        if (
            runtime_config_filepath is None
            and DEFAULT_USER_CONFIG_FILEPATH.exists()
            and not ConfigManager._called_from_test
        ):
            runtime_config_filepath = DEFAULT_USER_CONFIG_FILEPATH

        if runtime_config_filepath is None:
            runtime_config_filepath = DEFAULT_CONFIG_FILEPATH

        return runtime_config_filepath
def create_results_dir(config: dict, postfix: str) -> Path:
    """Create and return a fresh results directory for this run.

    The directory is created inside the results root (config key
    ``general.results_dir``) and is named ``<timestamp>-<postfix>-<random>``,
    where ``postfix`` is the verb name of the run, e.g. infer, train, etc.

    Parameters
    ----------
    config : dict
        The full runtime configuration for this run
    postfix : str
        The verb name of the run.

    Returns
    -------
    Path
        The path created by this function
    """
    root = Path(config["general"]["results_dir"]).expanduser().resolve()

    # This date format is chosen specifically to create a lexical search order
    # which matches the date order.
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Three random bytes encode to four url-safe ascii characters; they avoid
    # collisions from multiple hyrax processes started in a shared filesystem
    # environment.
    suffix = base64.urlsafe_b64encode(random.randbytes(3)).decode("ascii")

    run_dir = root / f"{stamp}-{postfix}-{suffix}"
    # exist_ok=False: an already existing directory is an error, not something to reuse.
    run_dir.mkdir(parents=True, exist_ok=False)
    return run_dir
[docs] def find_most_recent_results_dir(config: dict, verb: str) -> Path | None: """Find the most recent results directory corresponding to a particular verb This is a best effort search in the currently configured results root. If result directories are created within 1 second of one another this function will return one of the directories but it is undefined which one it will return. This function may return None indicating it could not find a directory matching the query verb """ results_root = Path(config["general"]["results_dir"]).expanduser().resolve() max_timestamp = 0 best_path = None for path in results_root.glob(f"*-{verb}-*"): if path.is_dir(): regex = r"([0-9]{8})-([0-9]{6})-.*" m = re.match(regex, path.name) if m is None: continue timestamp = int(m[1] + m[2]) if timestamp > max_timestamp: max_timestamp = timestamp best_path = path return best_path
def log_runtime_config(
    runtime_config: dict, output_path: Union[Path, str], file_name: str = "runtime_config.toml"
):
    """Log a runtime configuration.

    Parameters
    ----------
    runtime_config : dict
        A dictionary object containing runtime configuration values.
    output_path : Union[Path, str]
        The directory to put the config file in
    file_name : str, Optional
        Optional name for the config file, defaults to "runtime_config.toml"
    """
    # Accept plain strings as well as Path objects for the output directory.
    destination = Path(output_path) / file_name

    # TOML documents are UTF-8; do not depend on the platform's locale encoding.
    with open(destination, "w", encoding="utf-8") as f:
        f.write(tomlkit.dumps(runtime_config))