# Hyrax default configuration
#
# The following is the complete default configuration file for Hyrax. This file is used
# when no custom configuration file is provided. See "Configuration" for how to override
# individual values and "The Hyrax Configuration System" for how Hyrax layers multiple
# config sources.
[general]
# Set to `true` during development to skip checking for default config values
# in external libraries. Use `false` otherwise.
dev_mode = false

# Destination of log messages. "stderr" and "stdout" send messages to the console;
# a path such as "path/to/hyrax.log" sends them to a file.
log_destination = "stderr"

# Lowest log level to emit. Options: "critical", "error", "warning", "info", "debug".
log_level = "info"

# Directory where data is stored.
data_dir = "./data"

# Top-level directory for writing results.
results_dir = "./results"


[download]
# Cut out width in arcseconds.
sw = "22asec"

# Cut out height in arcseconds.
sh = "22asec"

# The filters to download.
filter = ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"]

# The type of data to download.
type = "coadd"

# The data release to download from.
rerun = "pdr3_wide"

# Path to credentials.ini file for the downloader. File contents should be:
# username = "<your username>"
# password = "<your password>"
credentials_file = "./credentials.ini"

# Alternate way to pass credentials to the downloader. Users should prefer a
# credentials.ini file to avoid exposing credentials with source control.
username = false
password = false

# The number of sources to download from the catalog. Default is -1, which
# downloads all sources in the catalog.
num_sources = -1

# The number of concurrent connections to use when downloading data.
concurrent_connections = 4

# The number of seconds between printing download statistics.
stats_print_interval = 60

# The path to the catalog file that defines which cutouts to download.
fits_file = "./catalog.fits"

# The number of seconds to wait before retrying a failed HTTP request.
retry_wait = 30

# How many times to retry a failed HTTP request before moving on to the next one.
retries = 3

# Number of seconds to wait for a full HTTP response from the server.
timeout = 3600

# The number of sky location rectangles to request in a single request.
chunk_size = 990

# Request the image layer from the cutout service.
image = true

# Request the variance layer from the cutout service.
variance = false

# Request the mask layer from the cutout service.
mask = false


[model]
# NOTE: Not every parameter is used by every model. Check the model code before training.

# The name of the model to use. Options are a built-in model class name or an import
# path to an external model, e.g. "HyraxAutoencoder", "user_pkg.model.ExternalModel".
name = ""


[model.HyraxAutoencoder]
# The number of output channels from the first layer.
base_channel_size = 32

# The length of the latent space vector.
latent_dim = 64


[model.HyraxAutoencoderV2]
# The number of output channels from the first layer.
base_channel_size = 32

# The length of the latent space vector.
latent_dim = 64

# The activation function of the final layer.
final_layer = "tanh"

[model.ImageDCAE]
# The number of output channels from the first layer.
base_channel_size = 32

# The length of the latent space vector.
latent_dim = 512

# The activation function of the final layer.
final_layer = "identity"


[model.SimCLR]
# The dimension of the projection head for SimCLR.
projection_dimension = 128

# The scalar temperature parameter for SimCLR's loss function, NTXentLoss.
temperature = 0.5

# The probability of applying horizontal flip augmentation for SimCLR.
horizontal_flip_probability = 0.5

# The parameters for color jitter augmentation for SimCLR:
# [brightness, contrast, saturation, hue]
color_jitter_params = [0.8, 0.8, 0.8, 0.2]

# The probability of applying color jitter augmentation for SimCLR.
color_jitter_probability = 0.8

# The probability of applying grayscale augmentation for SimCLR.
grayscale_probability = 0.2

# The kernel size of Gaussian blur augmentation for SimCLR.
gaussian_blur_kernel_size = 9

# The sigma range used in Gaussian blur augmentation for SimCLR.
gaussian_blur_sigma_range = [0.1, 2.0]


[model.HyraxCNN]
# The number of classes to predict as the output of the model, i.e. 2 would be a
# binary classifier, 10 would predict the 10 classes in the CIFAR dataset.
output_classes = 10


[criterion]
# The name of the built-in criterion to use or the import path to an external criterion.
name = "torch.nn.CrossEntropyLoss"

# Whether to "sum" or "mean" the loss across channels. Only used by HyraxAutoencoderV2.
band_loss_reduction = "mean"


[optimizer]
# The name of the built-in optimizer to use or the import path to an external optimizer.
name = "torch.optim.SGD"


["torch.optim.SGD"]
# Learning rate for the torch.optim.SGD optimizer.
lr = 0.01

# Momentum for the torch.optim.SGD optimizer.
momentum = 0.9

["torch.optim.Adam"]
# Learning rate for the torch.optim.Adam optimizer.
lr = 0.01

[scheduler]
# Name of the learning rate scheduler.
# With gamma = 1, ExponentialLR will keep the learning rate constant.
# https://docs.pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ExponentialLR.html
name = "torch.optim.lr_scheduler.ExponentialLR"

["torch.optim.lr_scheduler.ExponentialLR"]
# The decay multiplier applied on each epoch.
gamma = 1

["torch.optim.lr_scheduler.ConstantLR"]
# NOTE(review): presumably passed through as ConstantLR's `last_epoch` argument — confirm.
last_epoch = -1

[train]
# The name of the file where the model weights will be saved after training.
weights_filename = "example_model.pth"

# The number of epochs to train for.
epochs = 10

# If resuming from a checkpoint, set to the path of the checkpoint file.
# Otherwise set to `false` to start training from the beginning.
resume = false

# Path to a pre-trained model weights file to use as the starting point for fine-tuning.
# If `false`, training starts from randomly initialized weights. Cannot be set with `resume`.
model_weights_file = false

# The data_set split to use when training a model.
split = "train"

# The name of the experiment when logging training results to mlflow.
experiment_name = "notebook"

# The name of the run when logging training results to mlflow.
# If `false`, uses the result directory string, <timestamp>-train-<uid>, as the run name.
run_name = false

# If `true`, shuffle training samples each epoch. Only the train verb uses this option.
shuffle = true


[test]
# The path to the model weights file to use for testing. If `false`, the most recent
# training results (from [train]) will be used.
model_weights_file = false

# The name of the experiment when logging test results to mlflow.
# If not set, uses the experiment name from [train].
# NOTE(review): this default is a non-empty string, so the fallback to the [train]
# experiment name may never apply unless a user overrides it — confirm intent.
experiment_name = "notebook"

# The name of the run when logging test results to mlflow.
# If `false`, uses the result directory string, <timestamp>-test-<uid>, as the run name.
run_name = false



[onnx]

# The operator set version to use when exporting a model. See the following for info:
# https://onnxruntime.ai/docs/reference/compatibility.html#onnx-opset-support
opset_version = 20

# The directory in which to find input model files to convert to ONNX. The converted
# (ONNX) models will be written to this directory as well.
input_model_directory = false


# [data_request]
# Top-level table that defines the dataset(s) used for training, validation, and inference.
# Configure with [data_request.train], [data_request.validate], or [data_request.infer] sections.


[data_set]
# Crop pixel dimensions for images, e.g., [100, 100]. If `false`, scans for the
# smallest image size in [general].data_dir and uses it.
crop_to = false

# Used by HSCDataset, LSSTDataset, and DownloadedLSSTDataset.
# Limit to only particular filters. When `false`, use all filters.
# Options: ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"] for HSC
# Options: ["u", "g", "r", "i", "z", "y"] for LSST
filters = false

# Path to a fits file that specifies object IDs to use from the data stored in
# [general].data_dir. Implementation is data_set class dependent. Use `false` for no filtering.
filter_catalog = false

# The transformation to be applied to images before being passed on to the model.
# This must be a valid Numpy function. Passing `false` will result in no transformations
# (other than cropping) being applied to the images.
transform = "tanh"

# Number to seed with for generating a random split. Use `false` to seed from a
# system source at runtime.
seed = false

# If `true`, cache samples in memory during training to reduce runtime after the
# first epoch. Set to `false` when running inference or on memory-constrained systems.
use_cache = true


# Override the name of the object_id column for FitsImageDataset, HSCDataset and DownloadedLSSTDataset.
object_id_column_name = false

# Override the name of the filter column for FitsImageDataset and HSCDataset.
filter_column_name = false

# Override the name of the filename column for FitsImageDataset and HSCDataset.
filename_column_name = false

# Replace NaN in input data with a value. Modes are `false` for no replacement, or
# "quantile" to replace with a defined quantile of the non-NaN data; see nan_quantile.
nan_mode = false

# When replacing NaN values with a quantile, which quantile of the non-NaN tensor should be used.
nan_quantile = 0.05

# The astropy table to use as a catalog in LSSTDataset and friends.
astropy_table = false

# Semi width in degrees of cutouts made from the butler (17 arcsec).
semi_width_deg = 0.00472

# Semi height in degrees of cutouts made from the butler (17 arcsec).
semi_height_deg = 0.00472


[data_set.HyraxCifarDataset]
# If `true`, download the CIFAR10 training set; otherwise download the test set.
use_training_data = true


[data_set.HyraxRandomDataset]
# Total number of samples produced by the random dataset.
size = 100

# The dimensions of the numpy arrays that will be produced for each sample, represented
# as a list where each element is the size of one dimension.
shape = [2, 5, 5]

# Seed to use for random number generation.
seed = 42

# If a list is provided, the data will be randomly labeled with values from the list.
# If set to `false`, no labels will be included with the data.
provided_labels = [0, 1, 2]

# List of metadata field names. These will be populated with dummy data.
metadata_fields = ["meta_field_1", "meta_field_2"]

# Set this to a positive integer to randomly replace some values with an "invalid" value.
number_invalid_values = 0

# The value to use for invalid values in the data. Must be one of the following:
# "nan", "inf", "-inf", "none" or a float value.
invalid_value_type = "nan"

[data_set.HyraxHATSDataset]
# Extra keyword arguments passed directly to lsdb.open_catalog.
# No keyword arguments are set by default.
[data_set.HyraxHATSDataset.open_catalog_kwargs]

[data_set.NestedPandasDataset]
# Extra keyword arguments passed directly to the selected nested_pandas reader.
# No keyword arguments are set by default; the commented line below is an example.
[data_set.NestedPandasDataset.read_kwargs]
# engine = "auto" # pandas default, but can be "pyarrow", "fastparquet"
# See the pandas.read_parquet API reference for all possible arguments.

[data_set.LanceDBDataset]
# Table name to open inside the LanceDB database.
table_name = false

# Extra keyword arguments passed directly to lancedb.connect.
# No keyword arguments are set by default.
[data_set.LanceDBDataset.connect_kwargs]

# Extra keyword arguments passed directly to db.open_table.
# No keyword arguments are set by default.
[data_set.LanceDBDataset.open_table_kwargs]

[data_set.MultimodalUniverseDataset]
# Hugging Face split name to load (for example: "train", "validation", or "test").
split = "train"

# Maximum number of rows to load from the split. Set to `false` for no explicit limit.
max_samples = 100

# Stream rows from Hugging Face instead of downloading full files.
# If `true`, `max_samples` must be set.
streaming = true


[data_loader]
# The number of data points to load at once (the batch size).
batch_size = 512



[infer]
# The path to the model weights file to use for inference.
model_weights_file = false

# The data_set split to use for inference. Use `false` for the entire dataset.
# NOTE(review): an earlier maintainer note suggests this entry can simply be
# removed — confirm before deleting.
split = "infer"


[vector_db]
# The type of vector db to use. Use `false` to disable the vector database.
name = "chromadb"

# The directory where the vector database will be stored. Use `false` to create
# a new vector database in a timestamped directory. Otherwise set to a path.
vector_db_dir = false

# The path to inference results. Setting to `false` will use the most recent
# inference results.
infer_results_dir = false


[vector_db.chromadb]
# The approximate maximum size of a shard before creating a new one. A smaller
# value will decrease insert times while increasing search times.
shard_size_limit = 65536

# Inserting vectors with more than this many elements logs a warning message. ChromaDB
# performance degrades with vectors of this size. Set to `false` to disable the warning.
vector_size_warning = 10000


[vector_db.qdrant]
# The number of elements in the vectors that will be stored in the vector database.
# This must be the same as the size of the vectors produced by the model.
vector_size = 64


[results]
# Path to inference results to use for visualization and lookups.
# Uses the latest inference run if none is provided.
inference_dir = false


[reduce]
# Name of the reduction algorithm to use.
algorithm = "umap"

# Save the fitted reducer model as a pickle file.
save_fit_model = true

# The number of data points to transform at once with the reduction algorithm.
batch_size = 1024

# Use multiprocessing while transforming with the reduction algorithm (more memory intensive).
parallel = false


[reduce.umap]
# Number of data points used to fit the umap model.
fit_sample_size = 1024

# Path to a pre-existing umap reducer model.
model_path = false


[reduce.umap.kwargs]
# Specify any parameter accepted by https://umap-learn.readthedocs.io/en/latest/api.html#umap
# Dimension of the embedded space.
n_components = 2

# Controls how UMAP balances local versus global structure in the data.
# See official documentation for details.
n_neighbors = 15


[reduce.tsne]
# Placeholder for config values of the tsne model.


[reduce.tsne.kwargs]
# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# Dimension of the embedded space.
n_components = 2

# Related to the number of nearest neighbors that is used in other manifold learning
# algorithms. See official documentation for details.
perplexity = 30.0


[reduce.pca]
# Number of data points used to fit the pca model.
fit_sample_size = 1024

# Path to a pre-existing pca reducer model.
model_path = false


[reduce.pca.kwargs]
# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# Dimension of the embedded space.
n_components = 2


[visualize]

# List of metadata field names to use in the visualizer. Must be available as metadata in your dataset.
fields = []

# Whether to display a panel of randomly chosen images corresponding to the selected points.
display_images = false

# Name of catalog column to use for coloring points in the scatter plot. Use `false` for no coloring.
color_column = false

# Colormap to use for coloring points in the scatter plot when color_column is specified.
cmap = "viridis"

# Only valid for .pt tensor images. Which bands should be loaded for display.
# [0,3,5] would map bands in that order to R,G,B. A single band will be grayscale.
torch_tensor_bands = [3]

# Whether to rasterize the plot. Will break coloring (HoloViews bug).
# Helpful to reduce lag in large datasets.
rasterize_plot = false


[visualize_v2]

# Number of hexagonal bins across the x-axis at any zoom level.
target_bins = 50

# Extra padding around the viewport as a fraction of viewport width/height.
buffer_factor = 0.2

# Plot dimensions in pixels.
plot_width = 1000
plot_height = 1000

# Colormap for hexbin density.
cmap = "Viridis"

# Maximum rows rendered in the selection table.
max_table_rows = 1000

# Number of detail plots per tab row.
num_detail_plots = 6



[engine]

# The directory containing the ONNX model used for inference in production.
model_directory = false


[split]
# Per-group split configuration. Keys correspond to groups in [data_request].
# Values are either a fraction of the group's primary dataset to use, in (0.0, 1.0],
# or a path to a previously generated <group>_split.npz file.
# Groups absent from this table default to 1.0 (use the full dataset).

# RNG seed for shuffling/partitioning. `false` uses config["data_set"]["seed"].
# A non-empty value uses a dedicated generator and does not affect global RNG state.
rng_seed = false


[balance]
# Name of the field identifying each item's class. Must resolve to a
# get_<field> getter on the primary dataset. `false` disables stratification.
field = false

# Data groups whose sampling weights are adjusted to the target distribution.
# An empty list disables rebalancing. See [balance.distribution] for the target.
groups = []

[balance.distribution]
# Maps class label → target fraction in (0.0, 1.0]. An empty table means an equal
# target across all observed classes (uniform). Values must sum to exactly 1.0.
# Example:
# cat = 0.25
# dog = 0.75

[label]
# Optional. Maps human-readable string aliases to the raw values returned by
# get_<balance.field>. Required when balance.distribution is used and
# get_<balance.field> returns non-string values (e.g. integers).
# All balance.distribution keys must appear here.
# Example:
#   cat  = 0
#   dog  = 1
#   bird = 2