Hyrax default configuration

Hyrax default configuration

The following is the complete default configuration file for Hyrax. This file is used when no custom configuration file is provided. See Configuration for how to override individual values and The Hyrax Configuration System for how Hyrax layers multiple config sources.

  1[general]
  2# Set to `true` during development to skip checking for default config values
  3# in external libraries. Use `false` otherwise.
  4dev_mode = false
  5
  6# Destination of log messages. Options: 'stderr', 'stdout' specify the console,
  7# "path/to/hyrax.log" specifies a file.
  8log_destination = "stderr"
  9
 10# Lowest log level to emit. Options: "critical", "error", "warning", "info", "debug".
 11log_level = "info"
 12
 13# Directory where data is stored.
 14data_dir = "./data"
 15
 16# Top-level directory for writing results.
 17results_dir = "./results"
 18
 19
 20[download]
 21# Cut out width in arcseconds.
 22sw = "22asec"
 23
 24# Cut out height in arcseconds.
 25sh = "22asec"
 26
 27# The filters to download.
 28filter = ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"]
 29
 30# The type of data to download.
 31type = "coadd"
 32
 33# The data release to download from.
 34rerun = "pdr3_wide"
 35
 36# Path to credentials.ini file for the downloader. File contents should be:
 37# username = "<your username>"
 38# password = "<your password>"
 39credentials_file = "./credentials.ini"
 40
 41# Alternate way to pass credentials to the downloader. Users should prefer a
 42# credentials.ini file to avoid exposing credentials with source control.
 43username = false
 44password = false
 45
 46# The number of sources to download from the catalog. Default is -1, which
 47# downloads all sources in the catalog.
 48num_sources = -1
 49
 50# The number of concurrent connections to use when downloading data.
 51concurrent_connections = 4
 52
 53# The number of seconds between printing download statistics.
 54stats_print_interval = 60
 55
 56# The path to the catalog file that defines which cutouts to download.
 57fits_file = "./catalog.fits"
 58
 59# The number of seconds to wait before retrying a failed HTTP request.
 60retry_wait = 30
 61
 62# How many times to retry a failed HTTP request before moving on to the next one.
 63retries = 3
 64
 65# Number of seconds to wait for a full HTTP response from the server.
 66timeout = 3600
 67
 68# The number of sky location rectangles to request in a single HTTP request.
 69chunk_size = 990
 70
 71# Request the image layer from the cutout service
 72image = true
 73
 74# Request the variance layer from the cutout service
 75variance = false
 76
 77# Request the mask layer from the cutout service
 78mask = false
 79
 80
 81[model]
 82# NOTE: All parameters are NOT used by all models. Check the model code before training.
 83
 84# The name of the model to use. Options are a built-in model class name or import path
 85# to an external model. e.g. "HyraxAutoencoder", "user_pkg.model.ExternalModel"
 86name = ""
 87
 88
 89[model.HyraxAutoencoder]
 90# The number of output channels from the first layer.
 91base_channel_size = 32
 92
 93# The length of the latent space vector. 
 94latent_dim = 64
 95
 96
 97[model.HyraxAutoencoderV2]
 98# The number of output channels from the first layer.
 99base_channel_size = 32
100
101# The length of the latent space vector. 
102latent_dim = 64
103
104# The activation function of the final layer.
105final_layer = "tanh"
106
107[model.ImageDCAE]
108# The number of output channels from the first layer.
109base_channel_size = 32
110
111# The length of the latent space vector.
112latent_dim = 512
113
114# The activation function of the final layer.
115final_layer = "identity"
116
117
118[model.SimCLR]
119# The dimension of the projection head for SimCLR
120projection_dimension = 128
121
122# The scalar temperature parameter for its loss function, NTXentLoss, for SimCLR
123temperature = 0.5
124
125# The probability of applying horizontal flip augmentation for SimCLR
126horizontal_flip_probability = 0.5
127
128# The parameters for color jitter augmentation for SimCLR
129# [brightness, contrast, saturation, hue]
130color_jitter_params = [0.8, 0.8, 0.8, 0.2]
131
132# The probability of applying color jitter augmentation for SimCLR
133color_jitter_probability = 0.8
134
135# The probability of applying grayscale augmentation for SimCLR
136grayscale_probability = 0.2
137
138# The kernel size of Gaussian blur augmentation for SimCLR
139gaussian_blur_kernel_size = 9
140
141# The sigma range used in Gaussian blur augmentation for SimCLR
142gaussian_blur_sigma_range = [0.1, 2.0]
143
144
145[model.HyraxCNN]
146# The number of classes to predict as the output of the model. i.e. 2 would be a
147# binary classifier, 10 would predict the 10 classes in the CIFAR-10 dataset.
148output_classes = 10
149
150
151[criterion]
152# The name of the built-in criterion to use or the import path to an external criterion
153name = "torch.nn.CrossEntropyLoss"
154
155# Whether to "sum" or "mean" loss across channels. Only used by HyraxAutoencoderV2
156band_loss_reduction = "mean"
157
158
159[optimizer]
160# The name of the built-in optimizer to use or the import path to an external optimizer
161name = "torch.optim.SGD"
162
163
164["torch.optim.SGD"]
165# learning rate for torch.optim.SGD optimizer.
166lr = 0.01
167
168# momentum for torch.optim.SGD optimizer.
169momentum = 0.9
170
171["torch.optim.Adam"]
172# learning rate for torch.optim.Adam optimizer.
173lr = 0.01
174
175[scheduler]
176# name of the learning rate scheduler
177# With gamma=1, ExponentialLR will keep the learning rate constant
178# https://docs.pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ExponentialLR.html
179name = "torch.optim.lr_scheduler.ExponentialLR"
180
181["torch.optim.lr_scheduler.ExponentialLR"]
182# the decay multiplier applied on each epoch
183gamma = 1
184
185["torch.optim.lr_scheduler.ConstantLR"]
186
187last_epoch = -1
188
189[train]
190# The name of the file where the model weights will be saved after training.
191weights_filename = "example_model.pth"
192
193# The number of epochs to train for.
194epochs = 10
195
196# If resuming from a checkpoint, set to the path of the checkpoint file.
197# Otherwise set to `false` to start training from the beginning.
198resume = false
199
200# Path to a pre-trained model weights file to use as the starting point for fine-tuning.
201# If `false`, training starts from randomly initialized weights. Cannot be set with `resume`.
202model_weights_file = false
203
204# The data_set split to use when training a model.
205split = "train"
206
207# The name of the experiment when logging training results to mlflow
208experiment_name = "notebook"
209
210# The name of the run when logging training results to mlflow.
211# If false, uses result directory string, <timestamp>-train-<uid>, as run name.
212run_name = false
213
214# If true, shuffle training samples each epoch. Only the train verb uses this option.
215shuffle = true
216
217
218[test]
219# The path to the model weights file to use for testing. If `false`, the most recent
220# training results (from [train]) will be used.
221model_weights_file = false
222
223# The name of the experiment when logging test results to mlflow.
224# If not set, uses the experiment name from [train].
225experiment_name = "notebook"
226
227# The name of the run when logging test results to mlflow.
228# If false, uses result directory string, <timestamp>-test-<uid>, as run name.
229run_name = false
230
231
232
233[onnx]
234
235# The operator set version to use when exporting a model. See the following for info:
236# https://onnxruntime.ai/docs/reference/compatibility.html#onnx-opset-support
237opset_version = 20
238
239# The directory to find input model files to convert to ONNX. ONNX-ified models
240# will be written to this directory as well.
241input_model_directory = false
242
243
244# [data_request]
245# Top-level table that defines the dataset(s) used for training, validation, and inference.
246# Configure with [data_request.train], [data_request.validate], or [data_request.infer] sections.
247
248
249[data_set]
250# Crop pixel dimensions for images, e.g., [100, 100]. If false, scans for the
251# smallest image size in [general].data_dir and uses it.
252crop_to = false
253
254# Used by HSCDataset, LSSTDataset, and DownloadedLSSTDataset. 
255# Limit to only particular filters. When `false`, use all filters. 
256# Options: ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"] for HSC
257# Options: ["u", "g", "r", "i", "z" , "y"] for LSST
258filters = false
259
260# Path to a fits file that specifies object IDs to use from the data stored in
261# [general].data_dir. Implementation is data_set class dependent. Use `false` for no filtering.
262filter_catalog = false
263
264# The transformation to be applied to images before being passed on to the model
265# This must be a valid Numpy function. Passing false will result in no transformations
266# (other than cropping) being applied to the images.
267transform = "tanh"
268
269# Number to seed with for generating a random split. Use `false` to seed from a
270# system source at runtime.
271seed = false
272
273# If `true`, cache samples in memory during training to reduce runtime after the
274# first epoch. Set to `false` when running inference or on memory-constrained systems.
275use_cache = true
276
277
278# Override the name of the object_id column for FitsImageDataset, HSCDataset and DownloadedLSSTDataset
279object_id_column_name = false
280
281# Override the name of the filter column for FitsImageDataset and HSCDataset
282filter_column_name = false
283
284# Override the name of the filename column for FitsImageDataset and HSCDataset
285filename_column_name = false
286
287# Replace NaN in input data with a value, modes are false for no replacement or "quantile" to replace with a 
288# defined quantile of the non-NaN data, see nan_quantile.
289nan_mode = false
290
291# When replacing NaN values with a quantile, which quantile in the non-nan tensor should be used.
292nan_quantile = 0.05
293
294# The astropy table to use as a catalog in LSSTDataset and friends
295astropy_table = false
296
297# Semi width in degrees of cutouts made from the butler (17 arcsec)
298semi_width_deg = 0.00472
299
300# Semi height in degrees of cutouts made from the butler (17 arcsec)
301semi_height_deg = 0.00472
302
303
304[data_set.HyraxCifarDataset]
305# If `true`, download CIFAR10 training set, otherwise download test set.
306use_training_data = true
307
308
309[data_set.HyraxRandomDataset]
310# Total number of samples produced by the random dataset
311size = 100
312
313# The dimensions of the numpy arrays that will be produced for each sample represented
314# as a list where each element is the size of dimension.
315shape = [2,5,5]
316
317# Seed to use for random number generation
318seed = 42
319
320# If a list is provided, the data will be randomly labeled with values from the list.
321# If set to false, no labels will be included with the data.
322provided_labels = [0, 1, 2]
323
324# List of metadata field names. These will be populated with dummy data.
325metadata_fields = ["meta_field_1", "meta_field_2"]
326
327# Set this to a positive integer to randomly replace some values with an "invalid" value.
328number_invalid_values = 0
329
330# The value to use for invalid values in the data. Must be one of the following:
331# "nan", "inf", "-inf", "none" or a float value.
332invalid_value_type = "nan"
333
334[data_set.HyraxHATSDataset]
335# Extra keyword arguments passed directly to lsdb.open_catalog
336[data_set.HyraxHATSDataset.open_catalog_kwargs]
337
338[data_set.NestedPandasDataset]
339# Extra keyword arguments passed directly to the selected nested_pandas reader.
340[data_set.NestedPandasDataset.read_kwargs]
341# engine = "auto" # pandas default, but can be "pyarrow", "fastparquet"
342# See pandas.read_parquet API reference for all possible arguments.
343
344[data_set.LanceDBDataset]
345# Table name to open inside the LanceDB database.
346table_name = false
347
348# Extra keyword arguments passed directly to lancedb.connect.
349[data_set.LanceDBDataset.connect_kwargs]
350
351# Extra keyword arguments passed directly to db.open_table.
352[data_set.LanceDBDataset.open_table_kwargs]
353
354[data_set.MultimodalUniverseDataset]
355# Hugging Face split name to load (for example: "train", "validation", or "test").
356split = "train"
357
358# Maximum number of rows to load from the split. Set to false for no explicit limit.
359max_samples = 100
360
361# Stream rows from Hugging Face instead of downloading full files.
362# If true, `max_samples` must be set.
363streaming = true
364
365
366[data_loader]
367# The number of data points to load at once.
368batch_size = 512
369
370
371
372[infer]
373# The path to the model weights file to use for inference.
374model_weights_file = false
375
376# >>> I believe that we can simply remove this entry <<<<<<<<<<<<<<<<<<<<<<<<<<<
377# The data_set split to use for inference. Use `false` for entire dataset.
378split = "infer"
379
380
381[vector_db]
382# The type of vector db to use. Use "false" to disable vector database.
383name = "chromadb"
384
385# The directory where the vector database will be stored. Use "false" to create
386# a new vector database in a timestamped directory. Otherwise set to a path.
387vector_db_dir = false
388
389# The path to inference results. Setting to "false" will use the most recent
390# inference results.
391infer_results_dir = false
392
393
394[vector_db.chromadb]
395# The approximate maximum size of a shard before creating a new one. A smaller
396# value will decrease insert times while increasing search times.
397shard_size_limit = 65536
398
399# Inserting vectors with more than this many elements logs a warning message. ChromaDB
400# performance degrades with vectors of this size. Set to "false" to disable warning.
401vector_size_warning = 10000
402
403
404[vector_db.qdrant]
405# The number of elements in the vectors that will be stored in the vector database.
406# This must be the same as the size of the vectors produced by the model.
407vector_size = 64
408
409
410[results]
411# Path to inference results to use for visualization and lookups. Uses latest inference run if none provided.
412inference_dir = false
413
414
415[reduce]
416# Name of the reduction algorithm to use
417algorithm = "umap"
418
419# Save the fitted reducer model as a pickle file 
420save_fit_model = true
421
422# The number of data points to use when transforming with reduction algorithm at once
423batch_size = 1024
424
425# Use multiprocessing when transforming with the reduction algorithm (more memory intensive)
426parallel = false
427
428
429[reduce.umap]
430# Number of data points used to fit the umap model.
431fit_sample_size = 1024
432
433# Path to a pre-existing umap reducer model
434model_path = false
435
436
437[reduce.umap.kwargs]
438# Specify any parameter accepted by https://umap-learn.readthedocs.io/en/latest/api.html#umap
439# Dimension of the embedded space
440n_components = 2
441
442# Controls how UMAP balances local versus global structure in the data.
443# See official documentation for details.
444n_neighbors = 15
445
446
447[reduce.tsne]
448# Placeholder for config values of tsne model
449
450
451[reduce.tsne.kwargs]
452# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
453# Dimension of the embedded space
454n_components = 2
455
456# Related to the number of nearest neighbors used in other manifold learning algorithms.
457# See official documentation for details.
458perplexity = 30.0
459
460
461[reduce.pca]
462# Number of data points used to fit the pca model.
463fit_sample_size = 1024
464
465# Path to a pre-existing pca reducer model
466model_path = false
467
468
469[reduce.pca.kwargs]
470# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#
471# Dimension of the embedded space
472n_components=2
473
474
475[visualize]
476
477# List of metadata field names to use in visualizer. Must be available as metadata in your dataset
478fields = []
479
480# Whether to display a panel of randomly chosen images corresponding to the selected points
481display_images = false
482
483# Name of catalog column to use for coloring points in the scatter plot. Use false for no coloring.
484color_column = false
485
486# Colormap to use for coloring points in the scatter plot when color_column is specified
487cmap = "viridis"
488
489# Only valid for .pt tensor images. Which bands should be loaded for display
490# [0,3,5] would map bands in that order to R,G,B. Single band will be grayscale.
491torch_tensor_bands = [3]
492
493# Whether to rasterize plot. Will break coloring (HoloViews bug).
494# Helpful to reduce lag in large datasets. 
495rasterize_plot = false
496
497
498[visualize_v2]
499
500# Number of hexagonal bins across the x-axis at any zoom level
501target_bins = 50
502
503# Extra padding around the viewport as a fraction of viewport width/height
504buffer_factor = 0.2
505
506# Plot dimensions in pixels
507plot_width = 1000
508plot_height = 1000
509
510# Colormap for hexbin density
511cmap = "Viridis"
512
513# Maximum rows rendered in the selection table
514max_table_rows = 1000
515
516# Number of detail plots per tab row
517num_detail_plots = 6
518
519
520
521[engine]
522
523# The directory containing the ONNX model used for inference in production
524model_directory = false
525
526
527[split]
528# Per-group split configuration.  Keys correspond to groups in [data_request].
529# Values are a fraction of the group's primary dataset to use — (0.0, 1.0]
530# or a path to a previously generated <group>_split.npz file.
531# Groups absent from this table default to 1.0 (use the full dataset).
532
533# RNG seed for shuffling/partitioning. `false` uses config["data_set"]["seed"].
534# A non-empty value uses a dedicated generator and does not affect global RNG state.
535rng_seed = false
536
537
538[balance]
539# Name of the field identifying each item's class.  Must resolve to a
540# get_<field> getter on the primary dataset.  `false` disables stratification.
541field = false
542
543# Data groups whose sampling weights are adjusted to the target distribution.
544# Empty list disables rebalancing.  See [balance.distribution] for the target.
545groups = []
546
547[balance.distribution]
548# Maps class label → target fraction in (0.0, 1.0].  Empty table means equal
549# target across all observed classes (uniform).  Values must sum to exactly 1.0.
550# Example:
551# cat = 0.25
552# dog = 0.75
553
554[label]
555# Optional. Maps human-readable string aliases to the raw values returned by
556# get_<balance.field>.  Required when balance.distribution is used and 
557# get_<balance.field> returns non-string values (e.g. integers).
558# All balance.distribution keys must appear here.
559# Example:
560#   cat  = 0
561#   dog  = 1
562#   bird = 2