diff --git a/orcasong_contrib/data_tools/make_data_split/example_config.toml b/orcasong_contrib/data_tools/make_data_split/example_config.toml
index d8d02c8b37447ead96673ea8c0c6b06794ecb978..6d2ca026c4c2d23eee490aca58db8ba0b7c2b762 100644
--- a/orcasong_contrib/data_tools/make_data_split/example_config.toml
+++ b/orcasong_contrib/data_tools/make_data_split/example_config.toml
@@ -3,125 +3,132 @@
 # Feel free to make a copy and keep only the lines you need!
 # If you don't want to have a parameter, comment out the line or delete it!
-# --- Main Parameters ---#
-
-# -----------------------------------------------------------------------------#
-n_files_train = 5 #int
-# Into how many files the training dataset should be split.
-# This option is needed if one of your input_groups has a run_id_train range.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-n_files_validate = 1 #int
-# Into how many files the validation dataset should be split.
-# This option is needed if one of your input_groups has a run_id_validate range.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-n_files_rest = 1 #int
-# Into how many files the "rest" dataset should be split.
-# This option is needed if one of your input_groups has a run_id_rest range.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-output_file_folder = "/project/antares/enriqueh/data/working" #str
-# Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-output_file_name = "full_test" #str
-# Prefix of the filename of the output .list files.
-# E.g. if = "xyzc_tight_0":
-# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-print_only = false #bool
-# true = No .list files are made, only prints information about the input_groups.
-# -----------------------------------------------------------------------------#
-
-
-# --- Job Submission Parameters ---#
-
-# -----------------------------------------------------------------------------#
-make_qsub_bash_files = true #bool
-# true = Makes the cluster submission bash files needed to actually
-# concatenate the files in the .list files.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-submit_jobs = false #bool
-# true = Submit the bash job scripts to the cluster after they have been made.
-# CAREFUL: May only work for Erlangen-CC.
-# -----------------------------------------------------------------------------#
-
-# -----------------------------------------------------------------------------#
-venv_path = "/project/antares/enriqueh/gpu_venv3/" #str
-# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
"/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" -# /project/antares/enriqueh/gpu_venv3/bin/activate -# -----------------------------------------------------------------------------# - -# -----------------------------------------------------------------------------# -data_tools_folder = "/project/antares/enriqueh/OrcaNet/orcanet_contrib/data_tools" #str -# Dirpath, where the concatenate.py tool is located. -# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" -# -----------------------------------------------------------------------------# - -# -----------------------------------------------------------------------------# -shuffle_delete = false #bool -# True = the input file that will be deleted after the shuffling is finished. -# Option for the shuffle_h5 tool. -# -----------------------------------------------------------------------------# - -# -----------------------------------------------------------------------------# -# Concatenate.py Parameters +# --- Main Parameters --- # + +# ----------------------------------------------------------------------------- # +# Into how many files the training dataset should be split. Type: int +# This option is needed if one of your input_groups has a run_id_train range. + +n_files_train = 5 +# ----------------------------------------------------------------------------- # + +# ----------------------------------------------------------------------------- # +# Into how many files the validation dataset should be split. Type: int +# This option is needed if one of your input_groups has a run_id_validate range. + +n_files_validate = 1 +# ----------------------------------------------------------------------------- # + +# ----------------------------------------------------------------------------- # +# Into how many files the "rest" dataset should be split. Type: int +# This option is needed if one of your input_groups has a run_id_rest range. + +n_files_rest = 1 +# ----------------------------------------------------------------------------- # + +# ----------------------------------------------------------------------------- # +# Path to the folder, where all the output .list files +# (and the bash job scripts) should be stored. Type: str + +output_file_folder = "/path/to/output/folder" +# ----------------------------------------------------------------------------- # + +# ----------------------------------------------------------------------------- # +# Prefix of the filename of the output .list files. Type: str +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... + +output_file_name = "output_file_name" +# ----------------------------------------------------------------------------- # + +# ----------------------------------------------------------------------------- # +# true = No .list files are made, only prints information about the input_groups. +# Type: bool + +print_only = false +# ----------------------------------------------------------------------------- # + + +# --- Job Submission Parameters --- # + +# ----------------------------------------------------------------------------- # +# true = Makes the cluster submission bash files needed to actually +# concatenate the files in the .list files. 
+
+make_qsub_bash_files = true
+# ----------------------------------------------------------------------------- #
+
+# ----------------------------------------------------------------------------- #
+# true = Submit the bash job scripts to the cluster after they have been made.
+# CAREFUL: May only work for Erlangen-CC. Type: bool
+
+submit_jobs = false
+# ----------------------------------------------------------------------------- #
+
+# ----------------------------------------------------------------------------- #
+# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+# Type: str
+
+venv_path = "/path/to/your/venv"
+# ----------------------------------------------------------------------------- #
+
+# ----------------------------------------------------------------------------- #
+# Dirpath where the concatenate.py tool is located. Type: str
+# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
+
+data_tools_folder = "/path/to/OrcaNet/orcanet_contrib/data_tools"
+# ----------------------------------------------------------------------------- #
+
+# ----------------------------------------------------------------------------- #
+# true = The input file will be deleted after the shuffling is finished.
+# Option for the shuffle_h5 tool. Type: bool
+
+shuffle_delete = false
+# ----------------------------------------------------------------------------- #
+
+# ----------------------------------------------------------------------------- #
+# Concatenate.py & shuffle.py Parameters
-# If they are commented, it will be set None on concatenate.py,
-# and the script will use the configurations that are already in the file.
+# If they are commented out, they will be passed as None to concatenate.py,
+# and the scripts will use the chunksize/complib/complevel that are
+# already used in the input files.
-#chunksize = 32 #int
-#complib = "gzip" #str
-#complevel = 1 #int
-#
+#chunksize = 32 # Type: int
+#complib = "gzip" # Type: str
+#complevel = 1 # Type: int
-# -----------------------------------------------------------------------------#
+# ----------------------------------------------------------------------------- #
 # --- Input Group Parameters: Datafiles to be concatenated --- #
-# -----------------------------------------------------------------------------#
-[elec_cc_3_100]
-# Name of the group, don't make whitespaces!
+# ----------------------------------------------------------------------------- #
+
+[input_group_1] # Name of the group; don't use whitespace!
-dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-CC"
-# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
-# Path of the directory where the files for this input group are located.
+# Type: str
+# Path of the directory where the files for this input group are located.
+dir = "/path/to/the/folder/of/the/data/for/this/input_1/group"
-run_ids_train = [1, 1000] #array
-run_ids_validate = [1001, 1200]
-#run_ids_rest = [1001, 1300]
-# Arrays with the range of the run_ids that should be used for the
-# training, validation and rest datasets of this input group.
-# E.g. if [1,5] = Files from this input group with run_ids from 1 to 5
-# (including 1 and 5!!) will go to the training/validation/rest dataset.
-# If you don't want to use a specific dataset for this input group,
-# you can comment out the corresponding line or delete it!
+# Arrays with the range of the run_ids that should be used for the
+# training, validation and rest datasets of this input group.
+# E.g. [1, 5]: Files from this input group with run_ids from 1 to 5
+# (including 1 and 5!) will go to the training/validation/rest dataset.
+# If you don't want to use a specific dataset for this input group,
+# you can comment out the corresponding line or delete it!
+run_ids_train = [1, 200]
+run_ids_validate = [201, 1200]
+#run_ids_rest = [1201, 1300]
-# -----------------------------------------------------------------------------#
-# You can have more than 1 input group!!
+# ----------------------------------------------------------------------------- #
+# You can have more than 1 input group!
-[elec_nc_3_100]
-# Name of the group, don't make whitespaces!
+[input_group_2] # Name of the group; don't use whitespace!
-dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-NC"
-# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
-# Path of the directory where the files for this input group are located.
+# Type: str
+# Path of the directory where the files for this input group are located.
+dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
-run_ids_train = [1, 1000] #array
+run_ids_train = [1, 1000]
 run_ids_validate = [1001, 1188]
-# [input_group_2] # 1 to 1500
-# dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
-# run_ids_train = [101, 500]
-# run_ids_validate = [1, 100]
-# #run_ids_rest = [501, 600]
-# -----------------------------------------------------------------------------#
+# ----------------------------------------------------------------------------- #
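
A quick way to sanity-check a config written against the new layout is to load
it and print the resulting split. The snippet below is a minimal sketch, not
part of make_data_split.py: it assumes the third-party "toml" package is
installed, assumes the file is saved as "example_config.toml", and the set of
settings keys is hand-copied from the example above rather than taken from the
tool itself.

    # Sketch: load the example config and report what each input group assigns.
    import toml  # assumption: the third-party "toml" package is installed

    config = toml.load("example_config.toml")

    # Top-level keys that are plain settings rather than [input_group] tables
    # (copied from the example config above).
    settings = {
        "n_files_train", "n_files_validate", "n_files_rest",
        "output_file_folder", "output_file_name", "print_only",
        "make_qsub_bash_files", "submit_jobs", "venv_path",
        "data_tools_folder", "shuffle_delete",
        "chunksize", "complib", "complevel",
    }

    for name, group in config.items():
        if name in settings or not isinstance(group, dict):
            continue  # keep only the input group tables
        print(f"{name}: files taken from {group['dir']}")
        for dataset in ("train", "validate", "rest"):
            bounds = group.get("run_ids_" + dataset)
            if bounds is not None:
                low, high = bounds
                # Ranges are inclusive on both ends: [1, 5] covers runs 1..5.
                print(f"  {dataset}: run_ids {low}-{high} ({high - low + 1} runs)")

For the two groups above, this would list 200 training and 1000 validation
runs for input_group_1, and 1000 training and 188 validation runs for
input_group_2, which is an easy way to spot an unintended train/validate
imbalance before submitting any concatenation jobs.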