diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-1_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-1_60b.toml new file mode 100644 index 0000000000000000000000000000000000000000..b913e8451dd61ce50af6bcdf850d6549336decad --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-1_60b.toml @@ -0,0 +1,129 @@ +# Example configuration file for make_data_split.py + +# --- Documentation for every config parameter that is available --- # +# +# Main Parameters +# ---------- +# n_files_train : int +# Into how many files the training dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_validate : int +# Into how many files the validation dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_rest : int +# Into how many files the "rest" dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# output_file_folder : str +# Path to the folder, where all the output .list files (and the bash job scripts) should be stored. +# output_file_name : str +# String, that specifies the prefix of the filename of the output .list files. +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... +# print_only : bool +# If only informationa about the input_groups should be printed, and no .list files should be made. +# +# Job submission Parameters +# ------------------------- +# make_qsub_bash_files : bool +# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate +# the files in the .list files. +# submit_jobs : bool +# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made. +# CAREFUL: May only work for Erlangen-CC. +# venv_path : str +# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" +# data_tools_folder : str +# Dirpath, where the concatenate.py tool is located. +# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +# chunksize : int +# Chunksize parameter, that is used when calling concatenate.py +# complib : str +# Complib parameter, that is used when calling concatenate.py +# complevel : int +# Complevel parameter, that is used when calling concatenate.py +# shuffle_delete : bool +# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be +# deleted after the shuffling is finished. +# +# Input Group Parameters +# ---------------------- +# dir : str +# Path of the directory, where the files for this input group are located. +# run_ids_train/run_ids_validate/run_ids_rest : array +# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest +# dataset of this input group. +# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5) +# to the training/validation/rest dataset. +# If you don't want to use a specific dataset for this input group, comment out the line or delete it! +# +# --- Documentation for every config parameter that is available --- # + +# --- Main options ---# + +n_files_train = 10 +n_files_validate = 5 +n_files_rest = 1 +output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/data_splits/xyzt" +output_file_name = "xyzt_tight_1_60b_ts_dataset" +print_only = false # only print information of your input_groups, don't make any .list files + +# --- Main options ---# + + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +make_qsub_bash_files = true +submit_jobs = true +venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env" +data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools" +chunksize = 32 +complib = "gzip" +complevel = 1 +shuffle_delete = true + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # + + +[muon_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/muon-CC/3-100GeV/xyzt" +run_ids_train = [721, 2400] +run_ids_validate = [1, 720] + + +[muon_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/muon-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-CC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_nc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-NC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_nc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-NC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[tau_cc_3_100] # 1 to 1800 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/tau-CC/3-100GeV/xyzt" +run_ids_rest = [1, 1800] + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # \ No newline at end of file diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-2_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-2_60b.toml new file mode 100644 index 0000000000000000000000000000000000000000..5fcc6daab6c1a2beb7a3a47aa265807dc8a3193f --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/configs/config_ts_xyz-t_tight-2_60b.toml @@ -0,0 +1,129 @@ +# Example configuration file for make_data_split.py + +# --- Documentation for every config parameter that is available --- # +# +# Main Parameters +# ---------- +# n_files_train : int +# Into how many files the training dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_validate : int +# Into how many files the validation dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_rest : int +# Into how many files the "rest" dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# output_file_folder : str +# Path to the folder, where all the output .list files (and the bash job scripts) should be stored. +# output_file_name : str +# String, that specifies the prefix of the filename of the output .list files. +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... +# print_only : bool +# If only informationa about the input_groups should be printed, and no .list files should be made. +# +# Job submission Parameters +# ------------------------- +# make_qsub_bash_files : bool +# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate +# the files in the .list files. +# submit_jobs : bool +# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made. +# CAREFUL: May only work for Erlangen-CC. +# venv_path : str +# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" +# data_tools_folder : str +# Dirpath, where the concatenate.py tool is located. +# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +# chunksize : int +# Chunksize parameter, that is used when calling concatenate.py +# complib : str +# Complib parameter, that is used when calling concatenate.py +# complevel : int +# Complevel parameter, that is used when calling concatenate.py +# shuffle_delete : bool +# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be +# deleted after the shuffling is finished. +# +# Input Group Parameters +# ---------------------- +# dir : str +# Path of the directory, where the files for this input group are located. +# run_ids_train/run_ids_validate/run_ids_rest : array +# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest +# dataset of this input group. +# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5) +# to the training/validation/rest dataset. +# If you don't want to use a specific dataset for this input group, comment out the line or delete it! +# +# --- Documentation for every config parameter that is available --- # + +# --- Main options ---# + +n_files_train = 10 +n_files_validate = 5 +n_files_rest = 1 +output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/data_splits/xyzt" +output_file_name = "xyzt_tight_2_60b_ts_dataset" +print_only = false # only print information of your input_groups, don't make any .list files + +# --- Main options ---# + + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +make_qsub_bash_files = true +submit_jobs = true +venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env" +data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools" +chunksize = 32 +complib = "gzip" +complevel = 1 +shuffle_delete = true + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # + + +[muon_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/muon-CC/3-100GeV/xyzt" +run_ids_train = [721, 2400] +run_ids_validate = [1, 720] + + +[muon_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/muon-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-CC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_nc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-NC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_nc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-NC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[tau_cc_3_100] # 1 to 1800 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/tau-CC/3-100GeV/xyzt" +run_ids_rest = [1, 1800] + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # \ No newline at end of file diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py index d52ee300cc7e6e1ed79cd5a6654349feb2c2038e..cce5f2fef4a523dfbf9e09e8503bb803d8e9808a 100644 --- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py +++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py @@ -309,9 +309,7 @@ def make_concatenate_and_shuffle_list_files(cfg): f.write('\n') f.write('# Concatenate the files in the list\n') - f.write( - 'python concatenate_h5.py' #Nikhef -# 'time python concatenate_h5.py' + f.write('time python concatenate/concatenate_h5.py' + chunksize + complib + complevel + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath) @@ -341,9 +339,7 @@ def make_concatenate_and_shuffle_list_files(cfg): f.write('\n') f.write('# Shuffle the h5 file \n') - f.write( - 'python shuffle_h5.py' #Nikhef -# 'time python shuffle_h5.py' + f.write('time python shuffle/shuffle_h5.py' + delete_flag_shuffle_tool + chunksize + complib + complevel + ' ' + conc_outputfile_fpath)