From 95c7eeff1d8e494912745ca3f90ba31f36b68315 Mon Sep 17 00:00:00 2001
From: ViaFerrata <michimoser@onlinehome.de>
Date: Thu, 7 Feb 2019 15:31:48 +0100
Subject: [PATCH] - Bugfix due to moving the data_tools folder from OrcaNet to
 OrcaSong
- Add configs for regression dataset

---
 ...nfig_bg_classifier_xyz-c_tight-0_100b.toml |   2 +-
 ...nfig_bg_classifier_xyz-t_tight-0_100b.toml |   2 +-
 .../config_regression_xyz-c_tight-1_60b.toml  | 129 ++++++++++++++++++
 .../config_regression_xyz-t_tight-1_60b.toml  | 129 ++++++++++++++++++
 .../make_data_split/make_data_split.py        |   4 +-
 5 files changed, 262 insertions(+), 4 deletions(-)
 create mode 100644 orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
 create mode 100644 orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml

diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
index 89e5c9e..c1354b6 100644
--- a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
@@ -75,7 +75,7 @@ print_only = false # only print information of your input_groups, don't make any
 make_qsub_bash_files = true
 submit_jobs = false
 venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
 chunksize = 32
 complib = "gzip"
 complevel = 1
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
index 072093e..0f22d67 100644
--- a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
@@ -75,7 +75,7 @@ print_only = false # only print information of your input_groups, don't make any
 make_qsub_bash_files = true
 submit_jobs = false
 venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
 chunksize = 32
 complib = "gzip"
 complevel = 1
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
new file mode 100644
index 0000000..8373e08
--- /dev/null
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
@@ -0,0 +1,129 @@
+# Example configuration file for make_data_split.py
+
+# --- Documentation for every config parameter that is available --- #
+#
+# Main Parameters
+# ---------------
+# n_files_train : int
+#    Into how many files the training dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# n_files_validate : int
+#    Into how many files the validation dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# n_files_rest : int
+#    Into how many files the "rest" dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# output_file_folder : str
+#    Path to the folder where all the output .list files (and the bash job scripts) should be stored.
+# output_file_name : str
+#    String that specifies the prefix of the filenames of the output .list files.
+#    E.g. if output_file_name = "xyzc_tight_0":
+#    xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
+# print_only : bool
+#    If only information about the input_groups should be printed, and no .list files should be made.
+#
+# Job submission Parameters
+# -------------------------
+# make_qsub_bash_files : bool
+#    If bash files should be made that can be submitted to a cluster, in order to actually concatenate
+#    the files in the .list files.
+# submit_jobs : bool
+#    In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+#    CAREFUL: May only work for Erlangen-CC.
+# venv_path : str
+#    Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+# data_tools_folder : str
+#    Dirpath where the concatenate.py tool is located.
+#    E.g. "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+# chunksize : int
+#    Chunksize parameter that is used when calling concatenate.py.
+# complib : str
+#    Complib parameter that is used when calling concatenate.py.
+# complevel : int
+#    Complevel parameter that is used when calling concatenate.py.
+# shuffle_delete : bool
+#    Option for the shuffle_h5 tool; specifies if the input file that will be shuffled should be
+#    deleted after the shuffling is finished.
+#
+# Input Group Parameters
+# ----------------------
+# dir : str
+#    Path of the directory where the files for this input group are located.
+# run_ids_train/run_ids_validate/run_ids_rest : array
+#    Array that specifies the range of the run_ids that should be used for the train/validate/rest
+#    dataset of this input group.
+#    E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
+#    into the train/validate/rest dataset.
+#    If you don't want to use a specific dataset for this input group, comment out the line or delete it!
+#
+# --- Documentation for every config parameter that is available --- #
+
+# --- Main options --- #
+
+n_files_train = 10
+n_files_validate = 5
+n_files_rest = 1
+output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/data_splits/xyzc"
+output_file_name = "xyzc_tight_1_60b_regression_dataset"
+print_only = false # only print information of your input_groups, don't make any .list files
+
+# --- Main options --- #
+
+
+# --- Options for submitting jobs to concatenate the .list files. --- #
+
+make_qsub_bash_files = true
+submit_jobs = true
+venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+chunksize = 32
+complib = "gzip"
+complevel = 1
+shuffle_delete = false
+
+# --- Options for submitting jobs to concatenate the .list files. --- #
+
+# --- Input groups : these are the datafiles that should be concatenated --- #
+
+
+[muon_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzc"
+run_ids_train = [721, 2400]
+run_ids_validate = [1, 720]
+
+
+[muon_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzc"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_nc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzc"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_nc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[tau_cc_3_100] # 1 to 1800
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzc"
+run_ids_rest = [1, 1800]
+
+# --- Input groups : these are the datafiles that should be concatenated --- #
\ No newline at end of file
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml
new file mode 100644
index 0000000..80d651d
--- /dev/null
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml
@@ -0,0 +1,129 @@
+# Example configuration file for make_data_split.py
+
+# --- Documentation for every config parameter that is available --- #
+#
+# Main Parameters
+# ---------------
+# n_files_train : int
+#    Into how many files the training dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# n_files_validate : int
+#    Into how many files the validation dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# n_files_rest : int
+#    Into how many files the "rest" dataset should be split.
+#    If you don't want to have this dataset, comment out the line or delete it!
+# output_file_folder : str
+#    Path to the folder where all the output .list files (and the bash job scripts) should be stored.
+# output_file_name : str
+#    String that specifies the prefix of the filenames of the output .list files.
+#    E.g. if output_file_name = "xyzc_tight_0":
+#    xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
+# print_only : bool
+#    If only information about the input_groups should be printed, and no .list files should be made.
+#
+# Job submission Parameters
+# -------------------------
+# make_qsub_bash_files : bool
+#    If bash files should be made that can be submitted to a cluster, in order to actually concatenate
+#    the files in the .list files.
+# submit_jobs : bool
+#    In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+#    CAREFUL: May only work for Erlangen-CC.
+# venv_path : str
+#    Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+# data_tools_folder : str
+#    Dirpath where the concatenate.py tool is located.
+#    E.g. "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+# chunksize : int
+#    Chunksize parameter that is used when calling concatenate.py.
+# complib : str
+#    Complib parameter that is used when calling concatenate.py.
+# complevel : int
+#    Complevel parameter that is used when calling concatenate.py.
+# shuffle_delete : bool
+#    Option for the shuffle_h5 tool; specifies if the input file that will be shuffled should be
+#    deleted after the shuffling is finished.
+#
+# Input Group Parameters
+# ----------------------
+# dir : str
+#    Path of the directory where the files for this input group are located.
+# run_ids_train/run_ids_validate/run_ids_rest : array
+#    Array that specifies the range of the run_ids that should be used for the train/validate/rest
+#    dataset of this input group.
+#    E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
+#    into the train/validate/rest dataset.
+#    If you don't want to use a specific dataset for this input group, comment out the line or delete it!
+#
+# --- Documentation for every config parameter that is available --- #
+
+# --- Main options --- #
+
+n_files_train = 10
+n_files_validate = 5
+n_files_rest = 1
+output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/data_splits/xyzt"
+output_file_name = "xyzt_tight_1_60b_regression_dataset"
+print_only = false # only print information of your input_groups, don't make any .list files
+
+# --- Main options --- #
+
+
+# --- Options for submitting jobs to concatenate the .list files. --- #
+
+make_qsub_bash_files = true
+submit_jobs = true
+venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+chunksize = 32
+complib = "gzip"
+complevel = 1
+shuffle_delete = false
+
+# --- Options for submitting jobs to concatenate the .list files. --- #
+
+# --- Input groups : these are the datafiles that should be concatenated --- #
+
+
+[muon_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzt"
+run_ids_train = [721, 2400]
+run_ids_validate = [1, 720]
+
+
+[muon_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzt"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_nc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzt"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_nc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[tau_cc_3_100] # 1 to 1800
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzt"
+run_ids_rest = [1, 1800]
+
+# --- Input groups : these are the datafiles that should be concatenated --- #
\ No newline at end of file
diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
index b97b10e..8bfe763 100644
--- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py
+++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
@@ -303,7 +303,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
         f.write('# Concatenate the files in the list\n')
         f.write(
-            'time python concatenate_h5.py'
+            'time python concatenate/concatenate_h5.py'
             + ' --chunksize ' + str(cfg['chunksize'])
             + ' --complib ' + str(cfg['complib'])
             + ' --complevel ' + str(cfg['complevel'])
@@ -336,7 +336,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
         f.write('# Shuffle the h5 file \n')
         f.write(
-            'time python shuffle_h5.py'
+            'time python shuffle/shuffle_h5.py'
             + delete_flag_shuffle_tool
             + ' --chunksize ' + str(cfg['chunksize'])
             + ' --complib ' + str(cfg['complib'])
--
GitLab
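
Note on the make_data_split.py hunks: below is a minimal Python sketch of the command strings that the patched code writes into the generated qsub bash scripts, using the cfg values from the regression configs above. It mirrors only the two changed string concatenations; the surrounding file handling is omitted, the full commands in make_data_split.py append further arguments (e.g. the list files) beyond the lines visible in the hunks, the relative paths assume the generated script runs from inside data_tools_folder, and the empty delete_flag_shuffle_tool value is an assumption for shuffle_delete = false.

    # Values as set in the config_regression_*.toml files of this patch.
    cfg = {'chunksize': 32, 'complib': 'gzip', 'complevel': 1}

    # Patched concatenate command: after the move from OrcaNet to OrcaSong,
    # concatenate_h5.py lives in the concatenate/ subfolder of
    # data_tools_folder, so the relative path needs the subfolder prefix.
    concatenate_cmd = (
        'time python concatenate/concatenate_h5.py'
        + ' --chunksize ' + str(cfg['chunksize'])
        + ' --complib ' + str(cfg['complib'])
        + ' --complevel ' + str(cfg['complevel'])
    )

    # Patched shuffle command, analogous for the shuffle/ subfolder.
    # delete_flag_shuffle_tool is built elsewhere in make_data_split.py from
    # the shuffle_delete option; assumed empty here for shuffle_delete = false.
    delete_flag_shuffle_tool = ''
    shuffle_cmd = (
        'time python shuffle/shuffle_h5.py'
        + delete_flag_shuffle_tool
        + ' --chunksize ' + str(cfg['chunksize'])
        + ' --complib ' + str(cfg['complib'])
    )

Without the subfolder prefixes, the generated scripts would look for concatenate_h5.py and shuffle_h5.py at the top level of data_tools_folder, which matched the old OrcaNet layout but not the OrcaSong one; that is the bugfix named in the commit message.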