From 95c7eeff1d8e494912745ca3f90ba31f36b68315 Mon Sep 17 00:00:00 2001
From: ViaFerrata <michimoser@onlinehome.de>
Date: Thu, 7 Feb 2019 15:31:48 +0100
Subject: [PATCH] - Bugfix due to moving the data_tools folder from OrcaNet to
 OrcaSong - Add configs for regression dataset

---
 ...nfig_bg_classifier_xyz-c_tight-0_100b.toml |   2 +-
 ...nfig_bg_classifier_xyz-t_tight-0_100b.toml |   2 +-
 .../config_regression_xyz-c_tight-1_60b.toml  | 129 ++++++++++++++++++
 .../config_regression_xyz-t_tight-1_60b.toml  | 129 ++++++++++++++++++
 .../make_data_split/make_data_split.py        |   4 +-
 5 files changed, 262 insertions(+), 4 deletions(-)
 create mode 100644 orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
 create mode 100644 orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml

diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
index 89e5c9e..c1354b6 100644
--- a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml
@@ -75,7 +75,7 @@ print_only = false # only print information of your input_groups, don't make any
 make_qsub_bash_files = true
 submit_jobs = false
 venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
 chunksize = 32
 complib = "gzip"
 complevel = 1
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
index 072093e..0f22d67 100644
--- a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml
@@ -75,7 +75,7 @@ print_only = false # only print information of your input_groups, don't make any
 make_qsub_bash_files = true
 submit_jobs = false
 venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
 chunksize = 32
 complib = "gzip"
 complevel = 1
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
new file mode 100644
index 0000000..8373e08
--- /dev/null
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-c_tight-1_60b.toml
@@ -0,0 +1,129 @@
+# Example configuration file for make_data_split.py
+
+# --- Documentation for every config parameter that is available --- #
+#
+#    Main Parameters
+#    ----------
+#    n_files_train : int
+#       Into how many files the training dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    n_files_validate : int
+#       Into how many files the validation dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    n_files_rest : int
+#       Into how many files the "rest" dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    output_file_folder : str
+#       Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
+#    output_file_name : str
+#       String, that specifies the prefix of the filename of the output .list files.
+#       E.g. if = "xyzc_tight_0":
+#       xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
+#    print_only : bool
+#       If true, only information about the input_groups is printed, and no .list files are made.
+#
+#    Job submission Parameters
+#    -------------------------
+#    make_qsub_bash_files : bool
+#       If bash files should be made, that can be submitted to a cluster, in order to actually concatenate
+#       the files in the .list files.
+#    submit_jobs : bool
+#       In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+#       CAREFUL: May only work for Erlangen-CC.
+#    venv_path : str
+#       Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+#    data_tools_folder : str
+#       Dirpath of the data_tools folder that contains the concatenate and shuffle tools.
+#       E.g. "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+#    chunksize : int
+#       Chunksize parameter, that is used when calling concatenate.py
+#    complib : str
+#       Complib parameter, that is used when calling concatenate.py
+#    complevel : int
+#       Complevel parameter, that is used when calling concatenate.py
+#    shuffle_delete : bool
+#       Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be
+#       deleted after the shuffling is finished.
+#
+#    Input Group Parameters
+#    ----------------------
+#    dir : str
+#       Path of the directory, where the files for this input group are located.
+#    run_ids_train/run_ids_validate/run_ids_rest : array
+#       Array, which specifies the range of the run_ids, that should be used for the training/validation/rest
+#       dataset of this input group.
+#       E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
+#       to the training/validation/rest dataset.
+#       If you don't want to use a specific dataset for this input group, comment out the line or delete it!
+#
+# --- Documentation for every config parameter that is available --- #
+
+# --- Main options ---#
+
+n_files_train = 10
+n_files_validate = 5
+n_files_rest = 1
+output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/data_splits/xyzc"
+output_file_name = "xyzc_tight_1_60b_regression_dataset"
+print_only = false # only print information of your input_groups, don't make any .list files
+
+# --- Main options ---#
+
+
+# --- Options, for submitting jobs to concatenate the .list files. --- #
+
+make_qsub_bash_files = true
+submit_jobs = true
+venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+chunksize = 32
+complib = "gzip"
+complevel = 1
+shuffle_delete = false
+
+# --- Options, for submitting jobs to concatenate the .list files. --- #
+
+# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
+
+
+[muon_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzc"
+run_ids_train = [721, 2400]
+run_ids_validate = [1, 720]
+
+
+[muon_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzc"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_nc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzc"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_nc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzc"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[tau_cc_3_100] # 1 to 1800
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzc"
+run_ids_rest = [1, 1800]
+
+# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml
new file mode 100644
index 0000000..80d651d
--- /dev/null
+++ b/orcasong_contrib/data_tools/make_data_split/configs/config_regression_xyz-t_tight-1_60b.toml
@@ -0,0 +1,129 @@
+# Example configuration file for make_data_split.py
+
+# --- Documentation for every config parameter that is available --- #
+#
+#    Main Parameters
+#    ----------
+#    n_files_train : int
+#       Into how many files the training dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    n_files_validate : int
+#       Into how many files the validation dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    n_files_rest : int
+#       Into how many files the "rest" dataset should be split.
+#       If you don't want to have this dataset, comment out the line or delete it!
+#    output_file_folder : str
+#       Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
+#    output_file_name : str
+#       String, that specifies the prefix of the filename of the output .list files.
+#       E.g. if = "xyzc_tight_0":
+#       xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
+#    print_only : bool
+#       If true, only information about the input_groups is printed, and no .list files are made.
+#
+#    Job submission Parameters
+#    -------------------------
+#    make_qsub_bash_files : bool
+#       If bash files should be made, that can be submitted to a cluster, in order to actually concatenate
+#       the files in the .list files.
+#    submit_jobs : bool
+#       In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+#       CAREFUL: May only work for Erlangen-CC.
+#    venv_path : str
+#       Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+#    data_tools_folder : str
+#       Dirpath of the data_tools folder that contains the concatenate and shuffle tools.
+#       E.g. "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+#    chunksize : int
+#       Chunksize parameter, that is used when calling concatenate.py
+#    complib : str
+#       Complib parameter, that is used when calling concatenate.py
+#    complevel : int
+#       Complevel parameter, that is used when calling concatenate.py
+#    shuffle_delete : bool
+#       Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be
+#       deleted after the shuffling is finished.
+#
+#    Input Group Parameters
+#    ----------------------
+#    dir : str
+#       Path of the directory, where the files for this input group are located.
+#    run_ids_train/run_ids_validate/run_ids_rest : array
+#       Array, which specifies the range of the run_ids, that should be used for the training/validation/rest
+#       dataset of this input group.
+#       E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
+#       to the training/validation/rest dataset.
+#       If you don't want to use a specific dataset for this input group, comment out the line or delete it!
+#
+# --- Documentation for every config parameter that is available --- #
+
+# --- Main options ---#
+
+n_files_train = 10
+n_files_validate = 5
+n_files_rest = 1
+output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/data_splits/xyzt"
+output_file_name = "xyzt_tight_1_60b_regression_dataset"
+print_only = false # only print information of your input_groups, don't make any .list files
+
+# --- Main options ---#
+
+
+# --- Options, for submitting jobs to concatenate the .list files. --- #
+
+make_qsub_bash_files = true
+submit_jobs = true
+venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
+data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
+chunksize = 32
+complib = "gzip"
+complevel = 1
+shuffle_delete = false
+
+# --- Options, for submitting jobs to concatenate the .list files. --- #
+
+# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
+
+
+[muon_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzt"
+run_ids_train = [721, 2400]
+run_ids_validate = [1, 720]
+
+
+[muon_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_cc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzt"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_cc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[elec_nc_3_100] # 1 to 2400
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzt"
+run_ids_train = [361, 1200]
+run_ids_validate = [1, 360]
+
+
+[elec_nc_1_5] # 1 to 600
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzt"
+run_ids_train = [181, 600]
+run_ids_validate = [1, 180]
+
+
+[tau_cc_3_100] # 1 to 1800
+dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzt"
+run_ids_rest = [1, 1800]
+
+# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
index b97b10e..8bfe763 100644
--- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py
+++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
@@ -303,7 +303,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
             f.write('# Concatenate the files in the list\n')
 
             f.write(
-                    'time python concatenate_h5.py'
+                    'time python concatenate/concatenate_h5.py'
                     + ' --chunksize ' + str(cfg['chunksize'])
                     + ' --complib ' + str(cfg['complib'])
                     + ' --complevel ' + str(cfg['complevel'])
@@ -336,7 +336,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
             f.write('# Shuffle the h5 file \n')
 
             f.write(
-                    'time python shuffle_h5.py'
+                    'time python shuffle/shuffle_h5.py'
                     + delete_flag_shuffle_tool
                     + ' --chunksize ' + str(cfg['chunksize'])
                     + ' --complib ' + str(cfg['complib'])
-- 
GitLab