Merge branch 'master' of git.km3net.de:ml/OrcaSong

3521d407 · ViaFerrata · 29bfa60d · 8397e533 · 3521d407 · 3521d407
Commit 3521d407 authored 6 years ago by ViaFerrata
--- a/orcasong_contrib/data_tools/make_data_split/example_config.toml
+++ b/orcasong_contrib/data_tools/make_data_split/example_config.toml
-# Example configuration file for make_data_split.py
+# --- Example configuration file for make_data_split.py --- #
+# Documentation for every config parameter that is available.
+# Feel free to make a copy and keep only the lines you need!
+# If you don't want to have a parameter, comment out the line or delete it!

-# --- Documentation for every config parameter that is available --- #
-#
-#    Main Parameters
-#    ----------
-#    n_files_train : int
+# --- Main Parameters ---#
+
+# -----------------------------------------------------------------------------#
+n_files_train = 5                                                          #int
 #       Into how many files the training dataset should be split.
-#       If you don't want to have this dataset, comment out the line or delete it!
-#    n_files_validate : int
+#       This option is needed if one of your input_groups has a run_ids_train range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+n_files_validate = 1                                                        #int
 #       Into how many files the validation dataset should be split.
-#       If you don't want to have this dataset, comment out the line or delete it!
-#    n_files_rest : int
+#       This option is needed if one of your input_groups has a run_ids_validate range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+n_files_rest = 1                                                            #int
 #       Into how many files the "rest" dataset should be split.
-#       If you don't want to have this dataset, comment out the line or delete it!
-#    output_file_folder : str
+#       This option is needed if one of your input_groups has a run_ids_rest range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+output_file_folder = "/project/antares/enriqueh/data/working"           #str
 #       Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
-#    output_file_name : str
-#       String, that specifies the prefix of the filename of the output .list files.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+output_file_name = "full_test"                                           #str
+#      Prefix of the filename of the output .list files.
 #       E.g. if = "xyzc_tight_0":
 #       xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
-#    print_only : bool
-#       If only informationa about the input_groups should be printed, and no .list files should be made.
-#
-#    Job submission Parameters
-#    -------------------------
-#    make_qsub_bash_files : bool
-#       If bash files should be made, that can be submitted to a cluster, in order to actually concatenate
-#       the files in the .list files.
-#    submit_jobs : bool
-#       Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+print_only = false                                                         #bool
+#      true = No .list files are made; only information about the input_groups is printed.
+# -----------------------------------------------------------------------------#
+
+
+# --- Job Submission Parameters ---#
+
+# -----------------------------------------------------------------------------#
+make_qsub_bash_files = true                                                #bool
+#      true = Makes the cluster submission bash files needed to actually
+#       concatenate the files in the .list files.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+submit_jobs = false                                                        #bool
+#      true = Submit the bash job scripts to the cluster after they have been made.
 #       CAREFUL: May only work for Erlangen-CC.
-#    venv_path : str
-#       Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
-#    data_tools_folder : str
-#       Dirpath, where the concatenate.py tool is located.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+venv_path = "/project/antares/enriqueh/gpu_venv3/"              #str
+#      Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+#      /project/antares/enriqueh/gpu_venv3/bin/activate
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+data_tools_folder = "/project/antares/enriqueh/OrcaNet/orcanet_contrib/data_tools"            #str
+#      Dirpath, where the concatenate.py tool is located.
 #       E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
-#    chunksize : int
-#       Chunksize parameter, that is used when calling concatenate.py
-#    complib : str
-#       Complib parameter, that is used when calling concatenate.py
-#    complevel : int
-#       Complevel parameter, that is used when calling concatenate.py
-#    shuffle_delete : bool
-#       Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be
-#       deleted after the shuffling is finished.
-#
-#    Input Group Parameters
-#    ----------------------
-#    dir : str
-#       Path of the directory, where the files for this input group are located.
-#    run_ids_train/run_ids_validate/run_ids_rest : array
-#       Array, which specifies the range of the run_ids, that should be used for the training/validation.rest
-#       dataset of this input group.
-#       E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
-#       to the training/validation/rest dataset.
-#       If you don't want to use a specific dataset for this input group, comment out the line or delete it!
-#
-# --- Documentation for every config parameter that is available --- #
+# -----------------------------------------------------------------------------#

-# --- Main options ---#
+# -----------------------------------------------------------------------------#
+shuffle_delete = false                                                     #bool
+#       true = The input file that is shuffled will be deleted after the shuffling is finished.
+#       Option for the shuffle_h5 tool.
+# -----------------------------------------------------------------------------#

-n_files_train = 5
-n_files_validate = 3
-n_files_rest = 1
-output_file_folder = "/home/woody/capn/mppi033h/make_dsplit_test"
-output_file_name = "xyzc_tight_0"
-print_only = false # only print information of your input_groups, don't make any .list files
+# -----------------------------------------------------------------------------#
+# Concatenate.py Parameters
+# If they are commented out, they are set to None and not passed to concatenate.py,
+# so the script will use the configurations that are already in the file.

-# --- Main options ---#
+#chunksize = 32                                                             #int
+#complib = "gzip"                                                           #str
+#complevel = 1                                                              #int
+#

+# -----------------------------------------------------------------------------#

-# --- Options, for submitting jobs to concatenate the .list files. --- #

-make_qsub_bash_files = true
-submit_jobs = false
-venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
-chunksize = 32
-complib = "gzip"
-complevel = 1
-shuffle_delete = false
+# --- Input Group Parameters: Datafiles to be concatenated --- #

-# --- Options, for submitting jobs to concatenate the .list files. --- #
+# -----------------------------------------------------------------------------#
+[elec_cc_3_100]
+#      Name of the group; it must not contain any whitespace!

-# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
+dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-CC"
+#       "/path/to/the/folder/of/the/data/for/this/input_1/group"              #str
+#       Path of the directory where the files for this input group are located.

-[input_group_1] # You can assign any name to this, doesnt matter which one. However, don't make whitespaces!!
-dir = "/path/to/the/folder/of/the/data/for/this/input_1/group"
-run_ids_train = [1001, 5000]
-run_ids_validate = [1, 1000]
-run_ids_rest = [5001, 20000]
+run_ids_train = [1, 1000]                                              #array
+run_ids_validate = [1001, 1200]
+#run_ids_rest = [1001, 1300]
+#       Arrays with the range of the run_ids that should be used for the
+#       training, validation and rest datasets of this input group.
+#       E.g. [1,5] = Files from this input group with run_ids from 1 to 5
+#      (including 1 and 5!!) will go to the training/validation/rest dataset.
+#       If you don't want to use a specific dataset for this input group,
+#       you can comment out the corresponding line or delete it!

+# -----------------------------------------------------------------------------#
+# You can have more than 1 input group!!

-[input_group_2] # 1 to 1500
-dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
-run_ids_train = [101, 500]
-run_ids_validate = [1, 100]
-#run_ids_rest = [501, 600]
+[elec_nc_3_100]
+#      Name of the group; it must not contain any whitespace!

+dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-NC"
+#       "/path/to/the/folder/of/the/data/for/this/input_1/group"              #str
+#       Path of the directory where the files for this input group are located.

-[input_group_3] # 1 to 2400
-dir = "/path/to/the/folder/of/the/data/for/this/input_3/group"
-run_ids_train = [601, 2400]
-#run_ids_validate = [1, 500] # comment out or delete it, if you dont want it
-run_ids_rest = [501, 600]
+run_ids_train = [1, 1000]                                              #array
+run_ids_validate = [1001, 1188]

-# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
+#     [input_group_2] # 1 to 1500
+#     dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
+#     run_ids_train = [101, 500]
+#     run_ids_validate = [1, 100]
+#     #run_ids_rest = [501, 600]
+# -----------------------------------------------------------------------------#
--- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py
+++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
@@ -39,6 +39,9 @@ def parse_input():
    cfg = toml.load(config_file)
    cfg['toml_filename'] = config_file

+    if 'chunksize' not in cfg: cfg['chunksize'] = None
+    if 'complib' not in cfg: cfg['complib'] = None
+    if 'complevel' not in cfg: cfg['complevel'] = None
    return cfg


@@ -282,6 +285,10 @@ def make_concatenate_and_shuffle_list_files(cfg):
    if not os.path.exists(dirpath + '/data_split'):  # check if /data_split folder exists, if not create it.
        os.makedirs(dirpath + '/data_split')

+    chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
+    complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
+    complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
+
    # make qsub .sh file for concatenating
    for listfile_fpath in cfg['output_lists']:
        listfile_fname = os.path.basename(listfile_fpath)
@@ -303,10 +310,9 @@ def make_concatenate_and_shuffle_list_files(cfg):
            f.write('# Concatenate the files in the list\n')

            f.write(
-                    'time python concatenate/concatenate_h5.py'
-                    + ' --chunksize ' + str(cfg['chunksize'])
-                    + ' --complib ' + str(cfg['complib'])
-                    + ' --complevel ' + str(cfg['complevel'])
+                    'python concatenate_h5.py'              #Nikhef
+#                    'time python concatenate_h5.py'
+                    + chunksize + complib + complevel
                    + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath)

        if cfg['submit_jobs'] is True:
@@ -336,11 +342,10 @@ def make_concatenate_and_shuffle_list_files(cfg):
            f.write('# Shuffle the h5 file \n')

            f.write(
-                    'time python shuffle/shuffle_h5.py'
+                    'python shuffle_h5.py'                  #Nikhef
+#                    'time python shuffle_h5.py'
                    + delete_flag_shuffle_tool
-                    + ' --chunksize ' + str(cfg['chunksize'])
-                    + ' --complib ' + str(cfg['complib'])
-                    + ' --complevel ' + str(cfg['complevel'])
+                    + chunksize + complib + complevel
                    + ' ' + conc_outputfile_fpath)