From f3838febcb2833a38d9b76c7148ca25c0235bd12 Mon Sep 17 00:00:00 2001
From: Enrique HuescaSantiago <ehuescasantiago@km3net.de>
Date: Mon, 11 Feb 2019 16:23:05 +0100
Subject: [PATCH] Fix

---
 .../make_data_split/example_config.toml      | 189 ++++++++++--------
 .../make_data_split/make_data_split.py       |  21 +-
 2 files changed, 117 insertions(+), 93 deletions(-)

diff --git a/orcasong_contrib/data_tools/make_data_split/example_config.toml b/orcasong_contrib/data_tools/make_data_split/example_config.toml
index cded742..d8d02c8 100644
--- a/orcasong_contrib/data_tools/make_data_split/example_config.toml
+++ b/orcasong_contrib/data_tools/make_data_split/example_config.toml
@@ -1,108 +1,127 @@
-# Example configuration file for make_data_split.py
+# --- Example configuration file for make_data_split.py --- #
+# Documentation for every config parameter that is available.
+# Feel free to make a copy and keep only the lines you need!
+# If you don't want to use a parameter, comment out the line or delete it!
 
-# --- Documentation for every config parameter that is available --- #
-#
-# Main Parameters
-# ----------
-# n_files_train : int
+# --- Main Parameters --- #
+
+# -----------------------------------------------------------------------------#
+n_files_train = 5 #int
 # Into how many files the training dataset should be split.
-# If you don't want to have this dataset, comment out the line or delete it!
-# n_files_validate : int
+# This option is needed if one of your input_groups has a run_ids_train range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+n_files_validate = 1 #int
 # Into how many files the validation dataset should be split.
-# If you don't want to have this dataset, comment out the line or delete it!
-# n_files_rest : int
+# This option is needed if one of your input_groups has a run_ids_validate range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+n_files_rest = 1 #int
 # Into how many files the "rest" dataset should be split.
-# If you don't want to have this dataset, comment out the line or delete it!
-# output_file_folder : str
+# This option is needed if one of your input_groups has a run_ids_rest range.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+output_file_folder = "/project/antares/enriqueh/data/working" #str
 # Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
-# output_file_name : str
-# String, that specifies the prefix of the filename of the output .list files.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+output_file_name = "full_test" #str
+# Prefix of the filename of the output .list files.
 # E.g. if = "xyzc_tight_0":
 # xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
-# print_only : bool
-# If only informationa about the input_groups should be printed, and no .list files should be made.
-#
-# Job submission Parameters
-# -------------------------
-# make_qsub_bash_files : bool
-# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate
-# the files in the .list files.
-# submit_jobs : bool
-# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+print_only = false #bool
+# true = Print only information about the input_groups; no .list files are made.
+# -----------------------------------------------------------------------------#
+
+
+# --- Job Submission Parameters --- #
+
+# -----------------------------------------------------------------------------#
+make_qsub_bash_files = true #bool
+# true = Make the cluster submission bash files needed to actually
+# concatenate the files in the .list files.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+submit_jobs = false #bool
+# true = Submit the bash job scripts to the cluster after they have been made.
 # CAREFUL: May only work for Erlangen-CC.
-# venv_path : str
-# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
-# data_tools_folder : str
-# Dirpath, where the concatenate.py tool is located.
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+venv_path = "/project/antares/enriqueh/gpu_venv3/" #str
+# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
+# /project/antares/enriqueh/gpu_venv3/bin/activate
+# -----------------------------------------------------------------------------#
+
+# -----------------------------------------------------------------------------#
+data_tools_folder = "/project/antares/enriqueh/OrcaNet/orcanet_contrib/data_tools" #str
+# Dirpath where the concatenate.py tool is located.
 # E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
-# chunksize : int
-# Chunksize parameter, that is used when calling concatenate.py
-# complib : str
-# Complib parameter, that is used when calling concatenate.py
-# complevel : int
-# Complevel parameter, that is used when calling concatenate.py
-# shuffle_delete : bool
-# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be
-# deleted after the shuffling is finished.
-#
-# Input Group Parameters
-# ----------------------
-# dir : str
-# Path of the directory, where the files for this input group are located.
-# run_ids_train/run_ids_validate/run_ids_rest : array
-# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest
-# dataset of this input group.
-# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
-# to the training/validation/rest dataset.
-# If you don't want to use a specific dataset for this input group, comment out the line or delete it!
-#
-# --- Documentation for every config parameter that is available --- #
+# -----------------------------------------------------------------------------#
 
-# --- Main options ---#
+# -----------------------------------------------------------------------------#
+shuffle_delete = false #bool
+# true = The input file that was shuffled is deleted after the shuffling is finished.
+# Option for the shuffle_h5 tool.
+# -----------------------------------------------------------------------------#
 
-n_files_train = 5
-n_files_validate = 3
-n_files_rest = 1
-output_file_folder = "/home/woody/capn/mppi033h/make_dsplit_test"
-output_file_name = "xyzc_tight_0"
-print_only = false # only print information of your input_groups, don't make any .list files
+# -----------------------------------------------------------------------------#
+# Concatenate.py Parameters
+# If they are commented out, they will be set to None in concatenate.py,
+# and the script will use the configurations that are already in the file.
 
-# --- Main options ---#
+#chunksize = 32 #int
+#complib = "gzip" #str
+#complevel = 1 #int
+#
+# -----------------------------------------------------------------------------#
 
-# --- Options, for submitting jobs to concatenate the .list files. --- #
-make_qsub_bash_files = true
-submit_jobs = false
-venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
-data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
-chunksize = 32
-complib = "gzip"
-complevel = 1
-shuffle_delete = false
+# --- Input Group Parameters: Datafiles to be concatenated --- #
 
-# --- Options, for submitting jobs to concatenate the .list files. --- #
+# -----------------------------------------------------------------------------#
+[elec_cc_3_100]
+# Name of the group; don't use whitespace!
 
-# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
+dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-CC"
+# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
+# Path of the directory where the files for this input group are located.
 
-[input_group_1] # You can assign any name to this, doesnt matter which one. However, don't make whitespaces!!
-dir = "/path/to/the/folder/of/the/data/for/this/input_1/group"
-run_ids_train = [1001, 5000]
-run_ids_validate = [1, 1000]
-run_ids_rest = [5001, 20000]
+run_ids_train = [1, 1000] #array
+run_ids_validate = [1001, 1200]
+#run_ids_rest = [1001, 1300]
+# Arrays with the range of the run_ids that should be used for the
+# training, validation and rest datasets of this input group.
+# E.g. if [1,5] = Files from this input group with run_ids from 1 to 5
+# (including 1 and 5!!) will go to the training/validation/rest dataset.
+# If you don't want to use a specific dataset for this input group,
+# you can comment out the corresponding line or delete it!
+# -----------------------------------------------------------------------------#
+
+# You can have more than 1 input group!!
 
-[input_group_2] # 1 to 1500
-dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
-run_ids_train = [101, 500]
-run_ids_validate = [1, 100]
-#run_ids_rest = [501, 600]
+[elec_nc_3_100]
+# Name of the group; don't use whitespace!
+dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-NC"
+# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
+# Path of the directory where the files for this input group are located.
-[input_group_3] # 1 to 2400
-dir = "/path/to/the/folder/of/the/data/for/this/input_3/group"
-run_ids_train = [601, 2400]
-#run_ids_validate = [1, 500] # comment out or delete it, if you dont want it
-run_ids_rest = [501, 600]
+run_ids_train = [1, 1000] #array
+run_ids_validate = [1001, 1188]
 
-# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
+# [input_group_2] # 1 to 1500
+# dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
+# run_ids_train = [101, 500]
+# run_ids_validate = [1, 100]
+# #run_ids_rest = [501, 600]
+# -----------------------------------------------------------------------------#
diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
index 765007d..d52ee30 100644
--- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py
+++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py
@@ -39,6 +39,9 @@ def parse_input():
     cfg = toml.load(config_file)
     cfg['toml_filename'] = config_file
 
+    if 'chunksize' not in cfg: cfg['chunksize'] = None
+    if 'complib' not in cfg: cfg['complib'] = None
+    if 'complevel' not in cfg: cfg['complevel'] = None
 
     return cfg
 
@@ -282,6 +285,10 @@ def make_concatenate_and_shuffle_list_files(cfg):
     if not os.path.exists(dirpath + '/data_split'):  # check if /data_split folder exists, if not create it.
         os.makedirs(dirpath + '/data_split')
 
+    chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
+    complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
+    complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
+
     # make qsub .sh file for concatenating
     for listfile_fpath in cfg['output_lists']:
         listfile_fname = os.path.basename(listfile_fpath)
@@ -303,10 +310,9 @@
 
         f.write('# Concatenate the files in the list\n')
         f.write(
-            'time python concatenate/concatenate_h5.py'
-            + ' --chunksize ' + str(cfg['chunksize'])
-            + ' --complib ' + str(cfg['complib'])
-            + ' --complevel ' + str(cfg['complevel'])
+            'python concatenate_h5.py' #Nikhef
+#            'time python concatenate_h5.py'
+            + chunksize + complib + complevel
             + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath)
 
         if cfg['submit_jobs'] is True:
@@ -336,11 +342,10 @@
 
         f.write('# Shuffle the h5 file \n')
         f.write(
-            'time python shuffle/shuffle_h5.py'
+            'python shuffle_h5.py' #Nikhef
+#            'time python shuffle_h5.py'
             + delete_flag_shuffle_tool
-            + ' --chunksize ' + str(cfg['chunksize'])
-            + ' --complib ' + str(cfg['complib'])
-            + ' --complevel ' + str(cfg['complevel'])
+            + chunksize + complib + complevel
             + ' ' + conc_outputfile_fpath)
--
GitLab
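
Note for reviewers: the net effect of the make_data_split.py hunks is that chunksize, complib and complevel become optional config keys. parse_input() now defaults any missing key to None, and the matching flag is simply left out of the generated concatenate/shuffle command lines, so concatenate_h5.py falls back to the settings already stored in the input files. Below is a minimal, self-contained sketch of that pattern; the inline TOML snippet and the train_0.list / out/train_0.h5 names are invented for illustration and are not part of this patch.

import toml

# Hypothetical config: complib is set, chunksize/complevel are left out,
# mirroring the commented-out lines in example_config.toml.
cfg = toml.loads('''
output_file_name = "full_test"
complib = "gzip"
''')

# Missing optional keys default to None, like the new parse_input() does.
for key in ('chunksize', 'complib', 'complevel'):
    if key not in cfg:
        cfg[key] = None

# Each flag string stays empty when its key is None, so the option is
# omitted entirely from the command line.
chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])

cmd = ('python concatenate_h5.py'
       + chunksize + complib + complevel
       + ' -l train_0.list out/train_0.h5')
print(cmd)
# -> python concatenate_h5.py --complib gzip -l train_0.list out/train_0.h5

Building '' instead of None into the concatenation is what lets the f.write(...) calls in make_concatenate_and_shuffle_list_files() remain single expressions with no per-flag branching.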