Skip to content
Snippets Groups Projects
Commit 3521d407 authored by ViaFerrata's avatar ViaFerrata
Browse files

Merge branch 'master' of git.km3net.de:ml/OrcaSong

parents 29bfa60d 8397e533
No related branches found
No related tags found
No related merge requests found
# Example configuration file for make_data_split.py
# --- Example configuration file for make_data_split.py --- #
# Documentation for every config parameter that is available.
# Feel free to make a copy and keep only the lines you need!
# If you don't want to have a parameter, comment out the line or delete it!
# --- Documentation for every config parameter that is available --- #
#
# Main Parameters
# ----------
# n_files_train : int
# --- Main Parameters ---#
# -----------------------------------------------------------------------------#
n_files_train = 5 #int
# Into how many files the training dataset should be split.
# If you don't want to have this dataset, comment out the line or delete it!
# n_files_validate : int
# This option is needed if one of your input_groups has a run_id_train range.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
n_files_validate = 1 #int
# Into how many files the validation dataset should be split.
# If you don't want to have this dataset, comment out the line or delete it!
# n_files_rest : int
# This option is needed if one of your input_groups has a run_id_validate range.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
n_files_rest = 1 #int
# Into how many files the "rest" dataset should be split.
# If you don't want to have this dataset, comment out the line or delete it!
# output_file_folder : str
# This option is needed if one of your input_groups has a run_id_rest range.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
output_file_folder = "/project/antares/enriqueh/data/working" #str
# Path to the folder, where all the output .list files (and the bash job scripts) should be stored.
# output_file_name : str
# String, that specifies the prefix of the filename of the output .list files.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
output_file_name = "full_test" #str
# Prefix of the filename of the output .list files.
# E.g. if = "xyzc_tight_0":
# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
# print_only : bool
# If only information about the input_groups should be printed, and no .list files should be made.
#
# Job submission Parameters
# -------------------------
# make_qsub_bash_files : bool
# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate
# the files in the .list files.
# submit_jobs : bool
# In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
print_only = false #bool
# true = No .list files are made, only prints information about the input_groups.
# -----------------------------------------------------------------------------#
# --- Job Submission Parameters ---#
# -----------------------------------------------------------------------------#
make_qsub_bash_files = true #bool
# true = Makes the cluster submission bash files needed to actually
# concatenate the files in the .list files.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
submit_jobs = false #bool
# true = Submit the bash job scripts to the cluster after they have been made.
# CAREFUL: May only work for Erlangen-CC.
# venv_path : str
# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
# data_tools_folder : str
# Dirpath, where the concatenate.py tool is located.
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
venv_path = "/project/antares/enriqueh/gpu_venv3/" #str
# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
# /project/antares/enriqueh/gpu_venv3/bin/activate
# -----------------------------------------------------------------------------#
# -----------------------------------------------------------------------------#
data_tools_folder = "/project/antares/enriqueh/OrcaNet/orcanet_contrib/data_tools" #str
# Dirpath, where the concatenate.py tool is located.
# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
# chunksize : int
# Chunksize parameter, that is used when calling concatenate.py
# complib : str
# Complib parameter, that is used when calling concatenate.py
# complevel : int
# Complevel parameter, that is used when calling concatenate.py
# shuffle_delete : bool
# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be
# deleted after the shuffling is finished.
#
# Input Group Parameters
# ----------------------
# dir : str
# Path of the directory, where the files for this input group are located.
# run_ids_train/run_ids_validate/run_ids_rest : array
# Array, which specifies the range of the run_ids, that should be used for the training/validation/rest
# dataset of this input group.
# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
# to the training/validation/rest dataset.
# If you don't want to use a specific dataset for this input group, comment out the line or delete it!
#
# --- Documentation for every config parameter that is available --- #
# -----------------------------------------------------------------------------#
# --- Main options ---#
# -----------------------------------------------------------------------------#
shuffle_delete = false #bool
# true = the input file will be deleted after the shuffling is finished.
# Option for the shuffle_h5 tool.
# -----------------------------------------------------------------------------#
n_files_train = 5
n_files_validate = 3
n_files_rest = 1
output_file_folder = "/home/woody/capn/mppi033h/make_dsplit_test"
output_file_name = "xyzc_tight_0"
print_only = false # only print information of your input_groups, don't make any .list files
# -----------------------------------------------------------------------------#
# Concatenate.py Parameters
# If they are commented out, they will be set to None for concatenate.py,
# and the script will use the configurations that are already in the file.
# --- Main options ---#
#chunksize = 32 #int
#complib = "gzip" #str
#complevel = 1 #int
#
# -----------------------------------------------------------------------------#
# --- Options, for submitting jobs to concatenate the .list files. --- #
make_qsub_bash_files = true
submit_jobs = false
venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
chunksize = 32
complib = "gzip"
complevel = 1
shuffle_delete = false
# --- Input Group Parameters: Datafiles to be concatenated --- #
# --- Options, for submitting jobs to concatenate the .list files. --- #
# -----------------------------------------------------------------------------#
[elec_cc_3_100]
# Name of the group, don't use whitespace!
# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-CC"
# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
# Path of the directory where the files for this input group are located.
[input_group_1] # You can assign any name to this, doesn't matter which one. However, don't use whitespace!!
dir = "/path/to/the/folder/of/the/data/for/this/input_1/group"
run_ids_train = [1001, 5000]
run_ids_validate = [1, 1000]
run_ids_rest = [5001, 20000]
run_ids_train = [1, 1000] #array
run_ids_validate = [1001, 1200]
#run_ids_rest = [1001, 1300]
# Arrays with the range of the run_ids that should be used for the
# training, validation and rest datasets of this input group.
# E.g. if [1,5] = Files from this input group with run_ids from 1 to 5
# (including 1 and 5!!) will go to the training/validation/rest dataset.
# If you don't want to use a specific dataset for this input group,
# you can comment out the corresponding line or delete it!
# -----------------------------------------------------------------------------#
# You can have more than 1 input group!!
[input_group_2] # 1 to 1500
dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
run_ids_train = [101, 500]
run_ids_validate = [1, 100]
#run_ids_rest = [501, 600]
[elec_nc_3_100]
# Name of the group, don't use whitespace!
dir = "/dcache/antares/enriqueh/orcasong_output/xyzt/3-100GeV/elec-NC"
# "/path/to/the/folder/of/the/data/for/this/input_1/group" #str
# Path of the directory where the files for this input group are located.
[input_group_3] # 1 to 2400
dir = "/path/to/the/folder/of/the/data/for/this/input_3/group"
run_ids_train = [601, 2400]
#run_ids_validate = [1, 500] # comment out or delete it, if you don't want it
run_ids_rest = [501, 600]
run_ids_train = [1, 1000] #array
run_ids_validate = [1001, 1188]
# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
# [input_group_2] # 1 to 1500
# dir = "/path/to/the/folder/of/the/data/for/this/input_2/group"
# run_ids_train = [101, 500]
# run_ids_validate = [1, 100]
# #run_ids_rest = [501, 600]
# -----------------------------------------------------------------------------#
......@@ -39,6 +39,9 @@ def parse_input():
cfg = toml.load(config_file)
cfg['toml_filename'] = config_file
if 'chunksize' not in cfg: cfg['chunksize'] = None
if 'complib' not in cfg: cfg['complib'] = None
if 'complevel' not in cfg: cfg['complevel'] = None
return cfg
......@@ -282,6 +285,10 @@ def make_concatenate_and_shuffle_list_files(cfg):
if not os.path.exists(dirpath + '/data_split'): # check if /data_split folder exists, if not create it.
os.makedirs(dirpath + '/data_split')
chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
# make qsub .sh file for concatenating
for listfile_fpath in cfg['output_lists']:
listfile_fname = os.path.basename(listfile_fpath)
......@@ -303,10 +310,9 @@ def make_concatenate_and_shuffle_list_files(cfg):
f.write('# Concatenate the files in the list\n')
f.write(
'time python concatenate/concatenate_h5.py'
+ ' --chunksize ' + str(cfg['chunksize'])
+ ' --complib ' + str(cfg['complib'])
+ ' --complevel ' + str(cfg['complevel'])
'python concatenate_h5.py' #Nikhef
# 'time python concatenate_h5.py'
+ chunksize + complib + complevel
+ ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath)
if cfg['submit_jobs'] is True:
......@@ -336,11 +342,10 @@ def make_concatenate_and_shuffle_list_files(cfg):
f.write('# Shuffle the h5 file \n')
f.write(
'time python shuffle/shuffle_h5.py'
'python shuffle_h5.py' #Nikhef
# 'time python shuffle_h5.py'
+ delete_flag_shuffle_tool
+ ' --chunksize ' + str(cfg['chunksize'])
+ ' --complib ' + str(cfg['complib'])
+ ' --complevel ' + str(cfg['complevel'])
+ chunksize + complib + complevel
+ ' ' + conc_outputfile_fpath)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment