Commit 8474eab5 authored by ViaFerrata

Bugfix for make_data_split.

parent 06d458cf
# Example configuration file for make_data_split.py
# --- Documentation for every config parameter that is available --- #
#
# Main Parameters
# ---------------
# n_files_train : int
#     Into how many files the training dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# n_files_validate : int
#     Into how many files the validation dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# n_files_rest : int
#     Into how many files the "rest" dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# output_file_folder : str
#     Path to the folder where all the output .list files (and the bash job scripts) should be stored.
# output_file_name : str
#     String that specifies the filename prefix of the output .list files.
#     E.g. if = "xyzc_tight_0":
#     xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
# print_only : bool
#     If true, only print information about the input_groups and don't make any .list files.
#
# Job submission Parameters
# -------------------------
# make_qsub_bash_files : bool
#     If true, make bash scripts that can be submitted to a cluster in order to actually concatenate
#     the files in the .list files.
# submit_jobs : bool
#     In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
#     CAREFUL: May only work for Erlangen-CC.
# venv_path : str
#     Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
# data_tools_folder : str
#     Path of the directory where the concatenate.py tool is located.
#     E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
# chunksize : int
#     Chunksize parameter that is used when calling concatenate.py.
# complib : str
#     Complib parameter that is used when calling concatenate.py.
# complevel : int
#     Complevel parameter that is used when calling concatenate.py.
# shuffle_delete : bool
#     Option for the shuffle_h5 tool; specifies whether the input file that will be shuffled should be
#     deleted after the shuffling is finished.
#
# Input Group Parameters
# ----------------------
# dir : str
#     Path of the directory where the files for this input group are located.
# run_ids_train/run_ids_validate/run_ids_rest : array
#     Array that specifies the range of run_ids that should be used for the training/validation/rest
#     dataset of this input group.
#     E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
#     into the training/validation/rest dataset.
#     If you don't want to use a specific dataset for this input group, comment out the line or delete it!
#
# --- Documentation for every config parameter that is available --- #
# --- Main options --- #
n_files_train = 10
n_files_validate = 5
n_files_rest = 1
output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/data_splits/xyzt"
output_file_name = "xyzt_tight_1_60b_ts_dataset"
print_only = false # only print information about your input_groups, don't make any .list files
# --- Main options --- #
# --- Options for submitting jobs that concatenate the files in the .list files --- #
make_qsub_bash_files = true
submit_jobs = true
venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
chunksize = 32
complib = "gzip"
complevel = 1
shuffle_delete = true
# --- Options for submitting jobs that concatenate the files in the .list files --- #
# --- Input groups: the data files that should be concatenated into the datasets --- #
[muon_cc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/muon-CC/3-100GeV/xyzt"
run_ids_train = [721, 2400]
run_ids_validate = [1, 720]
[muon_cc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/muon-CC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[elec_cc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-CC/3-100GeV/xyzt"
run_ids_train = [361, 1200]
run_ids_validate = [1, 360]
[elec_cc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-CC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[elec_nc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-NC/3-100GeV/xyzt"
run_ids_train = [361, 1200]
run_ids_validate = [1, 360]
[elec_nc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/elec-NC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[tau_cc_3_100] # 1 to 1800
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_1_60b_ts_classifier/tau-CC/3-100GeV/xyzt"
run_ids_rest = [1, 1800]
# --- Input groups: the data files that should be concatenated into the datasets --- #
\ No newline at end of file
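The run_ids ranges above are inclusive on both ends, so e.g. run_ids_validate = [1, 720] selects runs 1 through 720. Below is a minimal Python sketch of how such a split could be computed; the filename pattern and helper names are hypothetical, and the actual matching logic in make_data_split.py may differ:

import re

def in_range(run_id, bounds):
    lo, hi = bounds
    return lo <= run_id <= hi  # both bounds are inclusive, as documented above

def split_by_run_id(filenames, run_ids_train, run_ids_validate):
    """Sort filenames into train/validate lists by their run id."""
    train, validate = [], []
    for fname in filenames:
        match = re.search(r'run(\d+)', fname)  # hypothetical filename pattern
        if match is None:
            continue  # skip files without a recognizable run id
        run_id = int(match.group(1))
        if in_range(run_id, run_ids_train):
            train.append(fname)
        elif in_range(run_id, run_ids_validate):
            validate.append(fname)
    return train, validate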
# Example configuration file for make_data_split.py
# --- Documentation for every config parameter that is available --- #
#
# Main Parameters
# ---------------
# n_files_train : int
#     Into how many files the training dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# n_files_validate : int
#     Into how many files the validation dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# n_files_rest : int
#     Into how many files the "rest" dataset should be split.
#     If you don't want to have this dataset, comment out the line or delete it!
# output_file_folder : str
#     Path to the folder where all the output .list files (and the bash job scripts) should be stored.
# output_file_name : str
#     String that specifies the filename prefix of the output .list files.
#     E.g. if = "xyzc_tight_0":
#     xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ...
# print_only : bool
#     If true, only print information about the input_groups and don't make any .list files.
#
# Job submission Parameters
# -------------------------
# make_qsub_bash_files : bool
#     If true, make bash scripts that can be submitted to a cluster in order to actually concatenate
#     the files in the .list files.
# submit_jobs : bool
#     In addition to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made.
#     CAREFUL: May only work for Erlangen-CC.
# venv_path : str
#     Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/"
# data_tools_folder : str
#     Path of the directory where the concatenate.py tool is located.
#     E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools"
# chunksize : int
#     Chunksize parameter that is used when calling concatenate.py.
# complib : str
#     Complib parameter that is used when calling concatenate.py.
# complevel : int
#     Complevel parameter that is used when calling concatenate.py.
# shuffle_delete : bool
#     Option for the shuffle_h5 tool; specifies whether the input file that will be shuffled should be
#     deleted after the shuffling is finished.
#
# Input Group Parameters
# ----------------------
# dir : str
#     Path of the directory where the files for this input group are located.
# run_ids_train/run_ids_validate/run_ids_rest : array
#     Array that specifies the range of run_ids that should be used for the training/validation/rest
#     dataset of this input group.
#     E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5)
#     into the training/validation/rest dataset.
#     If you don't want to use a specific dataset for this input group, comment out the line or delete it!
#
# --- Documentation for every config parameter that is available --- #
# --- Main options --- #
n_files_train = 10
n_files_validate = 5
n_files_rest = 1
output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/data_splits/xyzt"
output_file_name = "xyzt_tight_2_60b_ts_dataset"
print_only = false # only print information about your input_groups, don't make any .list files
# --- Main options --- #
# --- Options for submitting jobs that concatenate the files in the .list files --- #
make_qsub_bash_files = true
submit_jobs = true
venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env"
data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaSong/orcasong_contrib/data_tools"
chunksize = 32
complib = "gzip"
complevel = 1
shuffle_delete = true
# --- Options for submitting jobs that concatenate the files in the .list files --- #
# --- Input groups: the data files that should be concatenated into the datasets --- #
[muon_cc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/muon-CC/3-100GeV/xyzt"
run_ids_train = [721, 2400]
run_ids_validate = [1, 720]
[muon_cc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/muon-CC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[elec_cc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-CC/3-100GeV/xyzt"
run_ids_train = [361, 1200]
run_ids_validate = [1, 360]
[elec_cc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-CC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[elec_nc_3_100] # 1 to 2400
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-NC/3-100GeV/xyzt"
run_ids_train = [361, 1200]
run_ids_validate = [1, 360]
[elec_nc_1_5] # 1 to 600
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/elec-NC/1-5GeV/xyzt"
run_ids_train = [181, 600]
run_ids_validate = [1, 180]
[tau_cc_3_100] # 1 to 1800
dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_2_60b_ts_classifier/tau-CC/3-100GeV/xyzt"
run_ids_rest = [1, 1800]
# --- Input groups: the data files that should be concatenated into the datasets --- #
\ No newline at end of file
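Given n_files_train = 10, n_files_validate = 5 and n_files_rest = 1 above, the naming scheme documented in the header ("prefix_dataset_index.list") implies output filenames like the following. A short sketch, assuming that pattern holds:

prefix = "xyzt_tight_2_60b_ts_dataset"
for dataset, n_files in (("train", 10), ("validate", 5), ("rest", 1)):
    for i in range(n_files):
        print("{}_{}_{}.list".format(prefix, dataset, i))
# -> xyzt_tight_2_60b_ts_dataset_train_0.list ... _train_9.list,
#    _validate_0.list ... _validate_4.list, and _rest_0.list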
@@ -309,9 +309,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
         f.write('\n')
         f.write('# Concatenate the files in the list\n')
-        f.write(
-            'python concatenate_h5.py' #Nikhef
-            # 'time python concatenate_h5.py'
+        f.write('time python concatenate/concatenate_h5.py'
                 + chunksize + complib + complevel
                 + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath)
@@ -341,9 +339,7 @@ def make_concatenate_and_shuffle_list_files(cfg):
         f.write('\n')
         f.write('# Shuffle the h5 file \n')
-        f.write(
-            'python shuffle_h5.py' #Nikhef
-            # 'time python shuffle_h5.py'
+        f.write('time python shuffle/shuffle_h5.py'
                 + delete_flag_shuffle_tool
                 + chunksize + complib + complevel
                 + ' ' + conc_outputfile_fpath)
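For context, the fixed lines assemble a single shell command string that is written into the generated bash job script. A minimal sketch of that assembly, assuming (as the string concatenation in the diff suggests) that chunksize, complib and complevel already hold pre-formatted flag fragments; the flag spellings below are illustrative, not the actual concatenate_h5.py interface:

# Hypothetical pre-formatted flag fragments:
chunksize = ' --chunksize 32'
complib = ' --complib gzip'
complevel = ' --complevel 1'
listfile_fpath = 'xyzt_tight_1_60b_ts_dataset_train_0.list'
conc_outputfile_fpath = 'xyzt_tight_1_60b_ts_dataset_train_0.h5'

cmd = ('time python concatenate/concatenate_h5.py'
       + chunksize + complib + complevel
       + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath)
print(cmd)
# -> time python concatenate/concatenate_h5.py --chunksize 32 --complib gzip --complevel 1 -l xyzt_tight_1_60b_ts_dataset_train_0.list xyzt_tight_1_60b_ts_dataset_train_0.h5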