diff --git a/orcasong/make_nn_images.py b/orcasong/make_nn_images.py index 5d7112d6f1d236135263a6ed047431c5ace8d5de..531564a61a0fdf11c3571a8194f5d0a869c39cda 100644 --- a/orcasong/make_nn_images.py +++ b/orcasong/make_nn_images.py @@ -477,6 +477,9 @@ def make_nn_images(fname, detx_filepath, config): # Execute Pipeline pipe.drain() + if do2d_plots[0] is True: + pdf_2d_plots.close() + def main(): """ diff --git a/utilities/__init__.py b/orcasong_contrib/__init__.py similarity index 100% rename from utilities/__init__.py rename to orcasong_contrib/__init__.py diff --git a/utilities/timecut_test/__init__.py b/orcasong_contrib/data_tools/__init__.py similarity index 100% rename from utilities/timecut_test/__init__.py rename to orcasong_contrib/data_tools/__init__.py diff --git a/orcasong_contrib/data_tools/concatenate/__init__.py b/orcasong_contrib/data_tools/concatenate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..7c8f0a7e04010537f116c9ad4e2dcc6c8c27c6e1 --- /dev/null +++ b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Concatenates .h5 files. Works only for files where each dataset has the same number of rows.""" + +import h5py +import numpy as np +import math +from argparse import ArgumentParser, RawTextHelpFormatter +import sys +# from memory_profiler import profile # for memory profiling, call with @profile; myfunc() + +__author__ = 'Michael Moser' +__license__ = 'AGPL' +__version__ = '1.0' +__email__ = 'michael.m.moser@fau.de' +__status__ = 'Production' + + +def parse_input(): + """ + Parses the user input in order to return the most important information: + + 1) list of files that should be concatenated + 2) the filepath of the output .h5 file + 3) use custom chunksize or not. + + Returns + ------- + file_list : list + List that contains all filepaths of the input files. + output_filepath : str + String that specifies the filepath (path+name) of the output .h5 file. + chunksize : None/int + Specifies the chunksize for axis_0 in the concatenated output files. + If None, the chunksize is read from the first input file. + Else, a custom chunksize will be used. + complib : None/str + Specifies the compression library that should be used for saving the concatenated output files. + If None, the compression library is read from the first input file. + Else, a custom compression library will be used. + Currently available: 'gzip', or 'lzf'. + complevel : None/int + Specifies the compression level that should be used for saving the concatenated output files. + A compression level is only available for gzip compression, not lzf! + If None, the compression level is read from the first input file. + Else, a custom compression level will be used. + + """ + parser = ArgumentParser(description='E.g. < python concatenate_h5.py file_1 file_2 /path/to/output.h5 > or ' + '< python concatenate_h5.py --list filepaths.txt /path/to/output.h5 >.\n' + 'Concatenates arrays stored in .h5 files for either multiple direct .h5 inputs or a .txt file of .h5 files (--list option).\n' + 'Outputs a new .h5 file with the concatenated arrays. 
This output is chunked!\n' + 'Careful: The folders of one file need to have the same number of rows (axis_0)!\n' + 'Make a .txt file with < find /path/to/files -name "file_x-*.h5" | sort --version-sort > listname.list >\n' + 'Chunksize: By default, the chunksize is set to the chunksize of the first inputfile!', + formatter_class=RawTextHelpFormatter) + + parser.add_argument('files', metavar='file', type=str, nargs='*', help = 'a file that should be concatenated, minimum of two.') + parser.add_argument('output_filepath', metavar='output_filepath', type=str, nargs=1, help='filepath and name of the output .h5 file') + parser.add_argument('-l', '--list', dest='list_file', type=str, + help='filepath of a .list file that contains all .h5 files that should be concatenated') + parser.add_argument('--chunksize', dest='chunksize', type=int, + help='Specify a chunksize value in order to use chunked storage for the concatenated .h5 file.' + ' Otherwise, it will be read from the first input file..') + parser.add_argument('--complib', dest='complib', type=str, + help='Specify a filter that should be used for compression. Either "gzip" or "lzf". ' + 'Otherwise, the filter will be read from the first input file.') + parser.add_argument('--complevel', dest='complevel', type=int, + help='Specify a compression filter strength that should be used for the compression. ' + 'Otherwise, the filter will be read from the first input file. ' + 'Can range from 0 to 9. Has no effect on "lzf" compression.') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + + if args.list_file: + file_list = [line.rstrip('\n') for line in open(args.list_file)] + else: + file_list = [] + for filepath in args.files: + file_list.append(filepath) + + output_filepath = args.output_filepath[0] + + chunksize = None + if args.chunksize: + chunksize = args.chunksize + print('You chose chunksize = ' + str(chunksize)) + + complib = None + if args.complib: + complib = args.complib + print('You chose complib = ' + complib) + + complevel = None + if args.complevel: + complevel = args.complevel + print('You chose complevel = ' + str(complevel)) + + return file_list, output_filepath, chunksize, complib, complevel + + +def get_cum_number_of_rows(file_list): + """ + Returns the cumulative number of rows (axis_0) in a list based on the specified input .h5 files. + + Parameters + ---------- + file_list : list + List that contains all filepaths of the input files. + + Returns + ------- + cum_number_of_rows_list : list + List that contains the cumulative number of rows (i.e. [0,100,200,300,...] if each file has 100 rows). + + """ + total_number_of_rows = 0 + cum_number_of_rows_list = [0] + number_of_rows_list = [] # used for approximating the chunksize + + # Get total number of rows for the files in the list, faster than resizing the dataset in each iteration of the file loop in concatenate_h5_files() + + for file_name in file_list: + f = h5py.File(file_name, 'r') + + # get number of rows from the first folder of the file -> each folder needs to have the same number of rows + f_keys = list(f.keys()) + # remove pytables folders starting with '_i_', because the shape of its first axis does not correspond to the number of events in the file. + # all other folders normally have an axis_0 shape that is equal to the number of events in the file. 
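# Sanity-check sketch for the "same number of rows" requirement stated above, using plain
# h5py; the helper name and filepath are hypothetical, not part of this module:
import h5py
def assert_equal_rows(filepath):
    with h5py.File(filepath, 'r') as f:
        keys = [k for k in f.keys() if '_i_' not in k]
        rows = {k: f[k].shape[0] for k in keys}
        assert len(set(rows.values())) == 1, 'axis_0 lengths differ: ' + str(rows)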
+ f_keys_stripped = [x for x in f_keys if '_i_' not in x] + + total_number_of_rows += f[f_keys_stripped[0]].shape[0] + cum_number_of_rows_list.append(total_number_of_rows) + number_of_rows_list.append(f[f_keys_stripped[0]].shape[0]) + + f.close() + + return cum_number_of_rows_list + + +def get_f_compression_and_chunking(filepath): + """ + Function that gets the used compression library, the compression level (if applicable) + and the chunksize of axis_0 of the first dataset of the file. + + Parameters + ---------- + filepath : str + Filepath of a .hdf5 file. + + Returns + ------- + compression : str + The compression library that has been identified in the input file. E.g. 'gzip', or 'lzf'. + complevel : int + The compression level that has been identified in the input file. + chunksize : None/int + The chunksize of axis_0 that has been indentified in the input file. + + """ + f = h5py.File(filepath, 'r') + + # remove any keys to pytables folders that may be in the file + f_keys_stripped = [x for x in list(f.keys()) if '_i_' not in x] + + compression = f[f_keys_stripped[0]].compression # compression filter + compression_opts = f[f_keys_stripped[0]].compression_opts # filter strength + chunksize = f[f_keys_stripped[0]].chunks[0] # chunksize along axis_0 of the dataset + + return compression, compression_opts, chunksize + + +def concatenate_h5_files(output_filepath, file_list, cum_rows_list, chunksize, complib, complevel): + """ + Function that concatenates hdf5 files based on an output_filepath and a file_list of input files. + + If the files contain group_info and x_indices folders (if the input files are coming from km3pipe output), + the group-id / the index of the x_indices is fixed in order to not get duplicates of group-ids / x-indices. + + Parameters + ---------- + output_filepath : str + String that specifies the filepath (path+name) of the output .h5 file. + file_list : list + List that contains all filepaths of the input files. + cum_rows_list : list + List that contains the cumulative number of rows (i.e. [0,100,200,300,...] if each file has 100 rows). + chunksize : None/int + Specifies the chunksize for axis_0 in the concatenated output files. + If None, the chunksize is read from the first input file. + Else, a custom chunksize will be used. + complib : None/str + Specifies the compression library that should be used for saving the concatenated output files. + If None, the compression library is read from the first input file. + Else, a custom compression library will be used. + Currently available: 'gzip', or 'lzf'. + complevel : None/int + Specifies the compression level that should be used for saving the concatenated output files. + A compression level is only available for gzip compression, not lzf! + If None, the compression level is read from the first input file. + Else, a custom compression level will be used. 
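# Hedged usage sketch: driving the concatenation from Python instead of the CLI, using the
# helpers defined in this module; the file names are made up.
file_list = ['images_run0001.h5', 'images_run0002.h5']
cum_rows = get_cum_number_of_rows(file_list)
# chunksize/complib/complevel left as None, so they are read from the first input file
concatenate_h5_files('images_concat.h5', file_list, cum_rows, None, None, None)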
+ + """ + complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(file_list[0]) + + chunksize = chunksize_f if chunksize is None else chunksize + complib = complib_f if complib is None else complib + complevel = complevel_f if complevel is None else complevel + + if complib == 'lzf': + complevel = None + + file_output = h5py.File(output_filepath, 'w') + + for n, input_file_name in enumerate(file_list): + print('Processing file ' + file_list[n]) + input_file = h5py.File(input_file_name, 'r') + + # create metadata + if 'format_version' in list(input_file.attrs.keys()) and n == 0: + file_output.attrs['format_version'] = input_file.attrs['format_version'] + + for folder_name in input_file: + + if folder_name.startswith('_i_'): + # we ignore datasets that have been created by pytables, don't need them anymore + continue + + if n > 0 and folder_name in ['group_info', 'x_indices', 'y']: + folder_data = input_file[folder_name][()] + # we need to add the current number of the group_id / index in the file_output + # to the group_ids / indices of the file that is to be appended + column_name = 'group_id' if folder_name in ['group_info', 'y'] else 'index' + # add 1 because the group_ids / indices start with 0 + folder_data[column_name] += np.amax(file_output[folder_name][column_name]) + 1 + + else: + folder_data = input_file[folder_name] + + print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype)) + + if n == 0: + # first file; create the dummy dataset with no max shape + maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None + chunks = (chunksize,) + folder_data.shape[1:] + + output_dataset = file_output.create_dataset(folder_name, data=folder_data, maxshape=maxshape, chunks=chunks, + compression=complib, compression_opts=complevel) + + output_dataset.resize(cum_rows_list[-1], axis=0) + + else: + file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data + + file_output.flush() + + print('Output information:') + print('-------------------') + print('The output file contains the following datasets:') + for folder_name in file_output: + print('Dataset ' + folder_name + ' with the following shape, dtype and chunks (first argument' + ' is the chunksize in axis_0): \n' + str(file_output[folder_name].shape) + ' ; ' + + str(file_output[folder_name].dtype) + ' ; ' + str(file_output[folder_name].chunks)) + + file_output.close() + + +def main(): + """ + Main code. Concatenates .h5 files with multiple datasets, where each dataset in one file needs to have the same number of rows (axis_0). + + Gets user input with aid of the parse_input() function. By default, the chunksize for the output .h5 file is automatically computed. + based on the average number of rows per file, in order to eliminate padding (wastes disk space). + For faster I/O, the chunksize should be set by the user depending on the use case. + In deep learning applications for example, the chunksize should be equal to the batch size that is used later on for reading the data. 
+ """ + file_list, output_filepath, chunksize, complib, complevel = parse_input() + cum_rows_list = get_cum_number_of_rows(file_list) + concatenate_h5_files(output_filepath, file_list, cum_rows_list, chunksize, complib, complevel) + + +if __name__ == '__main__': + main() diff --git a/orcasong_contrib/data_tools/make_data_split/__init__.py b/orcasong_contrib/data_tools/make_data_split/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml new file mode 100644 index 0000000000000000000000000000000000000000..89e5c9e563e975eae5b2bc8fdc4cb27c97eaeb13 --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-c_tight-0_100b.toml @@ -0,0 +1,141 @@ +# Example configuration file for make_data_split.py + +# --- Documentation for every config parameter that is available --- # +# +# Main Parameters +# ---------- +# n_files_train : int +# Into how many files the training dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_validate : int +# Into how many files the validation dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_rest : int +# Into how many files the "rest" dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# output_file_folder : str +# Path to the folder, where all the output .list files (and the bash job scripts) should be stored. +# output_file_name : str +# String, that specifies the prefix of the filename of the output .list files. +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... +# print_only : bool +# If only informationa about the input_groups should be printed, and no .list files should be made. +# +# Job submission Parameters +# ------------------------- +# make_qsub_bash_files : bool +# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate +# the files in the .list files. +# submit_jobs : bool +# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made. +# CAREFUL: May only work for Erlangen-CC. +# venv_path : str +# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" +# data_tools_folder : str +# Dirpath, where the concatenate.py tool is located. +# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +# chunksize : int +# Chunksize parameter, that is used when calling concatenate.py +# complib : str +# Complib parameter, that is used when calling concatenate.py +# complevel : int +# Complevel parameter, that is used when calling concatenate.py +# shuffle_delete : bool +# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be +# deleted after the shuffling is finished. +# +# Input Group Parameters +# ---------------------- +# dir : str +# Path of the directory, where the files for this input group are located. +# run_ids_train/run_ids_validate/run_ids_rest : array +# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest +# dataset of this input group. +# E.g. 
if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5) +# to the training/validation/rest dataset. +# If you don't want to use a specific dataset for this input group, comment out the line or delete it! +# +# --- Documentation for every config parameter that is available --- # + +# --- Main options ---# + +n_files_train = 29 +n_files_validate = 13 +n_files_rest = 1 +output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/data_splits/xyzc" +output_file_name = "xyzc_tight_0_100b_bg_classifier_dataset" +print_only = false # only print information of your input_groups, don't make any .list files + +# --- Main options ---# + + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +make_qsub_bash_files = true +submit_jobs = false +venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env" +data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +chunksize = 32 +complib = "gzip" +complevel = 1 +shuffle_delete = false + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # +[mupage] # 1 to 20000 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/mupage/xyzc" +run_ids_train = [15618, 20000] +run_ids_validate = [13741, 15617] +run_ids_rest = [1, 13740] + + +[random_noise] # 1 to 1500 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/random_noise/xyzc" +run_ids_train = [576, 1500] +run_ids_validate = [186, 575] +run_ids_rest = [1, 185] + + +[muon_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzc" +run_ids_train = [721, 2400] +run_ids_validate = [1, 720] + + +[muon_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzc" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzc" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzc" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_nc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzc" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_nc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzc" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[tau_cc_3_100] # 1 to 1800 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzc" +run_ids_rest = [1, 1800] + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # \ No newline at end of file diff --git a/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml new file mode 100644 index 
0000000000000000000000000000000000000000..072093e7323195ffcf5e1e859f136fce8d95f2d5 --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/configs/config_bg_classifier_xyz-t_tight-0_100b.toml @@ -0,0 +1,141 @@ +# Example configuration file for make_data_split.py + +# --- Documentation for every config parameter that is available --- # +# +# Main Parameters +# ---------- +# n_files_train : int +# Into how many files the training dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_validate : int +# Into how many files the validation dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_rest : int +# Into how many files the "rest" dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# output_file_folder : str +# Path to the folder, where all the output .list files (and the bash job scripts) should be stored. +# output_file_name : str +# String, that specifies the prefix of the filename of the output .list files. +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... +# print_only : bool +# If only informationa about the input_groups should be printed, and no .list files should be made. +# +# Job submission Parameters +# ------------------------- +# make_qsub_bash_files : bool +# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate +# the files in the .list files. +# submit_jobs : bool +# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made. +# CAREFUL: May only work for Erlangen-CC. +# venv_path : str +# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" +# data_tools_folder : str +# Dirpath, where the concatenate.py tool is located. +# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +# chunksize : int +# Chunksize parameter, that is used when calling concatenate.py +# complib : str +# Complib parameter, that is used when calling concatenate.py +# complevel : int +# Complevel parameter, that is used when calling concatenate.py +# shuffle_delete : bool +# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be +# deleted after the shuffling is finished. +# +# Input Group Parameters +# ---------------------- +# dir : str +# Path of the directory, where the files for this input group are located. +# run_ids_train/run_ids_validate/run_ids_rest : array +# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest +# dataset of this input group. +# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5) +# to the training/validation/rest dataset. +# If you don't want to use a specific dataset for this input group, comment out the line or delete it! +# +# --- Documentation for every config parameter that is available --- # + +# --- Main options ---# + +n_files_train = 29 +n_files_validate = 13 +n_files_rest = 1 +output_file_folder = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/data_splits/xyzt" +output_file_name = "xyzt_tight_0_100b_bg_classifier_dataset" +print_only = false # only print information of your input_groups, don't make any .list files + +# --- Main options ---# + + +# --- Options, for submitting jobs to concatenate the .list files. 
--- # + +make_qsub_bash_files = true +submit_jobs = false +venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env" +data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +chunksize = 32 +complib = "gzip" +complevel = 1 +shuffle_delete = false + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # +[mupage] # 1 to 20000 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/mupage/xyzt" +run_ids_train = [15618, 20000] +run_ids_validate = [13741, 15617] +run_ids_rest = [1, 13740] + + +[random_noise] # 1 to 1500 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/random_noise/xyzt" +run_ids_train = [576, 1500] +run_ids_validate = [186, 575] +run_ids_rest = [1, 185] + + +[muon_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/3-100GeV/xyzt" +run_ids_train = [721, 2400] +run_ids_validate = [1, 720] + + +[muon_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/muon-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_cc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_cc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-CC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[elec_nc_3_100] # 1 to 2400 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/3-100GeV/xyzt" +run_ids_train = [361, 1200] +run_ids_validate = [1, 360] + + +[elec_nc_1_5] # 1 to 600 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/elec-NC/1-5GeV/xyzt" +run_ids_train = [181, 600] +run_ids_validate = [1, 180] + + +[tau_cc_3_100] # 1 to 1800 +dir = "/home/saturn/capn/mppi033h/Data/input_images/ORCA_2016_115l/tight_0_100b_t_bg_classifier/tau-CC/3-100GeV/xyzt" +run_ids_rest = [1, 1800] + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # \ No newline at end of file diff --git a/orcasong_contrib/data_tools/make_data_split/example_config.toml b/orcasong_contrib/data_tools/make_data_split/example_config.toml new file mode 100644 index 0000000000000000000000000000000000000000..cded7422b15b6354878da9defe688fa6935c3184 --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/example_config.toml @@ -0,0 +1,108 @@ +# Example configuration file for make_data_split.py + +# --- Documentation for every config parameter that is available --- # +# +# Main Parameters +# ---------- +# n_files_train : int +# Into how many files the training dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_validate : int +# Into how many files the validation dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! +# n_files_rest : int +# Into how many files the "rest" dataset should be split. +# If you don't want to have this dataset, comment out the line or delete it! 
+# output_file_folder : str +# Path to the folder, where all the output .list files (and the bash job scripts) should be stored. +# output_file_name : str +# String, that specifies the prefix of the filename of the output .list files. +# E.g. if = "xyzc_tight_0": +# xyzc_tight_0_train_0.list, xyzc_tight_0_validate_0.list, ... +# print_only : bool +# If only informationa about the input_groups should be printed, and no .list files should be made. +# +# Job submission Parameters +# ------------------------- +# make_qsub_bash_files : bool +# If bash files should be made, that can be submitted to a cluster, in order to actually concatenate +# the files in the .list files. +# submit_jobs : bool +# Additionally to make_qsub_bash_files, submit the bash job scripts to the cluster after they have been made. +# CAREFUL: May only work for Erlangen-CC. +# venv_path : str +# Path to a virtualenv, e.g. "/home/hpc/capn/mppi033h/.virtualenv/python_3_env/" +# data_tools_folder : str +# Dirpath, where the concatenate.py tool is located. +# E.g. "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +# chunksize : int +# Chunksize parameter, that is used when calling concatenate.py +# complib : str +# Complib parameter, that is used when calling concatenate.py +# complevel : int +# Complevel parameter, that is used when calling concatenate.py +# shuffle_delete : bool +# Option for the shuffle_h5 tool, specifies if the input file that will be shuffled should be +# deleted after the shuffling is finished. +# +# Input Group Parameters +# ---------------------- +# dir : str +# Path of the directory, where the files for this input group are located. +# run_ids_train/run_ids_validate/run_ids_rest : array +# Array, which specifies the range of the run_ids, that should be used for the training/validation.rest +# dataset of this input group. +# E.g. if [1,5], the script will put files from this input group with run_ids from 1 to 5 (including 1 and 5) +# to the training/validation/rest dataset. +# If you don't want to use a specific dataset for this input group, comment out the line or delete it! +# +# --- Documentation for every config parameter that is available --- # + +# --- Main options ---# + +n_files_train = 5 +n_files_validate = 3 +n_files_rest = 1 +output_file_folder = "/home/woody/capn/mppi033h/make_dsplit_test" +output_file_name = "xyzc_tight_0" +print_only = false # only print information of your input_groups, don't make any .list files + +# --- Main options ---# + + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +make_qsub_bash_files = true +submit_jobs = false +venv_path = "/home/hpc/capn/mppi033h/.virtualenv/python_3_env" +data_tools_folder = "/home/woody/capn/mppi033h/Code/OrcaNet/orcanet_contrib/data_tools" +chunksize = 32 +complib = "gzip" +complevel = 1 +shuffle_delete = false + +# --- Options, for submitting jobs to concatenate the .list files. --- # + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # + +[input_group_1] # You can assign any name to this, doesnt matter which one. However, don't make whitespaces!! 
+dir = "/path/to/the/folder/of/the/data/for/this/input_1/group" +run_ids_train = [1001, 5000] +run_ids_validate = [1, 1000] +run_ids_rest = [5001, 20000] + + +[input_group_2] # 1 to 1500 +dir = "/path/to/the/folder/of/the/data/for/this/input_2/group" +run_ids_train = [101, 500] +run_ids_validate = [1, 100] +#run_ids_rest = [501, 600] + + +[input_group_3] # 1 to 2400 +dir = "/path/to/the/folder/of/the/data/for/this/input_3/group" +run_ids_train = [601, 2400] +#run_ids_validate = [1, 500] # comment out or delete it, if you dont want it +run_ids_rest = [501, 600] + +# --- Input groups : these are the datafiles, that should be concatenated somehow --- # \ No newline at end of file diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py new file mode 100644 index 0000000000000000000000000000000000000000..1ec7b4ba92f1dcb0c22adf71862bd7fc72b11b6c --- /dev/null +++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Utility script that makes .list files for the concatenate_h5.py tool. + +Usage: + make_data_split.py CONFIG + make_data_split.py (-h | --help) + +Arguments: + CONFIG A .toml file which contains the configuration options. + +Options: + -h --help Show this screen. + +""" + +import os +import toml +import docopt +import natsort as ns +import h5py + + +def parse_input(): + """ + Parses the config of the .toml file, specified by the user. + + Returns + ------- + cfg : dict + Dict that contains all configuration options from the input .toml file. + + """ + + args = docopt.docopt(__doc__) + config_file = args['CONFIG'] + + cfg = toml.load(config_file) + cfg['toml_filename'] = config_file + + return cfg + + +def get_all_ip_group_keys(cfg): + """ + Gets the keys of all input groups in the config dict. + + The input groups are defined as the dict elements, where the values have the type of a dict. + + Parameters + ---------- + cfg : dict + Dict that contains all configuration options and additional information. + + Returns + ------- + ip_group_keys : list + List of the input_group keys. + + """ + ip_group_keys = [] + for key in cfg: + if type(cfg[key]) == dict: + ip_group_keys.append(key) + + return ip_group_keys + + +def get_h5_filepaths(dirpath): + """ + Returns the filepaths of all .h5 files that are located in a specific directory. + + Parameters + ---------- + dirpath: str + Path of the directory where the .h5 files are located. + + Returns + ------- + filepaths : list + List with the full filepaths of all .h5 files in the dirpath folder. + + """ + filepaths = [] + for f in os.listdir(dirpath): + if f.endswith('.h5'): + filepaths.append(dirpath + '/' + f) + + filepaths = ns.natsorted(filepaths) # TODO should not be necessary actually! + return filepaths + + +def get_number_of_evts_and_run_ids(list_of_files, dataset_key='y', run_id_col_name='run_id'): + """ + Gets the number of events and the run_ids for all hdf5 files in the list_of_files. + + The number of events is calculated based on the dataset, which is specified with the dataset_key parameter. + + Parameters + ---------- + list_of_files : list + List which contains filepaths to h5 files. + dataset_key : str + String which specifies, which dataset in a h5 file should be used for calculating the number of events. + run_id_col_name : str + String, which specifies the column name of the 'run_id' column. 
+ + Returns + ------- + total_number_of_evts : int + The cumulative (total) number of events. + mean_number_of_evts_per_file : float + The mean number of evts per file. + run_ids : list + List containing the run_ids of the files in the list_of_files. + + """ + + total_number_of_evts = 0 + run_ids = [] + + for i, fpath in enumerate(list_of_files): + f = h5py.File(fpath, 'r') + + dset = f[dataset_key] + n_evts = dset.shape[0] + total_number_of_evts += n_evts + + run_id = f[dataset_key][0][run_id_col_name] + run_ids.append(run_id) + + f.close() + + mean_number_of_evts_per_file = total_number_of_evts / len(list_of_files) + + return total_number_of_evts, mean_number_of_evts_per_file, run_ids + + +def split(a, n): + """ + Splits a list into n equal sized (if possible! if not, approximately) chunks. + + Parameters + ---------- + a : list + A list that should be split. + n : int + Number of times the input list should be split. + + Returns + ------- + a_split : list + The input list a, which has been split into n chunks. + + """ + # from https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length + k, m = divmod(len(a), n) + a_split = list((a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))) + return a_split + + +def print_input_statistics(cfg, ip_group_keys): + """ + Prints some useful information for each input_group. + + Parameters + ---------- + cfg : dict + Dict that contains all configuration options and additional information. + ip_group_keys : list + List of the input_group keys. + + """ + + print('----------------------------------------------------------------------') + print('Printing input statistics for your ' + cfg['toml_filename'] + ' input:') + print('----------------------------------------------------------------------') + + print('Your input .toml file has the following data input groups: ' + str(ip_group_keys)) + print('Total number of events: ' + str(cfg['n_evts_total'])) + + for key in ip_group_keys: + print('--------------------------------------------------------------------') + print('Info for group ' + key + ':') + print('Directory: ' + cfg[key]['dir']) + print('Total number of files: ' + str(cfg[key]['n_files'])) + print('Total number of events: ' + str(cfg[key]['n_evts'])) + print('Mean number of events per file: ' + str(round(cfg[key]['n_evts_per_file_mean'], 3))) + print('--------------------------------------------------------------------') + + +def add_fpaths_for_data_split_to_cfg(cfg, key): + """ + Adds all the filepaths for the output files into a list, and puts them into the cfg['output_dsplit'][key] location + for all dsplits (train, validate, rest). + + Parameters + ---------- + cfg : dict + Dict that contains all configuration options and additional information. + key : str + The key of an input_group. 
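# The run_id window test applied below is inclusive on both ends; with run_ids_train = [181, 600]
# (as in the example configs), files with run_id 181 and 600 both land in the training split.
run_ids_train = [181, 600]
for run_id in (180, 181, 600, 601):
    in_train = run_ids_train[0] <= run_id <= run_ids_train[1]   # False, True, True, False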
+ + """ + + fpath_lists = {'train': [], 'validate': [], 'rest': []} + for i, fpath in enumerate(cfg[key]['fpaths']): + + run_id = cfg[key]['run_ids'][i] + + for dsplit in ['train', 'validate', 'rest']: + if 'run_ids_' + dsplit in cfg[key]: + if cfg[key]['run_ids_' + dsplit][0] <= run_id <= cfg[key]['run_ids_' + dsplit][1]: + fpath_lists[dsplit].append(fpath) + + for dsplit in ['train', 'validate', 'rest']: + if len(fpath_lists[dsplit]) == 0: + continue + + n_files_dsplit = cfg['n_files_' + dsplit] + fpath_lists[dsplit] = split(fpath_lists[dsplit], n_files_dsplit) + if 'output_' + dsplit not in cfg: + cfg['output_' + dsplit] = dict() + cfg['output_' + dsplit][key] = fpath_lists[dsplit] + + +def make_dsplit_list_files(cfg): + """ + Writes .list files of the datasplits to the disk, with the information in the cfg['output_dsplit'] dict. + + Parameters + ---------- + cfg : dict + Dict that contains all configuration options and additional information. + + """ + # check if //conc_list_files folder exists, if not create it. + if not os.path.exists(cfg['output_file_folder'] + '/conc_list_files'): + os.makedirs(cfg['output_file_folder'] + '/conc_list_files') + + for dsplit in ['train', 'validate', 'rest']: + + if 'output_' + dsplit not in cfg: + continue + + first_key = list(cfg['output_' + dsplit].keys())[0] + n_output_files = len(cfg['output_' + dsplit][first_key]) + + for i in range(n_output_files): + fpath_output = cfg['output_file_folder'] + '/conc_list_files/' + cfg['output_file_name'] + '_' + dsplit + '_' + str(i) + '.list' + + # for later usage + if 'output_lists' not in cfg: + cfg['output_lists'] = list() + cfg['output_lists'].append(fpath_output) + + with open(fpath_output, 'w') as f_out: + for group_key in cfg['output_' + dsplit]: + for fpath in cfg['output_' + dsplit][group_key][i]: + f_out.write(fpath + '\n') + + +def make_concatenate_and_shuffle_list_files(cfg): + """ + Function that writes qsub .sh files which concatenates all files inside the .list files. + + Parameters + ---------- + cfg : dict + Dict that contains all configuration options and additional information. + + """ + # TODO include options for multicore + + dirpath = cfg['output_file_folder'] + + if not os.path.exists(dirpath + '/logs'): # check if /logs folder exists, if not create it. + os.makedirs(dirpath + '/logs') + if not os.path.exists(dirpath + '/job_scripts'): # check if /job_scripts folder exists, if not create it. + os.makedirs(dirpath + '/job_scripts') + if not os.path.exists(dirpath + '/data_split'): # check if /data_split folder exists, if not create it. 
+ os.makedirs(dirpath + '/data_split') + + # make qsub .sh file for concatenating + for listfile_fpath in cfg['output_lists']: + listfile_fname = os.path.basename(listfile_fpath) + listfile_fname_wout_ext = os.path.splitext(listfile_fname)[0] + conc_outputfile_fpath = cfg['output_file_folder'] + '/data_split/' + listfile_fname_wout_ext + '.h5' + + fpath_bash_script = dirpath + '/job_scripts/submit_concatenate_h5_' + listfile_fname_wout_ext + '.sh' + + with open(fpath_bash_script, 'w') as f: + f.write('#!/usr/bin/env bash\n') + f.write('#\n') + f.write('#PBS -o ' + cfg['output_file_folder'] + '/logs/submit_concatenate_h5_' + listfile_fname_wout_ext + '.out' + ' -e ' + cfg['output_file_folder'] + '/logs/submit_concatenate_h5_' + listfile_fname_wout_ext + '.err\n') + f.write('\n') + f.write('CodeFolder="' + cfg['data_tools_folder'] + '"\n') + f.write('cd ${CodeFolder}\n') + f.write('source activate ' + cfg['venv_path'] + '\n') + f.write('\n') + f.write('# Concatenate the files in the list\n') + + f.write( + 'time python concatenate_h5.py' + + ' --chunksize ' + str(cfg['chunksize']) + + ' --complib ' + str(cfg['complib']) + + ' --complevel ' + str(cfg['complevel']) + + ' -l ' + listfile_fpath + ' ' + conc_outputfile_fpath) + + if cfg['submit_jobs'] is True: + os.system('qsub -l nodes=1:ppn=4,walltime=23:59:00 ' + fpath_bash_script) + + # make qsub .sh file for shuffling + delete_flag_shuffle_tool = '--delete' if cfg['shuffle_delete'] is True else '' + for listfile_fpath in cfg['output_lists']: + listfile_fname = os.path.basename(listfile_fpath) + listfile_fname_wout_ext = os.path.splitext(listfile_fname)[0] + + # This is the input for the shuffle tool! + conc_outputfile_fpath = cfg['output_file_folder'] + '/data_split/' + listfile_fname_wout_ext + '.h5' + + fpath_bash_script = dirpath + '/job_scripts/submit_shuffle_h5_' + listfile_fname_wout_ext + '.sh' + + with open(fpath_bash_script, 'w') as f: + f.write('#!/usr/bin/env bash\n') + f.write('#\n') + f.write('#PBS -o ' + cfg['output_file_folder'] + '/logs/submit_shuffle_h5_' + listfile_fname_wout_ext + '.out' + ' -e ' + cfg['output_file_folder'] + '/logs/submit_shuffle_h5_' + listfile_fname_wout_ext + '.err\n') + f.write('\n') + f.write('CodeFolder="' + cfg['data_tools_folder'] + '"\n') + f.write('cd ${CodeFolder}\n') + f.write('source activate ' + cfg['venv_path'] + '\n') + f.write('\n') + f.write('# Shuffle the h5 file \n') + + f.write( + 'time python shuffle_h5.py' + + delete_flag_shuffle_tool + + ' --chunksize ' + str(cfg['chunksize']) + + ' --complib ' + str(cfg['complib']) + + ' --complevel ' + str(cfg['complevel']) + + ' ' + conc_outputfile_fpath) + + +def make_data_split(): + """ + Main function. 
+ """ + + cfg = parse_input() + + ip_group_keys = get_all_ip_group_keys(cfg) + + n_evts_total = 0 + for key in ip_group_keys: + print('Collecting information from input group ' + key) + cfg[key]['fpaths'] = get_h5_filepaths(cfg[key]['dir']) + cfg[key]['n_files'] = len(cfg[key]['fpaths']) + cfg[key]['n_evts'], cfg[key]['n_evts_per_file_mean'], cfg[key]['run_ids'] = get_number_of_evts_and_run_ids(cfg[key]['fpaths'], dataset_key='y') + + n_evts_total += cfg[key]['n_evts'] + + cfg['n_evts_total'] = n_evts_total + print_input_statistics(cfg, ip_group_keys) + + if cfg['print_only'] is True: + from sys import exit + exit() + + for key in ip_group_keys: + add_fpaths_for_data_split_to_cfg(cfg, key) + + make_dsplit_list_files(cfg) + + if cfg['make_qsub_bash_files'] is True: + make_concatenate_and_shuffle_list_files(cfg) + + +if __name__ == '__main__': + make_data_split() diff --git a/orcasong_contrib/data_tools/shuffle/__init__.py b/orcasong_contrib/data_tools/shuffle/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c1c0f7d4f7cf0716feb52632030d585a844671 --- /dev/null +++ b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Contains functions to shuffles .h5 files. + +Can only be used for files where each dataset has the same number of rows (axis_0). +A fixed random seed (42) is used for the shuffling! + +Currently, two types of .h5 files are supported: + +1) Files which can be read by km3pipe (e.g. files produced with OrcaSong). +2) Plain hdf5 files with a hdf5 folder depth of 1. This method is based on some legacy code. + Be careful to not run out of memory! Needs the unshuffled .h5 file's disk space + the python overhead as memory. + If you want to use it, please use the --legacy_mode option. +""" + +import sys +import os +from argparse import ArgumentParser, RawTextHelpFormatter +import numpy as np +import h5py +import km3pipe as kp +import km3modules as km + +# from memory_profiler import profile # for memory profiling, call with @profile; myfunc() + +__author__ = 'Michael Moser' +__license__ = 'AGPL' +__email__ = 'michael.m.moser@fau.de' + + +def parse_input(): + """ + Parses the user input in order to return the most important information: + + 1) list of files that should be shuffled + 2) if the unshuffled file should be deleted + 3) if the user wants to use a custom chunksize, or if the chunksize should be read from the input file. + 4) if the user wants to use a custom complib, or if the complib should be read from the input file. + 5) if the user wants to use a custom complevel, or if the complevel should be read from the input file. + + Returns + ------- + input_files_list : list + List that contains all filepaths of the input files that should be shuffled. + delete : bool + Boolean flag that specifies, if the unshuffled input files should be deleted after the shuffling. + chunksize : None/int + Specifies the chunksize for axis_0 in the shuffled output files. + If None, the chunksize is read from the input files. + Else, a custom chunksize will be used. + complib : None/str + Specifies the compression library that should be used for saving the shuffled output files. + If None, the compression library is read from the input files. 
+ Else, a custom compression library will be used. + Currently available: 'gzip', or 'lzf'. + complevel : None/int + Specifies the compression level that should be used for saving the shuffled output files. + A compression level is only available for gzip compression, not lzf! + If None, the compression level is read from the input files. + Else, a custom compression level will be used. + legacy_mode : bool + Boolean flag that specifies, if the legacy shuffle mode should be used instead of the standard one. + A more detailed description of this mode can be found in the summary at the top of this python file. + + """ + parser = ArgumentParser(description='E.g. < python shuffle_h5.py filepath_1 [filepath_2] [...] > \n' + 'Shuffles .h5 files. Requires that each dataset of the files has the same number of rows (axis_0). \n' + 'Outputs a new, shuffled .h5 file with the suffix < _shuffled >.', + formatter_class=RawTextHelpFormatter) + + parser.add_argument('files', metavar='file', type=str, nargs='+', help='a .h5 file that should be shuffled, can be more than one argument.') + parser.add_argument('-d', '--delete', action='store_true', + help='deletes the original input file after the shuffled .h5 is created.') + parser.add_argument('--chunksize', dest='chunksize', type=int, + help='Specify a chunksize value in order to use chunked storage for the shuffled .h5 file. \n' + ' Otherwise, it will be read from the input file..') + parser.add_argument('--complib', dest='complib', type=str, + help='Specify a filter that should be used for compression. Either "gzip" or "lzf". \n' + 'Otherwise, the filter will be read from the input file.') + parser.add_argument('--complevel', dest='complevel', type=int, + help='Specify a compression filter strength that should be used for the compression. \n' + 'Otherwise, the filter will be read from the input file. \n' + 'Can range from 0 to 9. Has no effect on "lzf" compression.') + parser.add_argument('--legacy_mode', dest='legacy_mode', action='store_true', + help='If you want to use the legacy mode, as described in the summary at the top of this python file.') + + parser.set_defaults(legacy_mode=False) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + + input_files_list = [] + for filepath in args.files: + input_files_list.append(filepath) + + delete = False + if args.delete: + delete = True + print('You chose delete = True') + + chunksize = None + if args.chunksize: + chunksize = args.chunksize + print('You chose chunksize = ' + str(chunksize)) + + complib = None + if args.complib: + complib = args.complib + print('You chose complib = ' + complib) + + complevel = None + if args.complevel: + complevel = args.complevel + print('You chose complevel = ' + str(complevel)) + + legacy_mode = args.legacy_mode + + return input_files_list, delete, chunksize, complib, complevel, legacy_mode + + +def get_f_compression_and_chunking(filepath): + """ + Function that gets the used compression library, the compression level (if applicable) + and the chunksize of axis_0 of the first dataset of the file. + + Parameters + ---------- + filepath : str + Filepath of a .hdf5 file. + + Returns + ------- + compression : str + The compression library that has been identified in the input file. E.g. 'gzip', or 'lzf'. + complevel : int + The compression level that has been identified in the input file. + chunksize : None/int + The chunksize of axis_0 that has been indentified in the input file. 
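# Example of what this helper yields for a gzip-compressed file with compression level 1 and
# 32-row chunks; the values and the path are hypothetical.
complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking('images.h5')
# -> ('gzip', 1, 32), which then serve as the defaults for complib, complevel and chunksize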
+ + """ + f = h5py.File(filepath, 'r') + + # remove any keys to pytables folders that may be in the file + f_keys_stripped = [x for x in list(f.keys()) if '_i_' not in x] + + compression = f[f_keys_stripped[0]].compression # compression filter + compression_opts = f[f_keys_stripped[0]].compression_opts # filter strength + chunksize = f[f_keys_stripped[0]].chunks[0] # chunksize along axis_0 of the dataset + + return compression, compression_opts, chunksize + + +def shuffle_h5(filepath_input, tool=False, seed=42, delete=True, chunksize=None, complib=None, complevel=None, legacy_mode=False): + """ + Shuffles a .h5 file where each dataset needs to have the same number of rows (axis_0). + The shuffled data is saved to a new .h5 file with the suffix < _shuffled.h5 >. + + Parameters + ---------- + filepath_input : str + Filepath of the unshuffled input file. + tool : bool + Specifies if the function is accessed from the shuffle_h5_tool. + In this case, the shuffled .h5 file is returned. + seed : int + Sets a fixed random seed for the shuffling. + delete : bool + Specifies if the old, unshuffled file should be deleted after extracting the data. + chunksize : None/int + Specifies the chunksize for axis_0 in the shuffled output files. + If None, the chunksize is read from the input files. + Else, a custom chunksize will be used. + complib : None/str + Specifies the compression library that should be used for saving the shuffled output files. + If None, the compression library is read from the input files. + Else, a custom compression library will be used. + Currently available: 'gzip', or 'lzf'. + complevel : None/int + Specifies the compression level that should be used for saving the shuffled output files. + A compression level is only available for gzip compression, not lzf! + If None, the compression level is read from the input files. + Else, a custom compression level will be used. + legacy_mode : bool + Boolean flag that specifies, if the legacy shuffle mode should be used instead of the standard one. + A more detailed description of this mode can be found in the summary at the top of this python file. + + Returns + ------- + output_file_shuffled : h5py.File + H5py file instance of the shuffled output file. 
+ + """ + complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(filepath_input) + + chunksize = chunksize_f if chunksize is None else chunksize + complib = complib_f if complib is None else complib + complevel = complevel_f if complevel is None else complevel + + if complib == 'lzf': + complevel = None + + filepath_input_without_ext = os.path.splitext(filepath_input)[0] + filepath_output = filepath_input_without_ext + '_shuffled.h5' + + if not legacy_mode: + # set random km3pipe (=numpy) seed + print('Setting a Global Random State with the seed < 42 >.') + km.GlobalRandomState(seed=seed) + + # km3pipe uses pytables for saving the shuffled output file, which has the name 'zlib' for the 'gzip' filter + if complib == 'gzip': + complib = 'zlib' + + pipe = kp.Pipeline(timeit=True) # add timeit=True argument for profiling + pipe.attach(km.common.StatusBar, every=200) + pipe.attach(km.common.MemoryObserver, every=200) + pipe.attach(kp.io.hdf5.HDF5Pump, filename=filepath_input, shuffle=True, reset_index=True) + pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000) + pipe.drain() + if delete: + os.remove(filepath_input) + + output_file_filepath = filepath_output if delete is False else filepath_input + output_file_shuffled = h5py.File(output_file_filepath, 'r+') + + # delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them + for folder_name in output_file_shuffled: + if folder_name.startswith('_i_'): + del output_file_shuffled[folder_name] + + else: + input_file = h5py.File(filepath_input, 'r') + folder_data_array_dict = {} + + for folder_name in input_file: + folder_data_array = input_file[folder_name][()] # get whole numpy array into memory + folder_data_array_dict[folder_name] = folder_data_array # workaround in order to be able to close the input file at the next step + + input_file.close() + + if delete: + os.remove(filepath_input) + + output_file_shuffled = h5py.File(filepath_output, 'w') + for n, dataset_key in enumerate(folder_data_array_dict): + + dataset = folder_data_array_dict[dataset_key] + + if n == 0: + # get a particular seed for the first dataset such that the shuffling is consistent across the datasets + r = np.random.RandomState(seed) + state = r.get_state() + r.shuffle(dataset) + + else: + r.set_state(state) # recover shuffle seed of the first dataset + r.shuffle(dataset) + + chunks = (chunksize,) + dataset.shape[1:] + output_file_shuffled.create_dataset(dataset_key, data=dataset, dtype=dataset.dtype, chunks=chunks, + compression=complib, compression_opts=complevel) + + # close file in the case of tool=True + if tool is False: + output_file_shuffled.close() + else: + return output_file_shuffled + + +def shuffle_h5_tool(): + """ + Frontend for the shuffle_h5 function that can be used in a bash environment. + + Shuffles .h5 files where each dataset needs to have the same number of rows (axis_0) for a single file. + Saves the shuffled data to a new .h5 file. + """ + input_files_list, delete, chunksize, complib, complevel, legacy_mode = parse_input() + + for filepath_input in input_files_list: + print('Shuffling file ' + filepath_input) + output_file_shuffled = shuffle_h5(filepath_input, tool=True, seed=42, delete=delete, chunksize=chunksize, + complib=complib, complevel=complevel, legacy_mode=legacy_mode) + print('Finished shuffling. 
Output information:') + print('---------------------------------------') + print('The output file contains the following datasets:') + for dataset_name in output_file_shuffled: + print('Dataset ' + dataset_name + ' with the following shape, dtype and chunks ' + '(first argument is the chunksize in axis_0): \n' + str(output_file_shuffled[dataset_name].shape) + + ' ; ' + str(output_file_shuffled[dataset_name].dtype) + ' ; ' + + str(output_file_shuffled[dataset_name].chunks)) + + output_file_shuffled.close() + + +if __name__ == '__main__': + shuffle_h5_tool() diff --git a/user/detx_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx b/orcasong_contrib/detx_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx similarity index 100% rename from user/detx_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx rename to orcasong_contrib/detx_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx diff --git a/orcasong_contrib/utilities/__init__.py b/orcasong_contrib/utilities/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utilities/count_number_of_events_in_folder.py b/orcasong_contrib/utilities/count_number_of_events_in_folder.py similarity index 100% rename from utilities/count_number_of_events_in_folder.py rename to orcasong_contrib/utilities/count_number_of_events_in_folder.py diff --git a/utilities/dom_binning.py b/orcasong_contrib/utilities/dom_binning.py similarity index 100% rename from utilities/dom_binning.py rename to orcasong_contrib/utilities/dom_binning.py diff --git a/orcasong_contrib/utilities/get_func_for_flat_track_shower.py b/orcasong_contrib/utilities/get_func_for_flat_track_shower.py new file mode 100644 index 0000000000000000000000000000000000000000..772e193b0c968878848ecf10f3b98863363eded9 --- /dev/null +++ b/orcasong_contrib/utilities/get_func_for_flat_track_shower.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +TODO +""" + +import os +import numpy as np +import matplotlib as mpl +mpl.use('Agg') +from matplotlib.backends.backend_pdf import PdfPages +from matplotlib import pyplot as plt +import h5py +import natsort as ns + + +def get_h5_filepaths(dirpath): + """ + Returns the filepaths of all .h5 files that are located in a specific directory. + + Parameters + ---------- + dirpath: str + Path of the directory where the .h5 files are located. + + Returns + ------- + filepaths : list + List with the full filepaths of all .h5 files in the dirpath folder. + + """ + filepaths = [] + for f in os.listdir(dirpath): + if f.endswith('.h5'): + filepaths.append(dirpath + '/' + f) + + filepaths = ns.natsorted(filepaths) # TODO should not be necessary actually! 
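# Why natsorted rather than sorted(): lexicographic order puts 'file_10.h5' before 'file_2.h5',
# natural sort keeps the numeric order. Small illustration with made-up names:
names = ['file_10.h5', 'file_2.h5', 'file_1.h5']
sorted(names)         # ['file_1.h5', 'file_10.h5', 'file_2.h5']
ns.natsorted(names)   # ['file_1.h5', 'file_2.h5', 'file_10.h5']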
+
+    return filepaths
+
+
+def get_energies_for_fpaths(fpath_list, fpath_list_key_ic, cut_e_higher_than_3=False):
+    """
+    Reads the MC energies of all neutrino events in the given files and concatenates them into one array.
+
+    Parameters
+    ----------
+    fpath_list : list
+        List with the filepaths of the .h5 files that should be read.
+    fpath_list_key_ic : str
+        Key of the interaction channel (e.g. 'muon_cc_3_100'), only used for the print statements.
+    cut_e_higher_than_3 : bool
+        If True, only events with an energy of at most 3 GeV are kept (used for the low-energy productions).
+
+    Returns
+    -------
+    energy_conc_arr : ndarray
+        Array with the MC energies of all neutrino events in the input files.
+
+    """
+
+    energy_conc_arr = None
+    for i, fpath in enumerate(fpath_list):
+        if i % 100 == 0: print('Step ' + str(i))
+
+        f = h5py.File(fpath, 'r')
+
+        tracks = f['mc_tracks']
+        tracks_neutr = tracks[tracks['bjorkeny'] != 0]
+
+        assert f['event_info'].shape == tracks_neutr.shape
+        energies = tracks_neutr['energy']
+
+        if cut_e_higher_than_3 is True:
+            energies = energies[energies <= 3]
+
+        if energy_conc_arr is None:
+            energy_conc_arr = energies
+        else:
+            energy_conc_arr = np.concatenate([energy_conc_arr, energies], axis=0)
+
+        f.close()
+
+    print('Total number of events for ' + fpath_list_key_ic + ' (without 3-5GeV from low_e prod): '
+          + str(energy_conc_arr.shape[0]))
+    print('Total number of files: ' + str(len(fpath_list)))
+
+    return energy_conc_arr
+
+
+def save_energies_for_ic(energies_for_ic):
+
+    np.savez('./energies_for_ic.npz',
+             muon_cc_3_100=energies_for_ic['muon_cc_3_100'], muon_cc_1_5=energies_for_ic['muon_cc_1_5'],
+             elec_cc_3_100=energies_for_ic['elec_cc_3_100'], elec_cc_1_5=energies_for_ic['elec_cc_1_5'],
+             elec_nc_3_100=energies_for_ic['elec_nc_3_100'], elec_nc_1_5=energies_for_ic['elec_nc_1_5'])
+
+
+def load_energies_for_ic():
+
+    data = np.load('./energies_for_ic.npz')
+
+    energies_for_ic = dict()
+    energies_for_ic['muon_cc_3_100'] = data['muon_cc_3_100']
+    energies_for_ic['muon_cc_1_5'] = data['muon_cc_1_5']
+    energies_for_ic['elec_cc_3_100'] = data['elec_cc_3_100']
+    energies_for_ic['elec_cc_1_5'] = data['elec_cc_1_5']
+    energies_for_ic['elec_nc_3_100'] = data['elec_nc_3_100']
+    energies_for_ic['elec_nc_1_5'] = data['elec_nc_1_5']
+
+    return energies_for_ic
+
+
+def add_low_and_high_e_prods(energies_for_ic):
+    """
+    Concatenates the low- and high-energy productions of each interaction channel and adds them to the dict in place.
+
+    Parameters
+    ----------
+    energies_for_ic : dict
+        Dict that contains the energy arrays of the single productions.
+
+    """
+
+    energies_for_ic['muon_cc'] = np.concatenate([energies_for_ic['muon_cc_3_100'], energies_for_ic['muon_cc_1_5']])
+    energies_for_ic['elec_cc'] = np.concatenate([energies_for_ic['elec_cc_3_100'], energies_for_ic['elec_cc_1_5']])
+    energies_for_ic['elec_nc'] = np.concatenate([energies_for_ic['elec_nc_3_100'], energies_for_ic['elec_nc_1_5']])
+    energies_for_ic['elec_cc_and_nc'] = np.concatenate([energies_for_ic['elec_cc'], energies_for_ic['elec_nc']])
+
+
+def plot_e_and_make_flat_func(energies_for_ic):
+    """
+    Plots the energy histograms of the track and shower classes and the track-to-shower ratio per energy bin to a pdf.
+
+    Parameters
+    ----------
+    energies_for_ic : dict
+        Dict that contains the concatenated energy arrays ('muon_cc', 'elec_cc_and_nc', ...).
+
+    """
+    def make_plot_options_and_save(ax, pdfpages, ylabel):
+        plt.xlabel('Energy [GeV]')
+        plt.ylabel(ylabel)
+        x_ticks_major = np.arange(0, 101, 10)
+        ax.set_xticks(x_ticks_major)
+        ax.grid(True)
+        plt.tight_layout()
+        pdfpages.savefig(fig)
+        plt.cla()
+
+
+    pdfpages = PdfPages('./e_hist_plots.pdf')
+    fig, ax = plt.subplots()
+
+    # plot
+    hist_muon_cc = plt.hist(energies_for_ic['muon_cc'], bins=99)
+    plt.title('Muon-CC 1-3 + 3-100 GeV for Run 1-2400')
+    make_plot_options_and_save(ax, pdfpages, ylabel='Counts [#]')
+
+    hist_shower = plt.hist(energies_for_ic['elec_cc_and_nc'], bins=99)
+    plt.title('Shower (elec-CC + elec-NC) 1-3 + 3-100 GeV for 2x Run 1-1200')
+    make_plot_options_and_save(ax, pdfpages, ylabel='Counts [#]')
+
+    hist_elec_cc = plt.hist(energies_for_ic['elec_cc'], bins=99)
+    plt.title('Elec-CC 1-3 + 3-100 GeV for Run 1-1200')
+    make_plot_options_and_save(ax, pdfpages, ylabel='Counts [#]')
+
+    hist_elec_nc = plt.hist(energies_for_ic['elec_nc'], bins=99)
+    plt.title('Elec-NC 1-3 + 3-100 GeV for Run 1-1200')
+    make_plot_options_and_save(ax, pdfpages, ylabel='Counts [#]')
+
+    # We take 600 muon-CC files and 300 elec-CC and 300 elec-NC files for the split, reduce 1-3GeV bins by 1/2
+    hist_shower[0][0] = hist_shower[0][0] / 2  # 1-2GeV
+    hist_shower[0][1] = hist_shower[0][1] / 2  # 2-3GeV
+
+    track_div_shower = np.divide(hist_muon_cc[0], hist_shower[0])
+    print(hist_muon_cc[0])
+    print(hist_shower[0])
+
+    bins = hist_muon_cc[1]  # doesn't matter which histogram the bin edges are taken from, they are identical
+    track_div_shower = np.append(track_div_shower, track_div_shower[-1])
+    # track_div_shower = np.concatenate([track_div_shower, np.array(track_div_shower[-1])[:, np.newaxis]], axis=0)  # fix for mpl
+    print(bins)
+    print(track_div_shower)
+    ax.step(bins, track_div_shower, linestyle='-', where='post')
+    plt.title('Ratio tracks divided by showers')
+    make_plot_options_and_save(ax, pdfpages, ylabel='Fraction')
+
+    pdfpages.close()
+
+
+def main():
+    dirs = {
+        'muon_cc_3_100': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/3-100GeV/muon-CC',
+        'muon_cc_1_5': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/1-5GeV/muon-CC',
+        'elec_cc_3_100': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/3-100GeV/elec-CC',
+        'elec_cc_1_5': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/1-5GeV/elec-CC',
+        'elec_nc_3_100': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/3-100GeV/elec-NC',
+        'elec_nc_1_5': '/home/saturn/capn/mppi033h/Data/raw_data/ORCA_JTE_NEMOWATER/calibrated/with_jte_times/1-5GeV/elec-NC'
+    }
+
+    if os.path.isfile('./energies_for_ic.npz') is True:
+        energies_for_ic = load_energies_for_ic()
+
+    else:
+        fpaths = dict()
+        for dir_ic_key in dirs:
+            fpaths[dir_ic_key] = get_h5_filepaths(dirs[dir_ic_key])
+
+        energies_for_ic = dict()
+        for fpath_list_key_ic in fpaths:
+            print('Getting energies for ' + fpath_list_key_ic)
+            cut_flag = True if fpath_list_key_ic in ['muon_cc_1_5', 'elec_cc_1_5', 'elec_nc_1_5'] else False
+            fpath_list = fpaths[fpath_list_key_ic]
+            energies_for_ic[fpath_list_key_ic] = get_energies_for_fpaths(fpath_list, fpath_list_key_ic, cut_e_higher_than_3=cut_flag)
+
+        save_energies_for_ic(energies_for_ic)
+
+    add_low_and_high_e_prods(energies_for_ic)
+    plot_e_and_make_flat_func(energies_for_ic)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
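plot_e_and_make_flat_func histograms the track and shower energies with identical binning and divides the counts to obtain the track-to-shower ratio that is drawn as a step function. A self-contained sketch of that calculation on toy data (illustration only; the explicit 1-100 GeV bin edges are an assumption standing in for the script's bins=99):

    import numpy as np

    rng = np.random.RandomState(1)
    e_track = rng.uniform(1, 100, size=20000)    # toy stand-in for the muon-CC energies
    e_shower = rng.uniform(1, 100, size=15000)   # toy stand-in for the elec-CC/NC energies

    bin_edges = np.linspace(1, 100, 100)                       # 99 bins between 1 and 100 GeV
    counts_track, edges = np.histogram(e_track, bins=bin_edges)
    counts_shower, _ = np.histogram(e_shower, bins=bin_edges)

    ratio = counts_track / counts_shower                       # track-to-shower ratio per energy bin
    ratio = np.append(ratio, ratio[-1])                        # repeat the last value so len(ratio) == len(edges)
    print(edges[:3], ratio[:3])

Repeating the last ratio value makes the array as long as the bin edges, which is what ax.step(bins, ratio, where='post') in the script expects.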
diff --git a/orcasong_contrib/utilities/timecut_test/__init__.py b/orcasong_contrib/utilities/timecut_test/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/utilities/timecut_test/plots/elec-CC/.gitkeep b/orcasong_contrib/utilities/timecut_test/plots/elec-CC/.gitkeep
similarity index 100%
rename from utilities/timecut_test/plots/elec-CC/.gitkeep
rename to orcasong_contrib/utilities/timecut_test/plots/elec-CC/.gitkeep
diff --git a/utilities/timecut_test/plots/muon-CC/.gitkeep b/orcasong_contrib/utilities/timecut_test/plots/muon-CC/.gitkeep
similarity index 100%
rename from utilities/timecut_test/plots/muon-CC/.gitkeep
rename to orcasong_contrib/utilities/timecut_test/plots/muon-CC/.gitkeep
diff --git a/utilities/timecut_test/plots/mupage/.gitkeep b/orcasong_contrib/utilities/timecut_test/plots/mupage/.gitkeep
similarity index 100%
rename from utilities/timecut_test/plots/mupage/.gitkeep
rename to orcasong_contrib/utilities/timecut_test/plots/mupage/.gitkeep
diff --git a/utilities/timecut_test/plots/random_noise/.gitkeep b/orcasong_contrib/utilities/timecut_test/plots/random_noise/.gitkeep
similarity index 100%
rename from utilities/timecut_test/plots/random_noise/.gitkeep
rename to orcasong_contrib/utilities/timecut_test/plots/random_noise/.gitkeep
diff --git a/utilities/timecut_test/plots/tau-CC/.gitkeep b/orcasong_contrib/utilities/timecut_test/plots/tau-CC/.gitkeep
similarity index 100%
rename from utilities/timecut_test/plots/tau-CC/.gitkeep
rename to orcasong_contrib/utilities/timecut_test/plots/tau-CC/.gitkeep
diff --git a/utilities/timecut_test/timecut_test.py b/orcasong_contrib/utilities/timecut_test/timecut_test.py
similarity index 100%
rename from utilities/timecut_test/timecut_test.py
rename to orcasong_contrib/utilities/timecut_test/timecut_test.py
diff --git a/user/job_submission_scripts/submit_data_to_images.sh b/user/job_submission_scripts/submit_data_to_images.sh
index 5cc9fe075e7d2f54bce6e49e256f36a7070027a0..9b3754224637747734ccc86f4bf341bed6ee7930 100644
--- a/user/job_submission_scripts/submit_data_to_images.sh
+++ b/user/job_submission_scripts/submit_data_to_images.sh
@@ -24,10 +24,10 @@ python_env_folder=/home/hpc/capn/mppi033h/.virtualenv/python_3_env/
 job_logs_folder=/home/woody/capn/mppi033h/logs/orcasong/cout
 detx_filepath=/home/woody/capn/mppi033h/Code/OrcaSong/user/detx_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx
-config_file=/home/woody/capn/mppi033h/Code/OrcaSong/user/config/orca_115l_mupage_rn_neutr_classifier/conf_ORCA_115l_mupage_xyz-c.toml
+config_file=/home/woody/capn/mppi033h/Code/OrcaSong/user/config/orca_115l_mupage_rn_neutr_classifier/conf_ORCA_115l_random_noise_xyz-t.toml
-particle_type=mupage
-mc_prod=mupage
+particle_type=random_noise
+mc_prod=random_noise
 # total number of files per job
 # For neutrinos 3-100GeV:
@@ -36,7 +36,7 @@ mc_prod=mupage
 # muon-CC/elec-CC/elec-NC n=120 with PBS -l nodes=1:ppn=4:sl,walltime=5:00:00
 # For mupage: n=250 with PBS -l nodes=1:ppn=4:sl,walltime=5:00:00
 # For random_noise: n=100 with PBS -l nodes=1:ppn=4:sl,walltime=5:00:00
-files_per_job=200 # must be dividible by 4!
+files_per_job=100 # must be dividible by 4!
 #--- USER INPUT ---#
@@ -75,7 +75,7 @@ folder="${folder_ip_files_arr[${mc_prod}]}"
 # run
 no_of_loops=$((${files_per_job}/4)) # divide by 4 cores -> e.g, 15 4-core loops needed for files_per_job=60
-file_no_start=$((1+((${n}-1) * ${files_per_job}))) # filenumber of the first file that is being processed by this script (depends on JobArray variable 'n')
+file_no_start=$((500+1+((${n}-1) * ${files_per_job}))) # filenumber of the first file that is being processed by this script (depends on JobArray variable 'n')
 # currently only working for 4 cores
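Besides switching the configuration from mupage to random_noise and halving files_per_job, the job script now adds an offset of 500 to the first file number, so job n starts at file 500 + 1 + (n-1)*files_per_job. A small Python sketch of the resulting mapping from job-array index to file numbers (hypothetical helper that merely mirrors the bash arithmetic, not part of the script):

    def file_range_for_job(n, files_per_job=100, offset=500):
        # First and last file number processed by job-array index n.
        file_no_start = offset + 1 + (n - 1) * files_per_job
        return file_no_start, file_no_start + files_per_job - 1

    print(file_range_for_job(1))   # (501, 600)
    print(file_range_for_job(2))   # (601, 700)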
diff --git a/utilities/evaluate_generator_IO_speed.py b/utilities/evaluate_generator_IO_speed.py
deleted file mode 100644
index 5c9859bec981598e41e484264237ed7d5d54e5d3..0000000000000000000000000000000000000000
--- a/utilities/evaluate_generator_IO_speed.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""Code for testing the readout speed of orcasong .hdf5 files."""
-
-import numpy as np
-import h5py
-import timeit
-import cProfile
-
-def generate_batches_from_hdf5_file():
-    # 4d
-    #filepath = 'JTE_KM3Sim_gseagen_muon-CC_3-100GeV-9_1E7-1bin-3_0gspec_ORCA115_9m_2016_9_xyzt_no_compression_chunked.h5' # 4D, (11x13x18x50)), no compression. chunksize=32 --> 1011 ms
-    #filepath = 'JTE_KM3Sim_gseagen_muon-CC_3-100GeV-9_1E7-1bin-3_0gspec_ORCA115_9m_2016_9_xyzt_lzf.h5' # 4D, (11x13x18x50), lzf --> 2194 ms
-    #filepath = 'JTE_KM3Sim_gseagen_muon-CC_3-100GeV-9_1E7-1bin-3_0gspec_ORCA115_9m_2016_9_xyzt_gzip_1.h5' # 4D, (11x13x18x50), gzip, compression_opts=1 --> 1655 ms
-
-    # With new km3pipe structure
-    filepath = '/home/woody/capn/mppi033h/orcasong_output/4dTo4d/xyzc/JTE_ph_ph_mupage_ph_ph_ph_ORCA115_9m_2016_9_xyzc.h5'
-
-    print('Testing generator on file ' + filepath)
-    batchsize = 32
-    dimensions = (batchsize, 11, 13, 18, 31)  # 4D
-
-    f = h5py.File(filepath, "r")
-    filesize = len(f['y'])
-    print(filesize)
-
-    n_entries = 0
-    while n_entries < (filesize - batchsize):
-        xs = f['x'][n_entries : n_entries + batchsize]
-        xs = np.reshape(xs, dimensions).astype(np.float32)
-
-        y_values = f['y'][n_entries:n_entries+batchsize]
-        ys = y_values[['run_id', 'event_id']]
-
-        n_entries += batchsize
-        yield (xs, ys)
-    f.close()
-
-
-number = 20
-#t = timeit.timeit(generate_batches_from_hdf5_file, number = number)
-#t = timeit.Timer(stmt="list(generate_batches_from_hdf5_file())", setup="from __main__ import generate_batches_from_hdf5_file")
-#print t.timeit(number) / number
-#print str(number) + 'loops, on average ' + str(t.timeit(number) / number *1000) + 'ms'
-
-pr = cProfile.Profile()
-pr.enable()
-
-t = timeit.Timer(stmt="list(generate_batches_from_hdf5_file())", setup="from __main__ import generate_batches_from_hdf5_file")
-print(str(number) + 'loops, on average ' + str(t.timeit(number) / number *1000) + 'ms')
-
-pr.disable()
-
-pr.print_stats(sort='time')
\ No newline at end of file
diff --git a/utilities/low_e_prod_get_surviving_events.py b/utilities/low_e_prod_get_surviving_events.py
deleted file mode 100644
index e31d08fc0bfb0416090c4fe21c17ff3c7d69116a..0000000000000000000000000000000000000000
--- a/utilities/low_e_prod_get_surviving_events.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import h5py
-import numpy as np
-
-path = '/home/woody/capn/mppi033h/Data/ORCA_JTE_NEMOWATER/ip_images_1-100GeV/4dTo4d/time_-250+500_w_gf_60b'
-# JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_98_xyzt.h5
-ptypes = {'muon-CC': 'JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_',
-          'elec-CC': 'JTE_KM3Sim_gseagen_elec-CC_1-5GeV-2_7E5-1bin-1_0gspec_ORCA115_9m_2016_'}
-
-event_id, run_id = None, None
-for ptype in ptypes.keys():
-    for i in range(601):
-        if i % 100 == 0:
-            print(i)
-        if i == 0: continue
-
-        f = h5py.File(path + '/' + ptypes[ptype] + str(i) + '_xyzt.h5', 'r')
-        event_id_f = f['y'][:, 0]
-        run_id_f = f['y'][:, 9]
-
-        if event_id is None:
-            event_id = event_id_f
-            run_id = run_id_f
-        else:
-            event_id = np.concatenate([event_id, event_id_f], axis=0)
-            run_id = np.concatenate([run_id, run_id_f], axis=0)
-
-        f.close()
-
-    ax = np.newaxis
-    arr = np.concatenate([run_id[:, ax], event_id[:, ax]], axis=1)
-    np.save('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_' + ptype + '.npy', arr)
-    event_id, run_id = None, None
-
-
-