diff --git a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py index 05f302b66bc2bebb445e7ba449f8a91a78f3bfd5..096bc83ee640df576c923a2a1d51f4cd9b625aa5 100644 --- a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py +++ b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py @@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath): def concatenate_h5_files(output_filepath, file_list, - chunksize=None, complib=None, complevel=None, - event_skipper=None): + chunksize=None, complib=None, complevel=None): """ Function that concatenates hdf5 files based on an output_filepath and a file_list of input files. @@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list, A compression level is only available for gzip compression, not lzf! If None, the compression level is read from the first input file. Else, a custom compression level will be used. - event_skipper : function, optional - Function that gets the "y" dataset, and returns an array with bools - showing which events to skip (ie not include in the output). """ cum_rows_list = get_cum_number_of_rows(file_list) @@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list, if 'format_version' in list(input_file.attrs.keys()) and n == 0: file_output.attrs['format_version'] = input_file.attrs['format_version'] - if event_skipper is not None: - y_dataset = input_file["y"] - skips = event_skipper(y_dataset) - for folder_name in input_file: - if is_folder_ignored(folder_name): # we ignore datasets that have been created by pytables, don't need them anymore continue @@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list, print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype)) - if event_skipper is not None: - folder_data = folder_data[skips] - print('Event Skipper: Shape and dtype of dataset ' + - folder_name + ': ' + str(folder_data.shape) + - ' ; ' + str(folder_data.dtype)) - if n == 0: # first file; create the dummy dataset with no max shape maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None @@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list, output_dataset.resize(cum_rows_list[-1], axis=0) else: - file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data + file_output[folder_name][cum_rows_list[n]:cum_rows_list[n+1]] = folder_data file_output.flush() diff --git a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py index 5c667105b7c067abad6c84e7a2f55b839a9edbab..56cdbc35ae50f2541fa26ecf79d7d66e6cc2b2d6 100644 --- a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py +++ b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py @@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000) pipe.drain() + + # copy the used_files dataset to the new file + copy_used_files(filepath_input, filepath_output) + if delete: os.remove(filepath_input) - output_file_filepath = filepath_output if delete is False else filepath_input - output_file_shuffled = h5py.File(output_file_filepath, 'r+') - # delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them + output_file_shuffled = h5py.File(filepath_output, 'r+') for folder_name in output_file_shuffled: if folder_name.startswith('_i_'): del output_file_shuffled[folder_name] @@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None return output_file_shuffled +def copy_used_files(source_file, target_file): + """ + Copy the "used_files" dataset from one h5 file to another, if it is present. + + """ + with h5py.File(source_file, "r") as src: + if "used_files" in src: + print("Copying used_files dataset to new file...") + used_files = src["used_files"] + + with h5py.File(target_file, "a") as trg: + trg.create_dataset("used_files", data=used_files) + + def main(): """ Frontend for the shuffle_h5 function that can be used in a bash environment. diff --git a/orcasong_plag/Readme.rst b/orcasong_plag/Readme.rst index 72527dd8df892777809dd5f26c462af2386a442d..7950420e3abcfbe8fea0b3dd17acfe49146729cc 100644 --- a/orcasong_plag/Readme.rst +++ b/orcasong_plag/Readme.rst @@ -3,4 +3,4 @@ OrcaSong Plag Several changes to the original OrcaSong. Allows to set desired binning via a list. -Does not contain all features of OrcaSong, like skipping events, plotting, etc. \ No newline at end of file +Does not contain all features of OrcaSong, like getting mchits, plotting, etc. \ No newline at end of file diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py index e8883c12d84df609f928453d666aee5a406999a4..20750bc3c4f264a2623986879dc191d7b3687807 100644 --- a/orcasong_plag/core.py +++ b/orcasong_plag/core.py @@ -1,6 +1,6 @@ +import os import km3pipe as kp import km3modules as km -import os from orcasong_plag.modules import (TimePreproc, ImageMaker, @@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists, add_hists_to_h5file, plot_hist_of_files) +__author__ = 'Stefan Reck' + class FileBinner: """ diff --git a/orcasong_plag/mc_info_types.py b/orcasong_plag/mc_info_types.py index 8398cd171481b559b44cf2bf2c41b9596dcb96a4..cf3321766dc17aa1473f6701f3159df0cdc12271 100644 --- a/orcasong_plag/mc_info_types.py +++ b/orcasong_plag/mc_info_types.py @@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield in the h5 files. """ -import numpy as np import warnings +import numpy as np + +__author__ = 'Stefan Reck' def get_mc_info_extr(mc_info_extr): diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py index 30e8860dc5488bb1dacd7ee37bc81636e38fe4d9..797f42add65674096ed6cf2c5b5cae97864e2201 100644 --- a/orcasong_plag/modules.py +++ b/orcasong_plag/modules.py @@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files. import km3pipe as kp import numpy as np +__author__ = 'Stefan Reck' + class McInfoMaker(kp.Module): """ diff --git a/orcasong_plag/util/bin_stats_plot.py b/orcasong_plag/util/bin_stats_plot.py index eb0c6f5c0650d7a7a8b74c169c874d755efd1dd9..2a02923f9808fe3653395831cd986f9168676799 100644 --- a/orcasong_plag/util/bin_stats_plot.py +++ b/orcasong_plag/util/bin_stats_plot.py @@ -8,6 +8,8 @@ import h5py import numpy as np import argparse +__author__ = 'Stefan Reck' + def plot_hists(hists, save_to, plot_bin_edges=True): """ diff --git a/orcasong_plag/util/binning_1d_visualizer.py b/orcasong_plag/util/binning_1d_visualizer.py index ae97a6b9444071aa7ba234dff74d4412fa841c20..916748c253f9fbdc70c06a721496730bafffefaa 100644 --- a/orcasong_plag/util/binning_1d_visualizer.py +++ b/orcasong_plag/util/binning_1d_visualizer.py @@ -16,6 +16,8 @@ import matplotlib.pyplot as plt from orcasong_plag.modules import time_preproc +__author__ = 'Stefan Reck' + class FieldPlotter: """ diff --git a/orcasong_plag/util/split_conc.py b/orcasong_plag/util/split_conc.py index 209bd6a0e24dc9a8f9ad4a66a8b59111f1e68fa4..7b311b797edc35943f4e29ac6cf8fa948542bfd7 100644 --- a/orcasong_plag/util/split_conc.py +++ b/orcasong_plag/util/split_conc.py @@ -26,6 +26,8 @@ Example: import os import numpy as np +__author__ = 'Stefan Reck' + def get_files(folder): """ diff --git a/setup.py b/setup.py index ff1742ac4e7a14cd05a0a7c3b6beb46bf9ba2cd4..0e67ae53e826aeba680669f2081ed24a9e358f92 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,8 @@ setup( name='orcasong', description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT', url='https://git.km3net.de/ml/OrcaSong', - author='Michael Moser', - author_email='mmoser@km3net.de, michael.m.moser@fau.de', + author='Michael Moser, Stefan Reck', + author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de', license='AGPL', install_requires=requirements, packages=find_packages(),