Minor adjustments.

- Removed event skipper from concatenate_h5_files (use shuffle instead)
- Fix for shuffle_h5; it now also copies the used_files dataset, if present, to the shuffled file
- Added authorship to orcasong_plag

Minor adjustments.
ca08e887 · Stefan Reck · d36a1831 · ca08e887 · ca08e887 · ca08e887
Commit ca08e887 authored 5 years ago by Stefan Reck
--- a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
+++ b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
@@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath):
 def concatenate_h5_files(output_filepath, file_list,
-                         chunksize=None, complib=None, complevel=None,
+                         chunksize=None, complib=None, complevel=None):
-                         event_skipper=None):
    """
    Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
@@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list,
        A compression level is only available for gzip compression, not lzf!
        If None, the compression level is read from the first input file.
        Else, a custom compression level will be used.
-    event_skipper : function, optional
-        Function that gets the "y" dataset, and returns an array with bools
-        showing which events to skip (ie not include in the output).
    """
    cum_rows_list = get_cum_number_of_rows(file_list)
@@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list,
        if 'format_version' in list(input_file.attrs.keys()) and n == 0:
            file_output.attrs['format_version'] = input_file.attrs['format_version']
-        if event_skipper is not None:
-            y_dataset = input_file["y"]
-            skips = event_skipper(y_dataset)
        for folder_name in input_file:
            if is_folder_ignored(folder_name):
                # we ignore datasets that have been created by pytables, don't need them anymore
                continue
@@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list,
            print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
-            if event_skipper is not None:
-                folder_data = folder_data[skips]
-                print('Event Skipper: Shape and dtype of dataset ' +
-                      folder_name + ': ' + str(folder_data.shape) +
-                      ' ; ' + str(folder_data.dtype))
            if n == 0:
                # first file; create the dummy dataset with no max shape
                maxshape = (None,) + folder_data.shape[1:]  # change shape of axis zero to None
@@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list,
                output_dataset.resize(cum_rows_list[-1], axis=0)
            else:
-                file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data
+                file_output[folder_name][cum_rows_list[n]:cum_rows_list[n+1]] = folder_data
        file_output.flush()

--- a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py
+++ b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py
@@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
        pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
        pipe.drain()
+        # copy the used_files dataset to the new file
+        copy_used_files(filepath_input, filepath_output)
        if delete:
            os.remove(filepath_input)
-        output_file_filepath = filepath_output if delete is False else filepath_input
-        output_file_shuffled = h5py.File(output_file_filepath, 'r+')
        # delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them
+        output_file_shuffled = h5py.File(filepath_output, 'r+')
        for folder_name in output_file_shuffled:
            if folder_name.startswith('_i_'):
                del output_file_shuffled[folder_name]
@@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
        return output_file_shuffled
+def copy_used_files(source_file, target_file):
+    """
+    Copy the "used_files" dataset from one h5 file to another, if it is present.
+    """
+    with h5py.File(source_file, "r") as src:
+        if "used_files" in src:
+            print("Copying used_files dataset to new file...")
+            used_files = src["used_files"]
+            with h5py.File(target_file, "a") as trg:
+                trg.create_dataset("used_files", data=used_files)
 def main():
    """
    Frontend for the shuffle_h5 function that can be used in a bash environment.

--- a/orcasong_plag/Readme.rst
+++ b/orcasong_plag/Readme.rst
@@ -3,4 +3,4 @@ OrcaSong Plag
 Several changes to the original OrcaSong. Allows to set desired binning via
 a list.
-Does not contain all features of OrcaSong, like skipping events, plotting, etc.
+Does not contain all features of OrcaSong, like getting mchits, plotting, etc.
\ No newline at end of file
--- a/orcasong_plag/core.py
+++ b/orcasong_plag/core.py
+import os
 import km3pipe as kp
 import km3modules as km
-import os
 from orcasong_plag.modules import (TimePreproc,
                                   ImageMaker,
@@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists,
                                               add_hists_to_h5file,
                                               plot_hist_of_files)
+__author__ = 'Stefan Reck'
 class FileBinner:
    """

--- a/orcasong_plag/mc_info_types.py
+++ b/orcasong_plag/mc_info_types.py
@@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield
 in the h5 files.
 """
-import numpy as np
 import warnings
+import numpy as np
+__author__ = 'Stefan Reck'
 def get_mc_info_extr(mc_info_extr):

--- a/orcasong_plag/modules.py
+++ b/orcasong_plag/modules.py
@@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files.
 import km3pipe as kp
 import numpy as np
+__author__ = 'Stefan Reck'
 class McInfoMaker(kp.Module):
    """

--- a/orcasong_plag/util/bin_stats_plot.py
+++ b/orcasong_plag/util/bin_stats_plot.py
@@ -8,6 +8,8 @@ import h5py
 import numpy as np
 import argparse
+__author__ = 'Stefan Reck'
 def plot_hists(hists, save_to, plot_bin_edges=True):
    """

--- a/orcasong_plag/util/binning_1d_visualizer.py
+++ b/orcasong_plag/util/binning_1d_visualizer.py
@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
 from orcasong_plag.modules import time_preproc
+__author__ = 'Stefan Reck'
 class FieldPlotter:
    """

--- a/orcasong_plag/util/split_conc.py
+++ b/orcasong_plag/util/split_conc.py
@@ -26,6 +26,8 @@ Example:
 import os
 import numpy as np
+__author__ = 'Stefan Reck'
 def get_files(folder):
    """

--- a/setup.py
+++ b/setup.py
@@ -9,8 +9,8 @@ setup(
    name='orcasong',
    description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT',
    url='https://git.km3net.de/ml/OrcaSong',
-    author='Michael Moser',
+    author='Michael Moser, Stefan Reck',
-    author_email='mmoser@km3net.de, michael.m.moser@fau.de',
+    author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de',
    license='AGPL',
    install_requires=requirements,
    packages=find_packages(),