From ca08e887c5212374c7adcf02f79478d42b8e71a2 Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Fri, 3 May 2019 12:33:26 +0200
Subject: [PATCH] Minor adjustments. - Removed event skipper from
 concatenate_h5_files (use shuffle instead) - Fix for shuffle_h5, and it also
 copies used_files dataset if present to shuffled file - Added authorship to
 orcasong_plag

---
 .../data_tools/concatenate/concatenate_h5.py  | 19 ++--------------
 .../data_tools/shuffle/shuffle_h5.py          | 22 ++++++++++++++++---
 orcasong_plag/Readme.rst                      |  2 +-
 orcasong_plag/core.py                         |  4 +++-
 orcasong_plag/mc_info_types.py                |  4 +++-
 orcasong_plag/modules.py                      |  2 ++
 orcasong_plag/util/bin_stats_plot.py          |  2 ++
 orcasong_plag/util/binning_1d_visualizer.py   |  2 ++
 orcasong_plag/util/split_conc.py              |  2 ++
 setup.py                                      |  4 ++--
 10 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
index 05f302b..096bc83 100644
--- a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
+++ b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
@@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath):
 
 
 def concatenate_h5_files(output_filepath, file_list,
-                         chunksize=None, complib=None, complevel=None,
-                         event_skipper=None):
+                         chunksize=None, complib=None, complevel=None):
     """
     Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
 
@@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list,
         A compression level is only available for gzip compression, not lzf!
         If None, the compression level is read from the first input file.
         Else, a custom compression level will be used.
-    event_skipper : function, optional
-        Function that gets the "y" dataset, and returns an array with bools
-        showing which events to skip (ie not include in the output).
 
     """
     cum_rows_list = get_cum_number_of_rows(file_list)
@@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list,
         if 'format_version' in list(input_file.attrs.keys()) and n == 0:
             file_output.attrs['format_version'] = input_file.attrs['format_version']
 
-        if event_skipper is not None:
-            y_dataset = input_file["y"]
-            skips = event_skipper(y_dataset)
-
         for folder_name in input_file:
-
             if is_folder_ignored(folder_name):
                 # we ignore datasets that have been created by pytables, don't need them anymore
                 continue
@@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list,
 
             print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
 
-            if event_skipper is not None:
-                folder_data = folder_data[skips]
-                print('Event Skipper: Shape and dtype of dataset ' +
-                      folder_name + ': ' + str(folder_data.shape) +
-                      ' ; ' + str(folder_data.dtype))
-
             if n == 0:
                 # first file; create the dummy dataset with no max shape
                 maxshape = (None,) + folder_data.shape[1:]  # change shape of axis zero to None
@@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list,
                 output_dataset.resize(cum_rows_list[-1], axis=0)
 
             else:
-                file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data
+                file_output[folder_name][cum_rows_list[n]:cum_rows_list[n+1]] = folder_data
 
         file_output.flush()
 
diff --git a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py
index 5c66710..56cdbc3 100644
--- a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py
+++ b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py
@@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
 
         pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
         pipe.drain()
+
+        # copy the used_files dataset to the new file
+        copy_used_files(filepath_input, filepath_output)
+
         if delete:
             os.remove(filepath_input)
 
-        output_file_filepath = filepath_output if delete is False else filepath_input
-        output_file_shuffled = h5py.File(output_file_filepath, 'r+')
-
         # delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them
+        output_file_shuffled = h5py.File(filepath_output, 'r+')
         for folder_name in output_file_shuffled:
             if folder_name.startswith('_i_'):
                 del output_file_shuffled[folder_name]
@@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
         return output_file_shuffled
 
 
+def copy_used_files(source_file, target_file):
+    """
+    Copy the "used_files" dataset from one h5 file to another, if it is present.
+    Both arguments are handed to h5py.File: source as "r", target as "a".
+    """
+    with h5py.File(source_file, "r") as src:
+        if "used_files" in src:  # otherwise nothing is printed or copied
+            print("Copying used_files dataset to new file...")
+            used_files = src["used_files"]
+
+            with h5py.File(target_file, "a") as trg:
+                trg.create_dataset("used_files", data=used_files)
+
+
 def main():
     """
     Frontend for the shuffle_h5 function that can be used in a bash environment.
diff --git a/orcasong_plag/Readme.rst b/orcasong_plag/Readme.rst
index 72527dd..7950420 100644
--- a/orcasong_plag/Readme.rst
+++ b/orcasong_plag/Readme.rst
@@ -3,4 +3,4 @@ OrcaSong Plag
 
 Several changes to the original OrcaSong. Allows to set desired binning via
 a list.
-Does not contain all features of OrcaSong, like skipping events, plotting, etc.
\ No newline at end of file
+Does not contain all features of OrcaSong, like getting mchits, plotting, etc.
\ No newline at end of file
diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py
index e8883c1..20750bc 100644
--- a/orcasong_plag/core.py
+++ b/orcasong_plag/core.py
@@ -1,6 +1,6 @@
+import os
 import km3pipe as kp
 import km3modules as km
-import os
 
 from orcasong_plag.modules import (TimePreproc,
                                    ImageMaker,
@@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists,
                                                add_hists_to_h5file,
                                                plot_hist_of_files)
 
+__author__ = 'Stefan Reck'
+
 
 class FileBinner:
     """
diff --git a/orcasong_plag/mc_info_types.py b/orcasong_plag/mc_info_types.py
index 8398cd1..cf33217 100644
--- a/orcasong_plag/mc_info_types.py
+++ b/orcasong_plag/mc_info_types.py
@@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield
 in the h5 files.
 """
 
-import numpy as np
 import warnings
+import numpy as np
+
+__author__ = 'Stefan Reck'
 
 
 def get_mc_info_extr(mc_info_extr):
diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py
index 30e8860..797f42a 100644
--- a/orcasong_plag/modules.py
+++ b/orcasong_plag/modules.py
@@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files.
 import km3pipe as kp
 import numpy as np
 
+__author__ = 'Stefan Reck'
+
 
 class McInfoMaker(kp.Module):
     """
diff --git a/orcasong_plag/util/bin_stats_plot.py b/orcasong_plag/util/bin_stats_plot.py
index eb0c6f5..2a02923 100644
--- a/orcasong_plag/util/bin_stats_plot.py
+++ b/orcasong_plag/util/bin_stats_plot.py
@@ -8,6 +8,8 @@ import h5py
 import numpy as np
 import argparse
 
+__author__ = 'Stefan Reck'
+
 
 def plot_hists(hists, save_to, plot_bin_edges=True):
     """
diff --git a/orcasong_plag/util/binning_1d_visualizer.py b/orcasong_plag/util/binning_1d_visualizer.py
index ae97a6b..916748c 100644
--- a/orcasong_plag/util/binning_1d_visualizer.py
+++ b/orcasong_plag/util/binning_1d_visualizer.py
@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
 
 from orcasong_plag.modules import time_preproc
 
+__author__ = 'Stefan Reck'
+
 
 class FieldPlotter:
     """
diff --git a/orcasong_plag/util/split_conc.py b/orcasong_plag/util/split_conc.py
index 209bd6a..7b311b7 100644
--- a/orcasong_plag/util/split_conc.py
+++ b/orcasong_plag/util/split_conc.py
@@ -26,6 +26,8 @@ Example:
 import os
 import numpy as np
 
+__author__ = 'Stefan Reck'
+
 
 def get_files(folder):
     """
diff --git a/setup.py b/setup.py
index ff1742a..0e67ae5 100644
--- a/setup.py
+++ b/setup.py
@@ -9,8 +9,8 @@ setup(
     name='orcasong',
     description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT',
     url='https://git.km3net.de/ml/OrcaSong',
-    author='Michael Moser',
-    author_email='mmoser@km3net.de, michael.m.moser@fau.de',
+    author='Michael Moser, Stefan Reck',
+    author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de',
     license='AGPL',
     install_requires=requirements,
     packages=find_packages(),
-- 
GitLab