Skip to content
Snippets Groups Projects
Commit ca08e887 authored by Stefan Reck's avatar Stefan Reck
Browse files

Minor adjustments.

- Removed event skipper from concatenate_h5_files (use shuffle instead)
- Fix for shuffle_h5; it now also copies the used_files dataset, if present, to the shuffled file
- Added authorship to orcasong_plag
parent d36a1831
No related branches found
No related tags found
No related merge requests found
...@@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath): ...@@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath):
def concatenate_h5_files(output_filepath, file_list, def concatenate_h5_files(output_filepath, file_list,
chunksize=None, complib=None, complevel=None, chunksize=None, complib=None, complevel=None):
event_skipper=None):
""" """
Function that concatenates hdf5 files based on an output_filepath and a file_list of input files. Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
...@@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list, ...@@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list,
A compression level is only available for gzip compression, not lzf! A compression level is only available for gzip compression, not lzf!
If None, the compression level is read from the first input file. If None, the compression level is read from the first input file.
Else, a custom compression level will be used. Else, a custom compression level will be used.
event_skipper : function, optional
Function that gets the "y" dataset, and returns an array with bools
showing which events to skip (ie not include in the output).
""" """
cum_rows_list = get_cum_number_of_rows(file_list) cum_rows_list = get_cum_number_of_rows(file_list)
...@@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list, ...@@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list,
if 'format_version' in list(input_file.attrs.keys()) and n == 0: if 'format_version' in list(input_file.attrs.keys()) and n == 0:
file_output.attrs['format_version'] = input_file.attrs['format_version'] file_output.attrs['format_version'] = input_file.attrs['format_version']
if event_skipper is not None:
y_dataset = input_file["y"]
skips = event_skipper(y_dataset)
for folder_name in input_file: for folder_name in input_file:
if is_folder_ignored(folder_name): if is_folder_ignored(folder_name):
# we ignore datasets that have been created by pytables, don't need them anymore # we ignore datasets that have been created by pytables, don't need them anymore
continue continue
...@@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list, ...@@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list,
print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype)) print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
if event_skipper is not None:
folder_data = folder_data[skips]
print('Event Skipper: Shape and dtype of dataset ' +
folder_name + ': ' + str(folder_data.shape) +
' ; ' + str(folder_data.dtype))
if n == 0: if n == 0:
# first file; create the dummy dataset with no max shape # first file; create the dummy dataset with no max shape
maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None
...@@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list, ...@@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list,
output_dataset.resize(cum_rows_list[-1], axis=0) output_dataset.resize(cum_rows_list[-1], axis=0)
else: else:
file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data file_output[folder_name][cum_rows_list[n]:cum_rows_list[n+1]] = folder_data
file_output.flush() file_output.flush()
......
...@@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None ...@@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000) pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
pipe.drain() pipe.drain()
# copy the used_files dataset to the new file
copy_used_files(filepath_input, filepath_output)
if delete: if delete:
os.remove(filepath_input) os.remove(filepath_input)
output_file_filepath = filepath_output if delete is False else filepath_input
output_file_shuffled = h5py.File(output_file_filepath, 'r+')
# delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them # delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them
output_file_shuffled = h5py.File(filepath_output, 'r+')
for folder_name in output_file_shuffled: for folder_name in output_file_shuffled:
if folder_name.startswith('_i_'): if folder_name.startswith('_i_'):
del output_file_shuffled[folder_name] del output_file_shuffled[folder_name]
...@@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None ...@@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
return output_file_shuffled return output_file_shuffled
def copy_used_files(source_file, target_file):
    """
    Transfer the "used_files" dataset of an h5 file into another h5 file.

    Nothing is written if the source file has no "used_files" entry.

    Parameters
    ----------
    source_file : str
        Path of the h5 file to read the "used_files" dataset from.
    target_file : str
        Path of the h5 file that receives the dataset (opened in append mode).

    """
    with h5py.File(source_file, "r") as f_in:
        if "used_files" not in f_in:
            return

        print("Copying used_files dataset to new file...")
        dataset = f_in["used_files"]
        # The source stays open while writing, so the dataset handle is valid.
        with h5py.File(target_file, "a") as f_out:
            f_out.create_dataset("used_files", data=dataset)
def main(): def main():
""" """
Frontend for the shuffle_h5 function that can be used in a bash environment. Frontend for the shuffle_h5 function that can be used in a bash environment.
......
...@@ -3,4 +3,4 @@ OrcaSong Plag ...@@ -3,4 +3,4 @@ OrcaSong Plag
Several changes to the original OrcaSong. Allows to set desired binning via Several changes to the original OrcaSong. Allows to set desired binning via
a list. a list.
Does not contain all features of OrcaSong, like skipping events, plotting, etc. Does not contain all features of OrcaSong, like getting mchits, plotting, etc.
\ No newline at end of file \ No newline at end of file
import os
import km3pipe as kp import km3pipe as kp
import km3modules as km import km3modules as km
import os
from orcasong_plag.modules import (TimePreproc, from orcasong_plag.modules import (TimePreproc,
ImageMaker, ImageMaker,
...@@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists, ...@@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists,
add_hists_to_h5file, add_hists_to_h5file,
plot_hist_of_files) plot_hist_of_files)
__author__ = 'Stefan Reck'
class FileBinner: class FileBinner:
""" """
......
...@@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield ...@@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield
in the h5 files. in the h5 files.
""" """
import numpy as np
import warnings import warnings
import numpy as np
__author__ = 'Stefan Reck'
def get_mc_info_extr(mc_info_extr): def get_mc_info_extr(mc_info_extr):
......
...@@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files. ...@@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files.
import km3pipe as kp import km3pipe as kp
import numpy as np import numpy as np
__author__ = 'Stefan Reck'
class McInfoMaker(kp.Module): class McInfoMaker(kp.Module):
""" """
......
...@@ -8,6 +8,8 @@ import h5py ...@@ -8,6 +8,8 @@ import h5py
import numpy as np import numpy as np
import argparse import argparse
__author__ = 'Stefan Reck'
def plot_hists(hists, save_to, plot_bin_edges=True): def plot_hists(hists, save_to, plot_bin_edges=True):
""" """
......
...@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt ...@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
from orcasong_plag.modules import time_preproc from orcasong_plag.modules import time_preproc
__author__ = 'Stefan Reck'
class FieldPlotter: class FieldPlotter:
""" """
......
...@@ -26,6 +26,8 @@ Example: ...@@ -26,6 +26,8 @@ Example:
import os import os
import numpy as np import numpy as np
__author__ = 'Stefan Reck'
def get_files(folder): def get_files(folder):
""" """
......
...@@ -9,8 +9,8 @@ setup( ...@@ -9,8 +9,8 @@ setup(
name='orcasong', name='orcasong',
description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT', description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT',
url='https://git.km3net.de/ml/OrcaSong', url='https://git.km3net.de/ml/OrcaSong',
author='Michael Moser', author='Michael Moser, Stefan Reck',
author_email='mmoser@km3net.de, michael.m.moser@fau.de', author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de',
license='AGPL', license='AGPL',
install_requires=requirements, install_requires=requirements,
packages=find_packages(), packages=find_packages(),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment