Skip to content
Snippets Groups Projects
Commit ca08e887 authored by Stefan Reck's avatar Stefan Reck
Browse files

Minor adjustments.

- Removed event skipper from concatenate_h5_files (use shuffle instead)
- Fixed shuffle_h5; it now also copies the used_files datagroup to the shuffled file, if present
- Added authorship to orcasong_plag
parent d36a1831
No related branches found
No related tags found
No related merge requests found
......@@ -202,8 +202,7 @@ def get_f_compression_and_chunking(filepath):
def concatenate_h5_files(output_filepath, file_list,
chunksize=None, complib=None, complevel=None,
event_skipper=None):
chunksize=None, complib=None, complevel=None):
"""
Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
......@@ -230,9 +229,6 @@ def concatenate_h5_files(output_filepath, file_list,
A compression level is only available for gzip compression, not lzf!
If None, the compression level is read from the first input file.
Else, a custom compression level will be used.
event_skipper : function, optional
Function that gets the "y" dataset, and returns an array with bools
showing which events to skip (ie not include in the output).
"""
cum_rows_list = get_cum_number_of_rows(file_list)
......@@ -255,12 +251,7 @@ def concatenate_h5_files(output_filepath, file_list,
if 'format_version' in list(input_file.attrs.keys()) and n == 0:
file_output.attrs['format_version'] = input_file.attrs['format_version']
if event_skipper is not None:
y_dataset = input_file["y"]
skips = event_skipper(y_dataset)
for folder_name in input_file:
if is_folder_ignored(folder_name):
# we ignore datasets that have been created by pytables, don't need them anymore
continue
......@@ -278,12 +269,6 @@ def concatenate_h5_files(output_filepath, file_list,
print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
if event_skipper is not None:
folder_data = folder_data[skips]
print('Event Skipper: Shape and dtype of dataset ' +
folder_name + ': ' + str(folder_data.shape) +
' ; ' + str(folder_data.dtype))
if n == 0:
# first file; create the dummy dataset with no max shape
maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None
......@@ -295,7 +280,7 @@ def concatenate_h5_files(output_filepath, file_list,
output_dataset.resize(cum_rows_list[-1], axis=0)
else:
file_output[folder_name][cum_rows_list[n]:cum_rows_list[n + 1]] = folder_data
file_output[folder_name][cum_rows_list[n]:cum_rows_list[n+1]] = folder_data
file_output.flush()
......
......@@ -216,13 +216,15 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
pipe.drain()
# copy the used_files dataset to the new file
copy_used_files(filepath_input, filepath_output)
if delete:
os.remove(filepath_input)
output_file_filepath = filepath_output if delete is False else filepath_input
output_file_shuffled = h5py.File(output_file_filepath, 'r+')
# delete folders with '_i_' that are created by pytables in the HDF5Sink, we don't need them
output_file_shuffled = h5py.File(filepath_output, 'r+')
for folder_name in output_file_shuffled:
if folder_name.startswith('_i_'):
del output_file_shuffled[folder_name]
......@@ -266,6 +268,20 @@ def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None
return output_file_shuffled
def copy_used_files(source_file, target_file):
    """
    Copy the "used_files" dataset from one h5 file to another, if it is present.

    Parameters
    ----------
    source_file : str
        Path of the h5 file to read the "used_files" dataset from.
    target_file : str
        Path of the h5 file to write the dataset to (opened in append mode,
        so existing content is kept).
    """
    with h5py.File(source_file, "r") as src:
        # Nothing to do if the source file has no used_files dataset.
        if "used_files" not in src:
            return
        print("Copying used_files dataset to new file...")
        dataset = src["used_files"]
        # Copy while the source file is still open, so h5py can read the data.
        with h5py.File(target_file, "a") as trg:
            trg.create_dataset("used_files", data=dataset)
def main():
"""
Frontend for the shuffle_h5 function that can be used in a bash environment.
......
......@@ -3,4 +3,4 @@ OrcaSong Plag
Several changes to the original OrcaSong. Allows to set desired binning via
a list.
Does not contain all features of OrcaSong, like skipping events, plotting, etc.
\ No newline at end of file
Does not contain all features of OrcaSong, like getting mchits, plotting, etc.
\ No newline at end of file
import os
import km3pipe as kp
import km3modules as km
import os
from orcasong_plag.modules import (TimePreproc,
ImageMaker,
......@@ -12,6 +12,8 @@ from orcasong_plag.util.bin_stats_plot import (plot_hists,
add_hists_to_h5file,
plot_hist_of_files)
__author__ = 'Stefan Reck'
class FileBinner:
"""
......
......@@ -3,8 +3,10 @@ Functions that extract info from a blob for the mc_info / y datafield
in the h5 files.
"""
import numpy as np
import warnings
import numpy as np
__author__ = 'Stefan Reck'
def get_mc_info_extr(mc_info_extr):
......
......@@ -5,6 +5,8 @@ Custom km3pipe modules for making nn input files.
import km3pipe as kp
import numpy as np
__author__ = 'Stefan Reck'
class McInfoMaker(kp.Module):
"""
......
......@@ -8,6 +8,8 @@ import h5py
import numpy as np
import argparse
__author__ = 'Stefan Reck'
def plot_hists(hists, save_to, plot_bin_edges=True):
"""
......
......@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
from orcasong_plag.modules import time_preproc
__author__ = 'Stefan Reck'
class FieldPlotter:
"""
......
......@@ -26,6 +26,8 @@ Example:
import os
import numpy as np
__author__ = 'Stefan Reck'
def get_files(folder):
"""
......
......@@ -9,8 +9,8 @@ setup(
name='orcasong',
description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT',
url='https://git.km3net.de/ml/OrcaSong',
author='Michael Moser',
author_email='mmoser@km3net.de, michael.m.moser@fau.de',
author='Michael Moser, Stefan Reck',
author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de',
license='AGPL',
install_requires=requirements,
packages=find_packages(),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment