Commit 95bcd75e authored by Stefan Reck

Some small additions.

parent b0d29102
@@ -35,8 +35,8 @@ Calling the object like this will show you the binning:
.. code-block:: python
fb
>>> <FileBinner: ('pos_z', 'time') (10, 100)>
>>> fb
<FileBinner: ('pos_z', 'time') (10, 100)>
As you can see, the FileBinner will produce zt data, with 10 and 100 bins,
respectively.
@@ -202,7 +202,8 @@ def get_f_compression_and_chunking(filepath):
def concatenate_h5_files(output_filepath, file_list,
chunksize=None, complib=None, complevel=None):
chunksize=None, complib=None, complevel=None,
event_skipper=None):
"""
Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
@@ -229,6 +230,9 @@ def concatenate_h5_files(output_filepath, file_list,
A compression level is only available for gzip compression, not lzf!
If None, the compression level is read from the first input file.
Else, a custom compression level will be used.
event_skipper : function, optional
Function that takes the "y" dataset and returns a boolean array
indicating which events to skip (i.e. not include in the output).
"""
cum_rows_list = get_cum_number_of_rows(file_list)
@@ -251,6 +255,10 @@ def concatenate_h5_files(output_filepath, file_list,
if 'format_version' in list(input_file.attrs.keys()) and n == 0:
file_output.attrs['format_version'] = input_file.attrs['format_version']
if event_skipper is not None:
y_dataset = input_file["y"]
skips = event_skipper(y_dataset)
for folder_name in input_file:
if is_folder_ignored(folder_name):
@@ -270,6 +278,12 @@ def concatenate_h5_files(output_filepath, file_list,
print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
if event_skipper is not None:
folder_data = folder_data[skips]
print('Event Skipper: Shape and dtype of dataset ' +
folder_name + ': ' + str(folder_data.shape) +
' ; ' + str(folder_data.dtype))
if n == 0:
# first file; create the dummy dataset with no max shape
maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None
@@ -293,6 +307,9 @@ def concatenate_h5_files(output_filepath, file_list,
' is the chunksize in axis_0): \n' + str(file_output[folder_name].shape) + ' ; ' +
str(file_output[folder_name].dtype) + ' ; ' + str(file_output[folder_name].chunks))
file_list_ascii = [n.encode("ascii", "ignore") for n in file_list]
file_output.create_dataset("used_files", data=file_list_ascii)
file_output.close()
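As a usage note for the new event_skipper argument, the following is a minimal sketch of a skipper function. The energy threshold and the assumption that the "y" dataset exposes an "energy" column are hypothetical and not part of this commit:

.. code-block:: python

    import numpy as np

    def skip_low_energy(y_dataset):
        """Return a bool array marking which events to skip.

        Hypothetical example: skip every event with an energy below 1 GeV.
        Assumes the "y" dataset has an "energy" column; adjust to the
        actual mc_info fields of the input files.
        """
        return np.asarray(y_dataset["energy"]) < 1.0

    # concatenate_h5_files("out.h5", ["in_0.h5", "in_1.h5"],
    #                      event_skipper=skip_low_energy)

Depending on whether the returned array is meant as a skip mask (as the docstring says) or is applied directly as an index mask (folder_data = folder_data[skips] above), the condition in such a sketch may need to be inverted.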
@@ -4,6 +4,7 @@ in the h5 files.
"""
import numpy as np
import warnings
def get_mc_info_extr(mc_info_extr):
@@ -24,7 +25,7 @@ def get_mc_info_extr(mc_info_extr):
mc_info_extr = get_event_and_run_id
else:
raise ValueError("Unknown mc_info_type " + mc_info_extr)
raise NameError("Unknown mc_info_type " + mc_info_extr)
return mc_info_extr
@@ -46,6 +47,9 @@ def get_mupage_mc(blob):
"""
For mupage muon simulations.
Will only take into account muons with at least 1 McHit in the active
line of the detector.
e.g. mcv5.1_r3.mupage_10G.km3_AAv1.jterbr00002800.5103.root.h5
Parameters
@@ -64,12 +68,37 @@ def get_mupage_mc(blob):
track = dict()
# total number of simulated muons; not all of them may deposit hits
n_muons_sim = blob['McTracks'].shape[0]
track["n_muons_sim"] = n_muons_sim
# Origin of each mchit (as int) in the active line
in_active_du = blob["McHits"]["du"] == active_du
origin = blob["McHits"]["origin"][in_active_du]
# get how many mchits were produced per muon in the bundle
origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
origin_list = []
for i in range(1, n_muons_sim + 1):
origin_list.append(origin_dict.get(i, 0))
# origin_list holds the number of mchits per simulated muon in the active du
origin_list = np.array(origin_list)
visible_mc_tracks = blob["McTracks"][origin_list > 0]
visible_origin_list = origin_list[origin_list > 0]
# only muons with at least one mchit in active line
n_muons = len(visible_origin_list)
if n_muons == 0:
warnings.warn("No visible muons in blob!")
track["n_muons"] = n_muons
track["event_id"] = blob['EventInfo'].event_id[0]
track["run_id"] = blob["EventInfo"].run_id[0]
# run_id = blob['Header'].start_run.run_id.astype('float32')
# take 0: assumed that this is the same for all muons in a bundle
track["particle_type"] = blob['McTracks'][0].type
track["particle_type"] = visible_mc_tracks[0].type if n_muons != 0 else 0
# always 1 actually
# track["is_cc"] = blob['McTracks'][0].is_cc
@@ -77,33 +106,19 @@ def get_mupage_mc(blob):
# always 0 actually
# track["bjorkeny"] = blob['McTracks'][0].bjorkeny
# same for all muons in a bundle #TODO not?
track["time_interaction"] = blob['McTracks'][0].time
# takes position of time_residual_vertex in 'neutrino' case
n_muons = blob['McTracks'].shape[0]
track["n_muons"] = n_muons
# same for all muons in a bundle TODO not?
track["time_interaction"] = visible_mc_tracks[0].time if n_muons != 0 else 0
# sum up the energy of all muons
energy = blob['McTracks'].energy
# sum up the energy of all visible muons
energy = visible_mc_tracks.energy if n_muons != 0 else 0
track["energy"] = np.sum(energy)
# Origin of each mchit (as int) in the active line
in_active_du = blob["McHits"]["du"] == active_du
origin = blob["McHits"]["origin"][in_active_du]
# get how many mchits were produced per muon in the bundle
origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
origin_list = []
for i in range(1, n_muons+1):
origin_list.append(origin_dict.get(i, 0))
origin_list = np.array(origin_list)
track["n_mc_hits"] = np.sum(origin_list)
desc_order = np.argsort(-origin_list)
track["n_mc_hits"] = np.sum(visible_origin_list)
desc_order = np.argsort(-visible_origin_list)
# Sort by energy, highest first
sorted_energy = energy[desc_order]
sorted_mc_hits = origin_list[desc_order]
sorted_energy = energy[desc_order] if n_muons != 0 else []
sorted_mc_hits = visible_origin_list[desc_order] if n_muons != 0 else []
# Store the number of mchits for the 10 muons with the most mchits (-1 if there are fewer)
for i in range(10):
@@ -127,24 +142,27 @@ def get_mupage_mc(blob):
track[field_name] = field_data
# only muons with at least one mchit in active line
track["n_muons_visible"] = len(origin_list[origin_list > 0])
# only muons with at least 5 mchits in active line
track["n_muons_5_mchits"] = len(origin_list[origin_list > 4])
# only muons with at least 10 mchits in active line
track["n_muons_10_mchits"] = len(origin_list[origin_list > 9])
# all muons in a bundle are parallel, so just take dir of first muon
track["dir_x"] = blob['McTracks'][0].dir_x
track["dir_y"] = blob['McTracks'][0].dir_y
track["dir_z"] = blob['McTracks'][0].dir_z
# vertex is the weighted (energy) mean of the individual vertices
track["vertex_pos_x"] = np.average(blob['McTracks'][:].pos_x,
weights=blob['McTracks'][:].energy)
track["vertex_pos_y"] = np.average(blob['McTracks'][:].pos_y,
weights=blob['McTracks'][:].energy)
track["vertex_pos_z"] = np.average(blob['McTracks'][:].pos_z,
weights=blob['McTracks'][:].energy)
track["dir_x"] = visible_mc_tracks[0].dir_x if n_muons != 0 else 0
track["dir_y"] = visible_mc_tracks[0].dir_y if n_muons != 0 else 0
track["dir_z"] = visible_mc_tracks[0].dir_z if n_muons != 0 else 0
if n_muons != 0:
# vertex is the weighted (energy) mean of the individual vertices
track["vertex_pos_x"] = np.average(visible_mc_tracks[:].pos_x,
weights=visible_mc_tracks[:].energy)
track["vertex_pos_y"] = np.average(visible_mc_tracks[:].pos_y,
weights=visible_mc_tracks[:].energy)
track["vertex_pos_z"] = np.average(visible_mc_tracks[:].pos_z,
weights=visible_mc_tracks[:].energy)
else:
track["vertex_pos_x"] = 0
track["vertex_pos_y"] = 0
track["vertex_pos_z"] = 0
return track
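The bookkeeping above hinges on counting McHits per muon via np.unique over the origin column. A small, self-contained sketch of that counting step, with made-up origin ids (independent of km3pipe and of this commit):

.. code-block:: python

    import numpy as np

    # Hypothetical origins of 7 mchits in the active DU; muon ids start at 1.
    origin = np.array([1, 1, 3, 3, 3, 5, 5])
    n_muons_sim = 5  # muons 2 and 4 left no mchits in the active DU

    # Count mchits per muon id, defaulting to 0 for muons without hits.
    origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
    origin_list = np.array([origin_dict.get(i, 0)
                            for i in range(1, n_muons_sim + 1)])
    # origin_list is now [2, 0, 3, 0, 2]

    # Muons with at least one mchit count as visible.
    visible = origin_list > 0   # 3 of the 5 simulated muons are visible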
import os
import numpy as np
def get_files(folder):
"""
Get paths of all h5 files in the given folder.
"""
infiles = os.listdir(folder)
infiles.sort()
infile_paths = []
for infile in infiles:
if infile.endswith(".h5"):
infile_paths.append(os.path.join(folder, infile))
return np.array(infile_paths)
def split_path_list(files, train_frac, n_train_files, n_val_files):
"""
Split the files into train and val sets according to the given fraction,
and distribute them over the given number of output files.
Parameters
----------
files : List
The files.
train_frac : float
The fraction of files to use for the train set.
n_train_files : int
Total number of resulting train files.
n_val_files : int
Total number of resulting val files.
Returns
-------
job_files_train : list of ndarray
The train files, one array per output train file. They are chosen randomly
from the files list; in total they make up the given fraction of the input files.
job_files_val : list of ndarray
The val files, split analogously to the train files.
"""
if n_train_files < 1 or n_val_files < 1:
raise ValueError("Need at least 1 file for train and val.")
order = np.arange(len(files))
np.random.shuffle(order)
len_train_files = int(len(files) * train_frac)
train_files = files[order[:len_train_files]]
val_files = files[order[len_train_files:]]
job_files_train = np.array_split(train_files, n_train_files)
job_files_val = np.array_split(val_files, n_val_files)
for fs in job_files_train:
if len(fs) == 0:
raise ValueError("No files for an output train file!")
for fs in job_files_val:
if len(fs) == 0:
raise ValueError("No files for an output val file!")
return job_files_train, job_files_val
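A short usage sketch of split_path_list with placeholder file names (the paths and the seed are made up for illustration):

.. code-block:: python

    import numpy as np

    files = np.array(["a.h5", "b.h5", "c.h5", "d.h5", "e.h5"])
    np.random.seed(42)  # only to make the shuffle reproducible in this sketch

    # 80 % of the files go to the train set, split over 2 train files;
    # the remaining files go into 1 val file.
    job_files_train, job_files_val = split_path_list(
        files, train_frac=0.8, n_train_files=2, n_val_files=1)

    for i, fs in enumerate(job_files_train):
        print("train file", i, "gets", list(fs))
    print("val file 0 gets", list(job_files_val[0]))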
def get_split(folder, outfile_basestr, n_train_files=1, n_val_files=1,
train_frac=0.8):
"""
Prepare to concatenate binned .h5 files into training and validation files.
The files in each train or val file will be drawn randomly from the
available files. Each train or val file will be created by its own
separately submitted job.
Parameters
----------
folder : str
Folder containing the files to concatenate.
n_train_files : int
Total number of resulting train files.
n_val_files : int
Total number of resulting val files.
outfile_basestr : str
Path and a base for the name. "train"/"val" and a file number will
get automatically attached to the name.
train_frac : float
The fraction of files in the train set.
Returns
-------
jobs : list of dict
Each dict contains the arguments for the concatenate script.
"""
files = get_files(folder)
job_files_train, job_files_val = split_path_list(files, train_frac, n_train_files, n_val_files)
jobs = []
for i, job_files in enumerate(job_files_train):
output_filepath = "{}_train_{}.h5".format(outfile_basestr, i)
job_dict = {
"file_list": job_files,
"output_filepath": output_filepath
}
jobs.append(job_dict)
for i, job_files in enumerate(job_files_val):
output_filepath = "{}_val_{}.h5".format(outfile_basestr, i)
job_dict = {
"file_list": job_files,
"output_filepath": output_filepath
}
jobs.append(job_dict)
return jobs
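Finally, a sketch of how the returned job list could be consumed. The folder and output base path are placeholders, and feeding each entry to concatenate_h5_files is one possible way to run the jobs, not something this commit does itself:

.. code-block:: python

    jobs = get_split("/path/to/binned_files", "/path/to/output/dataset",
                     n_train_files=2, n_val_files=1, train_frac=0.8)

    for job in jobs:
        # Each entry carries exactly the arguments prepared above.
        print(job["output_filepath"], ":", len(job["file_list"]), "input files")
        # e.g. one concatenation job per entry:
        # concatenate_h5_files(job["output_filepath"], job["file_list"])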