Skip to content
Snippets Groups Projects
Commit c0257d23 authored by Stefan Reck's avatar Stefan Reck
Browse files

Added BinningPlotter to the pipeline.

Will automatically save histograms to pdf, showing the distribution of
hits among the given bin edges.
parent 7ec5b1c3
No related branches found
No related tags found
No related merge requests found
import km3pipe as kp
import km3modules as km
import os
from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker
from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter
from orcasong_plag.mc_info_types import get_mc_info_extr
......@@ -23,6 +24,12 @@ class FileBinner:
Function that extracts desired mc_info from a blob, which is then
stored as the "y" datafield in the .h5 file.
Can also give a str identifier for an existing extractor.
bin_plot_freq : int or None
If int is given, defines after how many blobs data for an overview
histogram is extracted.
It shows the distribution of hits, the bin edges, and how many hits
were cut off for each field name in bin_edges_list.
It will be saved to the same path as the outfile in run.
n_statusbar : int, optional
Print a statusbar every n blobs.
n_memory_observer : int, optional
......@@ -50,6 +57,7 @@ class FileBinner:
def __init__(self, bin_edges_list, mc_info_extr=None):
self.bin_edges_list = bin_edges_list
self.mc_info_extr = mc_info_extr
self.bin_plot_freq = 20
self.n_statusbar = 200
self.n_memory_observer = 400
......@@ -73,7 +81,7 @@ class FileBinner:
Path to the output file.
"""
name, shape = self.get_name_and_shape()
name, shape = self.get_names_and_shape()
print("Generating {} images with shape {}".format(name, shape))
pipe = kp.Pipeline()
......@@ -88,7 +96,7 @@ class FileBinner:
pipe.attach(kp.io.hdf5.HDF5Pump, filenames=infile)
self.attach_binning_modules(pipe)
self.attach_binning_modules(pipe, outfile=outfile)
pipe.attach(kp.io.HDF5Sink,
filename=outfile,
......@@ -99,9 +107,9 @@ class FileBinner:
pipe.drain()
def attach_binning_modules(self, pipe):
def attach_binning_modules(self, pipe, outfile):
"""
Attach modules to transform a blob to images and mc_info to a km3pipe.
Attach modules to a km3pipe which transform a blob to images and mc_info.
"""
pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader',
......@@ -113,6 +121,13 @@ class FileBinner:
# from orcasong.utils import EventSkipper
# pipe.attach(EventSkipper, data_cuts=self.data_cuts)
if self.bin_plot_freq is not None:
pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf"
pipe.attach(BinningPlotter,
bin_plot_freq=self.bin_plot_freq,
bin_edges_list=self.bin_edges_list,
pdf_path=pdf_name)
pipe.attach(ImageMaker,
bin_edges_list=self.bin_edges_list,
store_as="histogram")
......@@ -129,20 +144,17 @@ class FileBinner:
pipe.attach(km.common.Keep, keys=['histogram', 'mc_info'])
def get_name_and_shape(self):
def get_names_and_shape(self):
"""
Get name and shape of the resulting x data, e.g. "pos_z_time", (18, 50).
Get names and shape of the resulting x data, e.g. (pos_z, time), (18, 50).
"""
res_names, shape = [], []
names, shape = [], []
for bin_name, bin_edges in self.bin_edges_list:
res_names.append(bin_name)
names.append(bin_name)
shape.append(len(bin_edges) - 1)
name = "_".join(res_names)
shape = tuple(shape)
return name, shape
return tuple(names), tuple(shape)
def __repr__(self):
name, shape = self.get_name_and_shape()
name, shape = self.get_names_and_shape()
return "<FileBinner: {} {}>".format(name, shape)
......@@ -4,6 +4,9 @@ Custom km3pipe modules for making nn input files.
import km3pipe as kp
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.ticker as ticker
class McInfoMaker(kp.Module):
......@@ -114,3 +117,135 @@ class ImageMaker(kp.Module):
blob[self.store_as] = kp_hist
return blob
class BinningPlotter(kp.Module):
"""
Save a histogram of the number of hits for each binning field name.
E.g. if the bin_edges_list contains "pos_z", this will save a histogram
of #Hits vs. "pos_z" as a pdf, together with how many hits were outside
of the bin edges in both directions.
Per default, the resolution of the histogram (width of bins) will be
higher then the given bin edges, and the edges will be plotted as horizontal
lines. The time is the exception: The plotted bins have exactly the
given bin edges.
Attributes
----------
bin_edges_list : List
List with the names of the fields to bin, and the respective bin edges,
including the left- and right-most bin edge.
pdf_path : str
Where to save the hists to. This pdf will contain all the field names
on their own page each.
bin_plot_freq : int
Extract data for the hitograms only every given number of blobs
(reduces time the pipeline takes to complete).
res_increase : int
Increase the number of bins by this much in the plot (so that one
can see if the edges have been placed correctly). Is never used
for the time binning (field name "time").
plot_bin_edges : bool
If true, will plot the bin edges as horizontal lines. Is never used
for the time binning (field name "time").
"""
def configure(self):
self.bin_edges_list = self.require('bin_edges_list')
self.pdf_path = self.require('pdf_path')
self.bin_plot_freq = self.get("bin_plot_freq", default=20)
self.res_increase = self.get('res_increase', default=5)
self.plot_bin_edges = self.get('plot_bin_edges', default=True)
self.hists = {}
for bin_name, bin_edges in self._yield_spaced_bin_edges():
self.hists[bin_name] = {
"hist": np.zeros(len(bin_edges) - 1),
"out_pos": 0,
"out_neg": 0,
}
self.i = 0
def _space_bin_edges(self, bin_edges):
"""
Increase resolution of given binning.
"""
increased_n_bins = (len(bin_edges) - 1) * self.res_increase + 1
bin_edges = np.linspace(bin_edges[0], bin_edges[-1],
increased_n_bins)
return bin_edges
def _yield_spaced_bin_edges(self):
for bin_name, bin_edges in self.bin_edges_list:
if bin_name != "time":
bin_edges = self._space_bin_edges(bin_edges)
yield bin_name, bin_edges
def process(self, blob):
"""
Extract data from blob for the hist plots.
"""
if self.i % self.bin_plot_freq == 0:
for bin_name, bin_edges in self._yield_spaced_bin_edges():
data = blob["Hits"][bin_name]
hist = np.histogram(data, bins=bin_edges)[0]
out_pos = data[data > np.max(bin_edges)].size
out_neg = data[data < np.min(bin_edges)].size
self.hists[bin_name]["hist"] += hist
self.hists[bin_name]["out_pos"] += out_pos
self.hists[bin_name]["out_neg"] += out_neg
self.i += 1
return blob
def finish(self):
"""
Make and save the histograms to pdf.
"""
with PdfPages(self.pdf_path) as pdf_file:
for bin_name, org_bin_edges in self.bin_edges_list:
hist = self.hists[bin_name]["hist"]
out_pos = self.hists[bin_name]["out_pos"]
out_neg = self.hists[bin_name]["out_neg"]
hist_frac = hist / (np.sum(hist) + out_pos + out_neg)
if bin_name != "time":
bin_edges = self._space_bin_edges(org_bin_edges)
else:
bin_edges = org_bin_edges
bin_spacing = bin_edges[1] - bin_edges[0]
fig, ax = plt.subplots()
plt.bar(bin_edges[:-1],
hist_frac,
align="edge",
width=0.9*bin_spacing,
)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
if self.plot_bin_edges and bin_name != "time":
for bin_edge in org_bin_edges:
plt.axvline(x=bin_edge, color='grey', linestyle='-',
linewidth=1, alpha=0.9)
# place a text box in upper left in axes coords
out_pos_rel = out_pos / np.sum(hist)
out_neg_rel = out_neg / np.sum(hist)
textstr = "Hits cut off:\n Left: {:.1%}\n" \
" Right: {:.1%}".format(out_neg_rel, out_pos_rel)
props = dict(boxstyle='round', facecolor='white', alpha=0.9)
ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
verticalalignment='top', bbox=props)
plt.xlabel(bin_name)
plt.ylabel("Fraction of hits")
pdf_file.savefig(fig)
print(bin_name, out_neg, out_pos)
print("Saved binning plot to " + self.pdf_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment