From c0257d23875bdeff9cad2294d7874d41d398057a Mon Sep 17 00:00:00 2001 From: Stefan Reck <stefan.reck@fau.de> Date: Mon, 8 Apr 2019 12:31:31 +0200 Subject: [PATCH] Added BinningPlotter to the pipeline. Will automatically save histograms to pdf, showing the distribution of hits among the given bin edges. --- orcasong_plag/core.py | 40 ++++++++---- orcasong_plag/modules.py | 135 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 14 deletions(-) diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py index fd0d4f5..30ba800 100644 --- a/orcasong_plag/core.py +++ b/orcasong_plag/core.py @@ -1,7 +1,8 @@ import km3pipe as kp import km3modules as km +import os -from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker +from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter from orcasong_plag.mc_info_types import get_mc_info_extr @@ -23,6 +24,12 @@ class FileBinner: Function that extracts desired mc_info from a blob, which is then stored as the "y" datafield in the .h5 file. Can also give a str identifier for an existing extractor. + bin_plot_freq : int or None + If int is given, defines after how many blobs data for an overview + histogram is extracted. + It shows the distribution of hits, the bin edges, and how many hits + were cut off for each field name in bin_edges_list. + It will be saved to the same path as the outfile in run. n_statusbar : int, optional Print a statusbar every n blobs. n_memory_observer : int, optional @@ -50,6 +57,7 @@ class FileBinner: def __init__(self, bin_edges_list, mc_info_extr=None): self.bin_edges_list = bin_edges_list self.mc_info_extr = mc_info_extr + self.bin_plot_freq = 20 self.n_statusbar = 200 self.n_memory_observer = 400 @@ -73,7 +81,7 @@ class FileBinner: Path to the output file. """ - name, shape = self.get_name_and_shape() + name, shape = self.get_names_and_shape() print("Generating {} images with shape {}".format(name, shape)) pipe = kp.Pipeline() @@ -88,7 +96,7 @@ class FileBinner: pipe.attach(kp.io.hdf5.HDF5Pump, filenames=infile) - self.attach_binning_modules(pipe) + self.attach_binning_modules(pipe, outfile=outfile) pipe.attach(kp.io.HDF5Sink, filename=outfile, @@ -99,9 +107,9 @@ class FileBinner: pipe.drain() - def attach_binning_modules(self, pipe): + def attach_binning_modules(self, pipe, outfile): """ - Attach modules to transform a blob to images and mc_info to a km3pipe. + Attach modules to a km3pipe which transform a blob to images and mc_info. """ pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader', @@ -113,6 +121,13 @@ class FileBinner: # from orcasong.utils import EventSkipper # pipe.attach(EventSkipper, data_cuts=self.data_cuts) + if self.bin_plot_freq is not None: + pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf" + pipe.attach(BinningPlotter, + bin_plot_freq=self.bin_plot_freq, + bin_edges_list=self.bin_edges_list, + pdf_path=pdf_name) + pipe.attach(ImageMaker, bin_edges_list=self.bin_edges_list, store_as="histogram") @@ -129,20 +144,17 @@ class FileBinner: pipe.attach(km.common.Keep, keys=['histogram', 'mc_info']) - def get_name_and_shape(self): + def get_names_and_shape(self): """ - Get name and shape of the resulting x data, e.g. "pos_z_time", (18, 50). + Get names and shape of the resulting x data, e.g. (pos_z, time), (18, 50). """ - res_names, shape = [], [] + names, shape = [], [] for bin_name, bin_edges in self.bin_edges_list: - res_names.append(bin_name) + names.append(bin_name) shape.append(len(bin_edges) - 1) - name = "_".join(res_names) - shape = tuple(shape) - - return name, shape + return tuple(names), tuple(shape) def __repr__(self): - name, shape = self.get_name_and_shape() + name, shape = self.get_names_and_shape() return "<FileBinner: {} {}>".format(name, shape) diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py index 6361aa7..139bd42 100644 --- a/orcasong_plag/modules.py +++ b/orcasong_plag/modules.py @@ -4,6 +4,9 @@ Custom km3pipe modules for making nn input files. import km3pipe as kp import numpy as np +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import matplotlib.ticker as ticker class McInfoMaker(kp.Module): @@ -114,3 +117,135 @@ class ImageMaker(kp.Module): blob[self.store_as] = kp_hist return blob + + +class BinningPlotter(kp.Module): + """ + Save a histogram of the number of hits for each binning field name. + + E.g. if the bin_edges_list contains "pos_z", this will save a histogram + of #Hits vs. "pos_z" as a pdf, together with how many hits were outside + of the bin edges in both directions. + + Per default, the resolution of the histogram (width of bins) will be + higher then the given bin edges, and the edges will be plotted as horizontal + lines. The time is the exception: The plotted bins have exactly the + given bin edges. + + Attributes + ---------- + bin_edges_list : List + List with the names of the fields to bin, and the respective bin edges, + including the left- and right-most bin edge. + pdf_path : str + Where to save the hists to. This pdf will contain all the field names + on their own page each. + bin_plot_freq : int + Extract data for the hitograms only every given number of blobs + (reduces time the pipeline takes to complete). + res_increase : int + Increase the number of bins by this much in the plot (so that one + can see if the edges have been placed correctly). Is never used + for the time binning (field name "time"). + plot_bin_edges : bool + If true, will plot the bin edges as horizontal lines. Is never used + for the time binning (field name "time"). + + """ + def configure(self): + self.bin_edges_list = self.require('bin_edges_list') + self.pdf_path = self.require('pdf_path') + self.bin_plot_freq = self.get("bin_plot_freq", default=20) + self.res_increase = self.get('res_increase', default=5) + self.plot_bin_edges = self.get('plot_bin_edges', default=True) + + self.hists = {} + for bin_name, bin_edges in self._yield_spaced_bin_edges(): + self.hists[bin_name] = { + "hist": np.zeros(len(bin_edges) - 1), + "out_pos": 0, + "out_neg": 0, + } + + self.i = 0 + + def _space_bin_edges(self, bin_edges): + """ + Increase resolution of given binning. + """ + increased_n_bins = (len(bin_edges) - 1) * self.res_increase + 1 + bin_edges = np.linspace(bin_edges[0], bin_edges[-1], + increased_n_bins) + + return bin_edges + + def _yield_spaced_bin_edges(self): + for bin_name, bin_edges in self.bin_edges_list: + if bin_name != "time": + bin_edges = self._space_bin_edges(bin_edges) + yield bin_name, bin_edges + + def process(self, blob): + """ + Extract data from blob for the hist plots. + """ + if self.i % self.bin_plot_freq == 0: + for bin_name, bin_edges in self._yield_spaced_bin_edges(): + data = blob["Hits"][bin_name] + hist = np.histogram(data, bins=bin_edges)[0] + + out_pos = data[data > np.max(bin_edges)].size + out_neg = data[data < np.min(bin_edges)].size + + self.hists[bin_name]["hist"] += hist + self.hists[bin_name]["out_pos"] += out_pos + self.hists[bin_name]["out_neg"] += out_neg + + self.i += 1 + return blob + + def finish(self): + """ + Make and save the histograms to pdf. + """ + with PdfPages(self.pdf_path) as pdf_file: + for bin_name, org_bin_edges in self.bin_edges_list: + hist = self.hists[bin_name]["hist"] + out_pos = self.hists[bin_name]["out_pos"] + out_neg = self.hists[bin_name]["out_neg"] + hist_frac = hist / (np.sum(hist) + out_pos + out_neg) + + if bin_name != "time": + bin_edges = self._space_bin_edges(org_bin_edges) + else: + bin_edges = org_bin_edges + + bin_spacing = bin_edges[1] - bin_edges[0] + fig, ax = plt.subplots() + plt.bar(bin_edges[:-1], + hist_frac, + align="edge", + width=0.9*bin_spacing, + ) + ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1)) + + if self.plot_bin_edges and bin_name != "time": + for bin_edge in org_bin_edges: + plt.axvline(x=bin_edge, color='grey', linestyle='-', + linewidth=1, alpha=0.9) + + # place a text box in upper left in axes coords + out_pos_rel = out_pos / np.sum(hist) + out_neg_rel = out_neg / np.sum(hist) + textstr = "Hits cut off:\n Left: {:.1%}\n" \ + " Right: {:.1%}".format(out_neg_rel, out_pos_rel) + props = dict(boxstyle='round', facecolor='white', alpha=0.9) + ax.text(0.05, 0.95, textstr, transform=ax.transAxes, + verticalalignment='top', bbox=props) + + plt.xlabel(bin_name) + plt.ylabel("Fraction of hits") + + pdf_file.savefig(fig) + print(bin_name, out_neg, out_pos) + print("Saved binning plot to " + self.pdf_path) -- GitLab