From c0257d23875bdeff9cad2294d7874d41d398057a Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Mon, 8 Apr 2019 12:31:31 +0200
Subject: [PATCH] Added BinningPlotter to the pipeline. Will automatically save
 histograms to pdf, showing the distribution of hits among the given bin
 edges.

---
 orcasong_plag/core.py    |  40 ++++++++----
 orcasong_plag/modules.py | 135 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+), 14 deletions(-)

diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py
index fd0d4f5..30ba800 100644
--- a/orcasong_plag/core.py
+++ b/orcasong_plag/core.py
@@ -1,7 +1,8 @@
 import km3pipe as kp
 import km3modules as km
+import os
 
-from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker
+from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter
 from orcasong_plag.mc_info_types import get_mc_info_extr
 
 
@@ -23,6 +24,12 @@ class FileBinner:
         Function that extracts desired mc_info from a blob, which is then
         stored as the "y" datafield in the .h5 file.
         Can also give a str identifier for an existing extractor.
+    bin_plot_freq : int or None
+        If int is given, defines after how many blobs data for an overview
+        histogram is extracted.
+        It shows the distribution of hits, the bin edges, and how many hits
+        were cut off for each field name in bin_edges_list.
+        It will be saved to the same path as the outfile in run.
     n_statusbar : int, optional
         Print a statusbar every n blobs.
     n_memory_observer : int, optional
@@ -50,6 +57,7 @@ class FileBinner:
     def __init__(self, bin_edges_list, mc_info_extr=None):
         self.bin_edges_list = bin_edges_list
         self.mc_info_extr = mc_info_extr
+        self.bin_plot_freq = 20
 
         self.n_statusbar = 200
         self.n_memory_observer = 400
@@ -73,7 +81,7 @@ class FileBinner:
             Path to the output file.
 
         """
-        name, shape = self.get_name_and_shape()
+        name, shape = self.get_names_and_shape()
         print("Generating {} images with shape {}".format(name, shape))
 
         pipe = kp.Pipeline()
@@ -88,7 +96,7 @@ class FileBinner:
 
         pipe.attach(kp.io.hdf5.HDF5Pump, filenames=infile)
 
-        self.attach_binning_modules(pipe)
+        self.attach_binning_modules(pipe, outfile=outfile)
 
         pipe.attach(kp.io.HDF5Sink,
                     filename=outfile,
@@ -99,9 +107,9 @@ class FileBinner:
 
         pipe.drain()
 
-    def attach_binning_modules(self, pipe):
+    def attach_binning_modules(self, pipe, outfile):
         """
-        Attach modules to transform a blob to images and mc_info to a km3pipe.
+        Attach modules to a km3pipe which transform a blob to images and mc_info.
 
         """
         pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader',
@@ -113,6 +121,13 @@ class FileBinner:
         #     from orcasong.utils import EventSkipper
         #     pipe.attach(EventSkipper, data_cuts=self.data_cuts)
 
+        if self.bin_plot_freq is not None:
+            pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf"
+            pipe.attach(BinningPlotter,
+                        bin_plot_freq=self.bin_plot_freq,
+                        bin_edges_list=self.bin_edges_list,
+                        pdf_path=pdf_name)
+
         pipe.attach(ImageMaker,
                     bin_edges_list=self.bin_edges_list,
                     store_as="histogram")
@@ -129,20 +144,17 @@ class FileBinner:
 
         pipe.attach(km.common.Keep, keys=['histogram', 'mc_info'])
 
-    def get_name_and_shape(self):
+    def get_names_and_shape(self):
         """
-        Get name and shape of the resulting x data, e.g. "pos_z_time", (18, 50).
+        Get names and shape of the resulting x data, e.g. (pos_z, time), (18, 50).
         """
-        res_names, shape = [], []
+        names, shape = [], []
         for bin_name, bin_edges in self.bin_edges_list:
-            res_names.append(bin_name)
+            names.append(bin_name)
             shape.append(len(bin_edges) - 1)
 
-        name = "_".join(res_names)
-        shape = tuple(shape)
-
-        return name, shape
+        return tuple(names), tuple(shape)
 
     def __repr__(self):
-        name, shape = self.get_name_and_shape()
+        name, shape = self.get_names_and_shape()
         return "<FileBinner: {} {}>".format(name, shape)
diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py
index 6361aa7..139bd42 100644
--- a/orcasong_plag/modules.py
+++ b/orcasong_plag/modules.py
@@ -4,6 +4,9 @@ Custom km3pipe modules for making nn input files.
 
 import km3pipe as kp
 import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import matplotlib.ticker as ticker
 
 
 class McInfoMaker(kp.Module):
@@ -114,3 +117,135 @@ class ImageMaker(kp.Module):
 
         blob[self.store_as] = kp_hist
         return blob
+
+
+class BinningPlotter(kp.Module):
+    """
+    Save a histogram of the number of hits for each binning field name.
+
+    E.g. if the bin_edges_list contains "pos_z", this will save a histogram
+    of #Hits vs. "pos_z" as a pdf, together with how many hits were outside
+    of the bin edges in both directions.
+
+    Per default, the resolution of the histogram (width of bins) will be
+    higher then the given bin edges, and the edges will be plotted as horizontal
+    lines. The time is the exception: The plotted bins have exactly the
+    given bin edges.
+
+    Attributes
+    ----------
+    bin_edges_list : List
+        List with the names of the fields to bin, and the respective bin edges,
+        including the left- and right-most bin edge.
+    pdf_path : str
+        Where to save the hists to. This pdf will contain all the field names
+        on their own page each.
+    bin_plot_freq : int
+        Extract data for the hitograms only every given number of blobs
+        (reduces time the pipeline takes to complete).
+    res_increase : int
+        Increase the number of bins by this much in the plot (so that one
+        can see if the edges have been placed correctly). Is never used
+        for the time binning (field name "time").
+    plot_bin_edges : bool
+        If true, will plot the bin edges as horizontal lines. Is never used
+        for the time binning (field name "time").
+
+    """
+    def configure(self):
+        self.bin_edges_list = self.require('bin_edges_list')
+        self.pdf_path = self.require('pdf_path')
+        self.bin_plot_freq = self.get("bin_plot_freq", default=20)
+        self.res_increase = self.get('res_increase', default=5)
+        self.plot_bin_edges = self.get('plot_bin_edges', default=True)
+
+        self.hists = {}
+        for bin_name, bin_edges in self._yield_spaced_bin_edges():
+            self.hists[bin_name] = {
+                "hist": np.zeros(len(bin_edges) - 1),
+                "out_pos": 0,
+                "out_neg": 0,
+            }
+
+        self.i = 0
+
+    def _space_bin_edges(self, bin_edges):
+        """
+        Increase resolution of given binning.
+        """
+        increased_n_bins = (len(bin_edges) - 1) * self.res_increase + 1
+        bin_edges = np.linspace(bin_edges[0], bin_edges[-1],
+                                increased_n_bins)
+
+        return bin_edges
+
+    def _yield_spaced_bin_edges(self):
+        for bin_name, bin_edges in self.bin_edges_list:
+            if bin_name != "time":
+                bin_edges = self._space_bin_edges(bin_edges)
+            yield bin_name, bin_edges
+
+    def process(self, blob):
+        """
+        Extract data from blob for the hist plots.
+        """
+        if self.i % self.bin_plot_freq == 0:
+            for bin_name, bin_edges in self._yield_spaced_bin_edges():
+                data = blob["Hits"][bin_name]
+                hist = np.histogram(data, bins=bin_edges)[0]
+
+                out_pos = data[data > np.max(bin_edges)].size
+                out_neg = data[data < np.min(bin_edges)].size
+
+                self.hists[bin_name]["hist"] += hist
+                self.hists[bin_name]["out_pos"] += out_pos
+                self.hists[bin_name]["out_neg"] += out_neg
+
+        self.i += 1
+        return blob
+
+    def finish(self):
+        """
+        Make and save the histograms to pdf.
+        """
+        with PdfPages(self.pdf_path) as pdf_file:
+            for bin_name, org_bin_edges in self.bin_edges_list:
+                hist = self.hists[bin_name]["hist"]
+                out_pos = self.hists[bin_name]["out_pos"]
+                out_neg = self.hists[bin_name]["out_neg"]
+                hist_frac = hist / (np.sum(hist) + out_pos + out_neg)
+
+                if bin_name != "time":
+                    bin_edges = self._space_bin_edges(org_bin_edges)
+                else:
+                    bin_edges = org_bin_edges
+
+                bin_spacing = bin_edges[1] - bin_edges[0]
+                fig, ax = plt.subplots()
+                plt.bar(bin_edges[:-1],
+                        hist_frac,
+                        align="edge",
+                        width=0.9*bin_spacing,
+                        )
+                ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
+
+                if self.plot_bin_edges and bin_name != "time":
+                    for bin_edge in org_bin_edges:
+                        plt.axvline(x=bin_edge, color='grey', linestyle='-',
+                                    linewidth=1, alpha=0.9)
+
+                # place a text box in upper left in axes coords
+                out_pos_rel = out_pos / np.sum(hist)
+                out_neg_rel = out_neg / np.sum(hist)
+                textstr = "Hits cut off:\n Left: {:.1%}\n" \
+                          " Right: {:.1%}".format(out_neg_rel, out_pos_rel)
+                props = dict(boxstyle='round', facecolor='white', alpha=0.9)
+                ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
+                        verticalalignment='top', bbox=props)
+
+                plt.xlabel(bin_name)
+                plt.ylabel("Fraction of hits")
+
+                pdf_file.savefig(fig)
+                print(bin_name, out_neg, out_pos)
+        print("Saved binning plot to " + self.pdf_path)
-- 
GitLab