From 2b130099d6ca6f389bea98b5dad5b32f1ff3b766 Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Tue, 9 Apr 2019 11:15:27 +0200
Subject: [PATCH] One binning plot for multiple files.

---
 orcasong_plag/core.py    |  80 +++++++++++++++--------
 orcasong_plag/modules.py | 135 ++++++++++++++++++++++-----------------
 2 files changed, 131 insertions(+), 84 deletions(-)

diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py
index 30ba800..355a438 100644
--- a/orcasong_plag/core.py
+++ b/orcasong_plag/core.py
@@ -2,7 +2,7 @@ import km3pipe as kp
 import km3modules as km
 import os
 
-from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter
+from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter, plot_hists
 from orcasong_plag.mc_info_types import get_mc_info_extr
 
 
@@ -59,8 +59,8 @@ class FileBinner:
         self.mc_info_extr = mc_info_extr
         self.bin_plot_freq = 20
 
-        self.n_statusbar = 200
-        self.n_memory_observer = 400
+        self.n_statusbar = 1000
+        self.n_memory_observer = 1000
         self.do_time_preproc = True
         # self.data_cuts = None
 
@@ -75,8 +75,8 @@ class FileBinner:
 
         Parameters
         ----------
-        infile : str or List
-            Path to the input file(s).
+        infile : str
+            Path to the input file.
         outfile : str
             Path to the output file.
 
@@ -84,34 +84,50 @@ class FileBinner:
         name, shape = self.get_names_and_shape()
         print("Generating {} images with shape {}".format(name, shape))
 
-        pipe = kp.Pipeline()
+        plot_hists = self.bin_plot_freq is not None
 
-        if self.n_statusbar is not None:
-            pipe.attach(km.common.StatusBar, every=self.n_statusbar)
-        if self.n_memory_observer is not None:
-            pipe.attach(km.common.MemoryObserver, every=400)
+        pipe = self.build_pipe(infile, outfile, plot_hists=plot_hists)
+        pipe.drain()
 
-        if not isinstance(infile, list):
-            infile = [infile]
+    def run_multi(self, infiles, outfolder):
+        """
+        Bin multiple files into their own output files each.
 
-        pipe.attach(kp.io.hdf5.HDF5Pump, filenames=infile)
+        Will also generate a summary binning plot for all of the files.
 
-        self.attach_binning_modules(pipe, outfile=outfile)
+        Parameters
+        ----------
+        infiles : List
+            The path to infiles as str.
+        outfolder : str
+            The output folder to place them in.
 
-        pipe.attach(kp.io.HDF5Sink,
-                    filename=outfile,
-                    complib=self.complib,
-                    complevel=self.complevel,
-                    chunksize=self.chunksize,
-                    flush_frequency=self.flush_frequency)
+        """
+        hists = None
+        for infile in infiles:
+            outfile_name = os.path.splitext(os.path.basename(infile))[0] + "_hist.h5"
+            outfile = outfolder + outfile_name
+
+            pipe = self.build_pipe(infile, outfile,
+                                   plot_hists=False, hists_start=hists)
+            smry = pipe.drain()
+            hists = smry["BinningPlotter"]
+        plot_hists(hists, pdf_path=outfolder + "binning_summary.pdf")
+
+    def build_pipe(self, infile, outfile, plot_hists=True, hists_start=None):
+        """
+        Build the pipe to generate images and mc_info for a file.
+        """
 
-        pipe.drain()
+        pipe = kp.Pipeline()
 
-    def attach_binning_modules(self, pipe, outfile):
-        """
-        Attach modules to a km3pipe which transform a blob to images and mc_info.
+        if self.n_statusbar is not None:
+            pipe.attach(km.common.StatusBar, every=self.n_statusbar)
+        if self.n_memory_observer is not None:
+            pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer)
+
+        pipe.attach(kp.io.hdf5.HDF5Pump, filename=infile)
 
-        """
         pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader',
                                           'McTracks', 'Hits', 'McHits'])
         if self.do_time_preproc:
@@ -122,10 +138,14 @@ class FileBinner:
         #     pipe.attach(EventSkipper, data_cuts=self.data_cuts)
 
         if self.bin_plot_freq is not None:
-            pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf"
+            if plot_hists:
+                pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf"
+            else:
+                pdf_name = None
             pipe.attach(BinningPlotter,
                         bin_plot_freq=self.bin_plot_freq,
                         bin_edges_list=self.bin_edges_list,
+                        hists_start=hists_start,
                         pdf_path=pdf_name)
 
         pipe.attach(ImageMaker,
@@ -144,6 +164,14 @@ class FileBinner:
 
         pipe.attach(km.common.Keep, keys=['histogram', 'mc_info'])
 
+        pipe.attach(kp.io.HDF5Sink,
+                    filename=outfile,
+                    complib=self.complib,
+                    complevel=self.complevel,
+                    chunksize=self.chunksize,
+                    flush_frequency=self.flush_frequency)
+        return pipe
+
     def get_names_and_shape(self):
         """
         Get names and shape of the resulting x data, e.g. (pos_z, time), (18, 50).
diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py
index 139bd42..9ddcb65 100644
--- a/orcasong_plag/modules.py
+++ b/orcasong_plag/modules.py
@@ -137,7 +137,7 @@ class BinningPlotter(kp.Module):
     bin_edges_list : List
         List with the names of the fields to bin, and the respective bin edges,
         including the left- and right-most bin edge.
-    pdf_path : str
+    pdf_path : str, optional
         Where to save the hists to. This pdf will contain all the field names
         on their own page each.
     bin_plot_freq : int
@@ -150,22 +150,35 @@ class BinningPlotter(kp.Module):
     plot_bin_edges : bool
         If true, will plot the bin edges as horizontal lines. Is never used
         for the time binning (field name "time").
+    hists_start : dict, optional
+        Starting values for the statistics.
 
     """
     def configure(self):
         self.bin_edges_list = self.require('bin_edges_list')
-        self.pdf_path = self.require('pdf_path')
+        self.pdf_path = self.get('pdf_path', default=None)
         self.bin_plot_freq = self.get("bin_plot_freq", default=20)
         self.res_increase = self.get('res_increase', default=5)
         self.plot_bin_edges = self.get('plot_bin_edges', default=True)
+        self.hists_start = self.get('hists_start', default=None)
 
-        self.hists = {}
-        for bin_name, bin_edges in self._yield_spaced_bin_edges():
-            self.hists[bin_name] = {
-                "hist": np.zeros(len(bin_edges) - 1),
-                "out_pos": 0,
-                "out_neg": 0,
-            }
+        if self.hists_start is None:
+            self.hists = {}
+            for bin_name, org_bin_edges in self.bin_edges_list:
+                if bin_name == "time":
+                    bin_edges = org_bin_edges
+                else:
+                    bin_edges = self._space_bin_edges(org_bin_edges)
+
+                self.hists[bin_name] = {
+                    "hist": np.zeros(len(bin_edges) - 1),
+                    "hist_bin_edges": bin_edges,
+                    "bin_edges": org_bin_edges,
+                    "out_pos": 0,
+                    "out_neg": 0,
+                }
+        else:
+            self.hists = self.hists_start
 
         self.i = 0
 
@@ -179,23 +192,19 @@ class BinningPlotter(kp.Module):
 
         return bin_edges
 
-    def _yield_spaced_bin_edges(self):
-        for bin_name, bin_edges in self.bin_edges_list:
-            if bin_name != "time":
-                bin_edges = self._space_bin_edges(bin_edges)
-            yield bin_name, bin_edges
-
     def process(self, blob):
         """
         Extract data from blob for the hist plots.
         """
         if self.i % self.bin_plot_freq == 0:
-            for bin_name, bin_edges in self._yield_spaced_bin_edges():
+            for bin_name, hists_data in self.hists.items():
+                hist_bin_edges = hists_data["hist_bin_edges"]
+
                 data = blob["Hits"][bin_name]
-                hist = np.histogram(data, bins=bin_edges)[0]
+                hist = np.histogram(data, bins=hist_bin_edges)[0]
 
-                out_pos = data[data > np.max(bin_edges)].size
-                out_neg = data[data < np.min(bin_edges)].size
+                out_pos = data[data > np.max(hist_bin_edges)].size
+                out_neg = data[data < np.min(hist_bin_edges)].size
 
                 self.hists[bin_name]["hist"] += hist
                 self.hists[bin_name]["out_pos"] += out_pos
@@ -208,44 +217,54 @@ class BinningPlotter(kp.Module):
         """
         Make and save the histograms to pdf.
         """
-        with PdfPages(self.pdf_path) as pdf_file:
-            for bin_name, org_bin_edges in self.bin_edges_list:
-                hist = self.hists[bin_name]["hist"]
-                out_pos = self.hists[bin_name]["out_pos"]
-                out_neg = self.hists[bin_name]["out_neg"]
-                hist_frac = hist / (np.sum(hist) + out_pos + out_neg)
+        if self.pdf_path is not None:
+            plot_hists(self.hists, self.pdf_path,
+                       plot_bin_edges=self.plot_bin_edges)
 
-                if bin_name != "time":
-                    bin_edges = self._space_bin_edges(org_bin_edges)
-                else:
-                    bin_edges = org_bin_edges
+        return self.hists
 
-                bin_spacing = bin_edges[1] - bin_edges[0]
-                fig, ax = plt.subplots()
-                plt.bar(bin_edges[:-1],
-                        hist_frac,
-                        align="edge",
-                        width=0.9*bin_spacing,
-                        )
-                ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
-
-                if self.plot_bin_edges and bin_name != "time":
-                    for bin_edge in org_bin_edges:
-                        plt.axvline(x=bin_edge, color='grey', linestyle='-',
-                                    linewidth=1, alpha=0.9)
-
-                # place a text box in upper left in axes coords
-                out_pos_rel = out_pos / np.sum(hist)
-                out_neg_rel = out_neg / np.sum(hist)
-                textstr = "Hits cut off:\n Left: {:.1%}\n" \
-                          " Right: {:.1%}".format(out_neg_rel, out_pos_rel)
-                props = dict(boxstyle='round', facecolor='white', alpha=0.9)
-                ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
-                        verticalalignment='top', bbox=props)
-
-                plt.xlabel(bin_name)
-                plt.ylabel("Fraction of hits")
-
-                pdf_file.savefig(fig)
-                print(bin_name, out_neg, out_pos)
-        print("Saved binning plot to " + self.pdf_path)
+
+def plot_hists(hists, pdf_path, plot_bin_edges=True):
+    """
+    Plot histograms made by the BinningPlotter to the given pdf path.
+
+    """
+    with PdfPages(pdf_path) as pdf_file:
+        for bin_name, hists_data in hists.items():
+            hist_bin_edges = hists_data["hist_bin_edges"]
+            bin_edges = hists_data["bin_edges"]
+            hist = hists_data["hist"]
+            out_pos = hists_data["out_pos"]
+            out_neg = hists_data["out_neg"]
+
+            hist_frac = hist / (np.sum(hist) + out_pos + out_neg)
+
+            bin_spacing = hist_bin_edges[1] - hist_bin_edges[0]
+            fig, ax = plt.subplots()
+            plt.bar(hist_bin_edges[:-1],
+                    hist_frac,
+                    align="edge",
+                    width=0.9 * bin_spacing,
+                    )
+            ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
+
+            if plot_bin_edges and bin_name != "time":
+                for bin_edge in bin_edges:
+                    plt.axvline(x=bin_edge, color='grey', linestyle='-',
+                                linewidth=1, alpha=0.9)
+
+            # place a text box in upper left in axes coords
+            out_pos_rel = out_pos / np.sum(hist)
+            out_neg_rel = out_neg / np.sum(hist)
+            textstr = "Hits cut off:\n Left: {:.1%}\n" \
+                      " Right: {:.1%}".format(out_neg_rel, out_pos_rel)
+            props = dict(boxstyle='round', facecolor='white', alpha=0.9)
+            ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
+                    verticalalignment='top', bbox=props)
+
+            plt.xlabel(bin_name)
+            plt.ylabel("Fraction of hits")
+
+            pdf_file.savefig(fig)
+
+    print("Saved binning plot to " + pdf_path)
-- 
GitLab