From 34ee2feaed5399d82badd8675bdb2c0b9552bc1a Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Tue, 9 Apr 2019 13:16:30 +0200
Subject: [PATCH] Bin stats are now added to each h5 file.

---
 orcasong_plag/core.py                |  81 ++++++++-----
 orcasong_plag/modules.py             | 128 ++++++--------------
 orcasong_plag/util/bin_stats_plot.py | 172 +++++++++++++++++++++++++++
 3 files changed, 260 insertions(+), 121 deletions(-)
 create mode 100644 orcasong_plag/util/bin_stats_plot.py

diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py
index 355a438..cfefa2f 100644
--- a/orcasong_plag/core.py
+++ b/orcasong_plag/core.py
@@ -2,13 +2,18 @@ import km3pipe as kp
 import km3modules as km
 import os
 
-from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter, plot_hists
+from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningStatsMaker
 from orcasong_plag.mc_info_types import get_mc_info_extr
+from orcasong_plag.util.bin_stats_plot import plot_hists, add_hists_to_h5file, plot_hist_of_files
 
 
 class FileBinner:
     """
-    For making binned images.
+    For making binned images and mc_infos, which can be used for conv. nets.
+
+    Can also add statistics of the binning to the h5 files, which can
+    be plotted to show the distribution of hits among the bins and how
+    many hits were cut off.
 
     Attributes
     ----------
@@ -54,10 +59,14 @@ class FileBinner:
         but it increases the RAM usage as well.
 
     """
-    def __init__(self, bin_edges_list, mc_info_extr=None):
+    def __init__(self, bin_edges_list, mc_info_extr=None, add_bin_stats=True):
         self.bin_edges_list = bin_edges_list
         self.mc_info_extr = mc_info_extr
-        self.bin_plot_freq = 20
+
+        if add_bin_stats:
+            self.bin_plot_freq = 1
+        else:
+            self.bin_plot_freq = None
 
         self.n_statusbar = 1000
         self.n_memory_observer = 1000
@@ -69,52 +78,70 @@ class FileBinner:
         self.complevel = 1
         self.flush_frequency = 1000
 
-    def run(self, infile, outfile):
+    def run(self, infile, outfile, save_plot=False):
         """
-        Build the pipeline to make images for the given file.
+        Make images for a file.
 
         Parameters
         ----------
         infile : str
             Path to the input file.
         outfile : str
-            Path to the output file.
+            Path to the output file (will be created).
+        save_plot : bool
+            Save the binning hists as a pdf. Only possible if bin_plot_freq
+            is not None.
 
         """
+        if save_plot and self.bin_plot_freq is None:
+            raise ValueError("Can not make plot when add_bin_stats is False")
+
         name, shape = self.get_names_and_shape()
         print("Generating {} images with shape {}".format(name, shape))
 
-        plot_hists = self.bin_plot_freq is not None
+        pipe = self.build_pipe(infile, outfile)
+        smry = pipe.drain()
+
+        if self.bin_plot_freq is not None:
+            hists = smry["BinningStatsMaker"]
+            add_hists_to_h5file(hists, outfile)
 
-        pipe = self.build_pipe(infile, outfile, plot_hists=plot_hists)
-        pipe.drain()
+            if save_plot:
+                save_as = os.path.splitext(outfile)[0] + "_hists.pdf"
+                plot_hists(hists, save_as)
 
-    def run_multi(self, infiles, outfolder):
+    def run_multi(self, infiles, outfolder, save_plot=False):
         """
         Bin multiple files into their own output files each.
 
-        Will also generate a summary binning plot for all of the files.
-
         Parameters
         ----------
         infiles : List
             The path to infiles as str.
         outfolder : str
-            The output folder to place them in.
+            The output folder to place them in. The output file name will
+            be generated automatically.
+        save_plot : bool
+            Save the binning hists as a pdf. Only possible if bin_plot_freq
+            is not None.
 
         """
-        hists = None
+        if save_plot and self.bin_plot_freq is None:
+            raise ValueError("Can not make plot when add_bin_stats is False")
+
+        outfiles = []
         for infile in infiles:
-            outfile_name = os.path.splitext(os.path.basename(infile))[0] + "_hist.h5"
+            outfile_name = os.path.splitext(os.path.basename(infile))[0] \
+                           + "_hist.h5"
             outfile = outfolder + outfile_name
+            outfiles.append(outfile)
 
-            pipe = self.build_pipe(infile, outfile,
-                                   plot_hists=False, hists_start=hists)
-            smry = pipe.drain()
-            hists = smry["BinningPlotter"]
-        plot_hists(hists, pdf_path=outfolder + "binning_summary.pdf")
+            self.run(infile, outfile, save_plot=False)
 
-    def build_pipe(self, infile, outfile, plot_hists=True, hists_start=None):
+        if save_plot:
+            plot_hist_of_files(outfiles, save_as=outfolder+"binning_hist.pdf")
+
+    def build_pipe(self, infile, outfile):
         """
         Build the pipe to generate images and mc_info for a file.
         """
@@ -138,15 +165,9 @@ class FileBinner:
         #     pipe.attach(EventSkipper, data_cuts=self.data_cuts)
 
         if self.bin_plot_freq is not None:
-            if plot_hists:
-                pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf"
-            else:
-                pdf_name = None
-            pipe.attach(BinningPlotter,
+            pipe.attach(BinningStatsMaker,
                         bin_plot_freq=self.bin_plot_freq,
-                        bin_edges_list=self.bin_edges_list,
-                        hists_start=hists_start,
-                        pdf_path=pdf_name)
+                        bin_edges_list=self.bin_edges_list)
 
         pipe.attach(ImageMaker,
                     bin_edges_list=self.bin_edges_list,
diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py
index 9ddcb65..72c615a 100644
--- a/orcasong_plag/modules.py
+++ b/orcasong_plag/modules.py
@@ -4,9 +4,6 @@ Custom km3pipe modules for making nn input files.
 
 import km3pipe as kp
 import numpy as np
-import matplotlib.pyplot as plt
-from matplotlib.backends.backend_pdf import PdfPages
-import matplotlib.ticker as ticker
 
 
 class McInfoMaker(kp.Module):
@@ -119,17 +116,17 @@ class ImageMaker(kp.Module):
         return blob
 
 
-class BinningPlotter(kp.Module):
+class BinningStatsMaker(kp.Module):
     """
-    Save a histogram of the number of hits for each binning field name.
+    Generate a histogram of the number of hits for each binning field name.
 
-    E.g. if the bin_edges_list contains "pos_z", this will save a histogram
-    of #Hits vs. "pos_z" as a pdf, together with how many hits were outside
+    E.g. if the bin_edges_list contains "pos_z", this will make a histogram
+    of #Hits vs. "pos_z", together with how many hits were outside
     of the bin edges in both directions.
 
     Per default, the resolution of the histogram (width of bins) will be
-    higher then the given bin edges, and the edges will be plotted as horizontal
-    lines. The time is the exception: The plotted bins have exactly the
+    higher then the given bin edges, and the edges will be stored seperatly.
+    The time is the exception: The plotted bins have exactly the
     given bin edges.
 
     Attributes
@@ -137,48 +134,37 @@ class BinningPlotter(kp.Module):
     bin_edges_list : List
         List with the names of the fields to bin, and the respective bin edges,
         including the left- and right-most bin edge.
-    pdf_path : str, optional
-        Where to save the hists to. This pdf will contain all the field names
-        on their own page each.
     bin_plot_freq : int
-        Extract data for the hitograms only every given number of blobs
+        Extract data for the histograms only every given number of blobs
         (reduces time the pipeline takes to complete).
     res_increase : int
-        Increase the number of bins by this much in the plot (so that one
+        Increase the number of bins by this much in the hists (so that one
         can see if the edges have been placed correctly). Is never used
         for the time binning (field name "time").
-    plot_bin_edges : bool
-        If true, will plot the bin edges as horizontal lines. Is never used
-        for the time binning (field name "time").
-    hists_start : dict, optional
-        Starting values for the statistics.
 
     """
     def configure(self):
         self.bin_edges_list = self.require('bin_edges_list')
+
         self.pdf_path = self.get('pdf_path', default=None)
-        self.bin_plot_freq = self.get("bin_plot_freq", default=20)
+        self.bin_plot_freq = self.get("bin_plot_freq", default=1)
         self.res_increase = self.get('res_increase', default=5)
         self.plot_bin_edges = self.get('plot_bin_edges', default=True)
-        self.hists_start = self.get('hists_start', default=None)
-
-        if self.hists_start is None:
-            self.hists = {}
-            for bin_name, org_bin_edges in self.bin_edges_list:
-                if bin_name == "time":
-                    bin_edges = org_bin_edges
-                else:
-                    bin_edges = self._space_bin_edges(org_bin_edges)
-
-                self.hists[bin_name] = {
-                    "hist": np.zeros(len(bin_edges) - 1),
-                    "hist_bin_edges": bin_edges,
-                    "bin_edges": org_bin_edges,
-                    "out_pos": 0,
-                    "out_neg": 0,
-                }
-        else:
-            self.hists = self.hists_start
+
+        self.hists = {}
+        for bin_name, org_bin_edges in self.bin_edges_list:
+            if bin_name == "time":
+                bin_edges = org_bin_edges
+            else:
+                bin_edges = self._space_bin_edges(org_bin_edges)
+
+            self.hists[bin_name] = {
+                "hist": np.zeros(len(bin_edges) - 1),
+                "hist_bin_edges": bin_edges,
+                "bin_edges": org_bin_edges,
+                # below smallest edge, above largest edge:
+                "cut_off": np.zeros(2),
+            }
 
         self.i = 0
 
@@ -207,64 +193,24 @@ class BinningPlotter(kp.Module):
                 out_neg = data[data < np.min(hist_bin_edges)].size
 
                 self.hists[bin_name]["hist"] += hist
-                self.hists[bin_name]["out_pos"] += out_pos
-                self.hists[bin_name]["out_neg"] += out_neg
+                self.hists[bin_name]["cut_off"] += np.array([out_neg, out_pos])
 
         self.i += 1
         return blob
 
     def finish(self):
         """
-        Make and save the histograms to pdf.
-        """
-        if self.pdf_path is not None:
-            plot_hists(self.hists, self.pdf_path,
-                       plot_bin_edges=self.plot_bin_edges)
-
-        return self.hists
+        Get the hists, which are the stats of the binning.
 
+        Its a dict with each binning field name containing the following
+        ndarrays:
 
-def plot_hists(hists, pdf_path, plot_bin_edges=True):
-    """
-    Plot histograms made by the BinningPlotter to the given pdf path.
+        bin_edges : The actual bin edges.
+        cut_off : How many events were cut off in positive and negative
+            direction due to this binning.
+        hist_bin_edges : The bin edges for the plot in finer resolution then
+            the actual bin edges.
+        hist : The number of hist in each bin of the hist_bin_edges.
 
-    """
-    with PdfPages(pdf_path) as pdf_file:
-        for bin_name, hists_data in hists.items():
-            hist_bin_edges = hists_data["hist_bin_edges"]
-            bin_edges = hists_data["bin_edges"]
-            hist = hists_data["hist"]
-            out_pos = hists_data["out_pos"]
-            out_neg = hists_data["out_neg"]
-
-            hist_frac = hist / (np.sum(hist) + out_pos + out_neg)
-
-            bin_spacing = hist_bin_edges[1] - hist_bin_edges[0]
-            fig, ax = plt.subplots()
-            plt.bar(hist_bin_edges[:-1],
-                    hist_frac,
-                    align="edge",
-                    width=0.9 * bin_spacing,
-                    )
-            ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
-
-            if plot_bin_edges and bin_name != "time":
-                for bin_edge in bin_edges:
-                    plt.axvline(x=bin_edge, color='grey', linestyle='-',
-                                linewidth=1, alpha=0.9)
-
-            # place a text box in upper left in axes coords
-            out_pos_rel = out_pos / np.sum(hist)
-            out_neg_rel = out_neg / np.sum(hist)
-            textstr = "Hits cut off:\n Left: {:.1%}\n" \
-                      " Right: {:.1%}".format(out_neg_rel, out_pos_rel)
-            props = dict(boxstyle='round', facecolor='white', alpha=0.9)
-            ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
-                    verticalalignment='top', bbox=props)
-
-            plt.xlabel(bin_name)
-            plt.ylabel("Fraction of hits")
-
-            pdf_file.savefig(fig)
-
-    print("Saved binning plot to " + pdf_path)
+        """
+        return self.hists
diff --git a/orcasong_plag/util/bin_stats_plot.py b/orcasong_plag/util/bin_stats_plot.py
new file mode 100644
index 0000000..83c149f
--- /dev/null
+++ b/orcasong_plag/util/bin_stats_plot.py
@@ -0,0 +1,172 @@
+"""
+Functions for plotting the bin stats made by the BinningStatsMaker module.
+"""
+
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import matplotlib.ticker as ticker
+import h5py
+import numpy as np
+
+
+def plot_hists(hists, save_to, plot_bin_edges=True):
+    """
+    Plot histograms made by the BinningStatsMaker to the given pdf path.
+
+    Parameters
+    ----------
+    hists : dict or List
+        Dicts with the info about the binning, generated by the BinningStatsMaker.
+        Can also be a list of dicts (for multiple files), which will be
+        combined to one plot.
+    save_to : str
+        Where to save the plot to.
+    plot_bin_edges : bool
+        If true, will plot the bin edges as horizontal lines. Is never used
+        for the time binning (field name "time").
+
+    """
+    if isinstance(hists, list):
+        hists = combine_hists(hists)
+
+    with PdfPages(save_to) as pdf_file:
+        for bin_name, hists_data in hists.items():
+            hist_bin_edges = hists_data["hist_bin_edges"]
+            bin_edges = hists_data["bin_edges"]
+            hist = hists_data["hist"]
+            cut_off = hists_data["cut_off"]
+            total_hits = np.sum(hist) + np.sum(cut_off)
+
+            hist_frac = hist / total_hits
+
+            bin_spacing = hist_bin_edges[1] - hist_bin_edges[0]
+            fig, ax = plt.subplots()
+            plt.bar(hist_bin_edges[:-1],
+                    hist_frac,
+                    align="edge",
+                    width=0.9 * bin_spacing,
+                    )
+            ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
+
+            if plot_bin_edges and bin_name != "time":
+                for bin_edge in bin_edges:
+                    plt.axvline(x=bin_edge, color='grey', linestyle='-',
+                                linewidth=1, alpha=0.9)
+
+            # place a text box in upper left in axes coords
+            out_neg_rel = cut_off[0] / total_hits
+            out_pos_rel = cut_off[1] / total_hits
+            textstr = "Hits cut off:\n Left: {:.1%}\n" \
+                      " Right: {:.1%}".format(out_neg_rel, out_pos_rel)
+            props = dict(boxstyle='round', facecolor='white', alpha=0.9)
+            ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
+                    verticalalignment='top', bbox=props)
+
+            plt.xlabel(bin_name)
+            plt.ylabel("Fraction of hits")
+
+            pdf_file.savefig(fig)
+
+    print("Saved binning plot to " + save_to)
+
+
+def combine_hists(hists_list):
+    """
+    Combine the hists for multiple files into a single big one.
+
+    Parameters
+    ----------
+    hists_list : List
+        A list of dicts (hists from the BinningStatsMaker for different files).
+
+    Returns
+    -------
+    combined_hists : Dict
+        The combined hist.
+
+    """
+    # initialize combined dict according to first dict in list
+    combined_hists = {}
+    for bin_name, hists_data in hists_list[0].items():
+        hist_bin_edges = hists_data["hist_bin_edges"]
+        bin_edges = hists_data["bin_edges"]
+        bin_name = bin_name
+
+        combined_hists[bin_name] = {
+            "hist": np.zeros(len(hist_bin_edges) - 1),
+            "hist_bin_edges": hist_bin_edges,
+            "bin_edges": bin_edges,
+            # below smallest edge, above largest edge:
+            "cut_off": np.zeros(2),
+        }
+
+    # add them together
+    for hists in hists_list:
+        for bin_name, hists_data in hists.items():
+            # name and bin edges must be the same for all hists
+            if bin_name not in combined_hists:
+                raise NameError(
+                    "Hists dont all have the same binning field name:"
+                    "{}".format(bin_name))
+
+            bin_edges = hists_data["bin_edges"]
+            comb_edges = combined_hists[bin_name]["bin_edges"]
+            if len(bin_edges) != len(comb_edges):
+                raise ValueError("Hists have different hist bin edges: {} vs {}"
+                                 .format(bin_edges, comb_edges))
+
+            hist_bin_edges = hists_data["hist_bin_edges"]
+            comb_hist_edges = combined_hists[bin_name]["hist_bin_edges"]
+            if len(hist_bin_edges) != len(comb_hist_edges):
+                raise ValueError("Hists have different bin edges: {} vs {}"
+                                 .format(hist_bin_edges, comb_hist_edges))
+
+            combined_hists[bin_name]["hist"] += hists_data["hist"]
+            combined_hists[bin_name]["cut_off"] += hists_data["cut_off"]
+
+    return combined_hists
+
+
+def add_hists_to_h5file(hists, file):
+    """
+    Add the binning stats as groups to a h5 file.
+
+    Parameters
+    ----------
+    hists : dict
+        The histst from the BinningStatsMaker module.
+    file : str
+        Path to the h5 file.
+
+    """
+    base_name = "bin_stats/"
+    with h5py.File(file, "a") as f:
+        for bin_name, hists_data in hists.items():
+            for key, val in hists_data.items():
+                h5_folder = base_name + bin_name + "/" + key
+                f.create_dataset(h5_folder, data=val)
+
+
+def plot_hist_of_files(files, save_as):
+    """
+    Plot the binning stats of a list of files.
+
+    Parameters
+    ----------
+    files : List
+        Path of the files as str.
+    save_as : str
+        Where to save the plot to.
+
+    """
+    hists_list = []
+    opened_files = []
+    for file in files:
+        f = h5py.File(file, "r")
+        hists_list.append(f["bin_stats/"])
+        opened_files.append(f)
+
+    plot_hists(hists_list, save_as)
+
+    for file in opened_files:
+        file.close()
-- 
GitLab