plot_binstats.py

"""
Run with a parser to plot the binning statistics.
Functions for plotting the bin stats made by the BinningStatsMaker module.
"""

import os
import warnings
import argparse
import matplotlib
matplotlib.use("pdf")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import h5py
import numpy as np


__author__ = 'Stefan Reck'


def plot_hists(hists_list, save_to, plot_bin_edges=True):
    """
    Plot histograms made by the BinningStatsMaker to the given pdf path.

    Parameters
    ----------
    hists_list : List
        Can also be a list of dicts (for multiple files), which will be
        combined to one plot.
        Each dict has info about the binning, generated by the BinningStatsMaker.
    save_to : str
        Where to save the plot to.
    plot_bin_edges : bool
        If true, will plot the bin edges as horizontal lines. Is never used
        for the time binning (field name "time").

    """
    hists = combine_hists(hists_list)

    if os.path.exists(save_to):
        raise FileExistsError(f"File {save_to} exists already!")

    with PdfPages(save_to) as pdf_file:
        for bin_name, hists_data in hists.items():
            hist_bin_edges = hists_data["hist_bin_edges"]
            bin_edges = hists_data["bin_edges"]
            hist = hists_data["hist"]
            cut_off = hists_data["cut_off"]
            total_hits = np.sum(hist) + np.sum(cut_off)

            bin_widths = np.diff(hist_bin_edges)
            hist_prob = hist / bin_widths / np.sum(hist)

            fig, ax = plt.subplots()
            plt.bar(
                hist_bin_edges[:-1],
                hist_prob,
                align="edge",
                width=bin_widths
            )

            if plot_bin_edges and bin_name != "time":
                for bin_edge in bin_edges:
                    plt.axvline(
                        x=bin_edge, color='grey', linestyle='-',
                        linewidth=1, alpha=0.9
                    )

            # place a text box in upper left in axes coords
            out_neg_rel = cut_off[0] / total_hits
            out_pos_rel = cut_off[1] / total_hits
            textstr = "Hits cut off:\nLeft: {:.1%}\nRight: {:.1%}".format(
                out_neg_rel, out_pos_rel)
            props = dict(boxstyle='round', facecolor='white', alpha=0.9)
            ax.text(
                0.95, 0.95, textstr, transform=ax.transAxes,
                verticalalignment='top', bbox=props,
                horizontalalignment="right"
            )

            # the auto ticks are nice even numbers
            ylims = [0, ax.get_yticks().max()]
            ax.set_ylim(ylims)

            plt.xlabel(bin_name)
            plt.ylabel("Density of hits")

            pdf_file.savefig(fig)

    print("Saved binning plot to " + save_to)


def combine_hists(hists_list):
    """
    Combine the hists for multiple files into a single big one.

    Parameters
    ----------
    hists_list : List
        A list of dicts (hists from the BinningStatsMaker for different files).

    Returns
    -------
    combined_hists : Dict
        The combined hist.

    """
    if len(hists_list) == 1:
        # just one entry, no need to combine
        return hists_list[0]

    # initialize combined dict according to first dict in list
    combined_hists = {}
    for bin_name, hists_data in hists_list[0].items():
        hist_bin_edges = hists_data["hist_bin_edges"]
        bin_edges = hists_data["bin_edges"]
        bin_name = bin_name

        combined_hists[bin_name] = {
            "hist": np.zeros(len(hist_bin_edges) - 1),
            "hist_bin_edges": hist_bin_edges,
            "bin_edges": bin_edges,
            # below smallest edge, above largest edge:
            "cut_off": np.zeros(2),
        }

    # add them together
    for hists in hists_list:
        for bin_name, hists_data in hists.items():
            # name and bin edges must be the same for all hists
            if bin_name not in combined_hists:
                raise NameError(
                    "Hists dont all have the same binning field name:"
                    "{}".format(bin_name))

            bin_edges = hists_data["bin_edges"]
            comb_edges = combined_hists[bin_name]["bin_edges"]
            if len(bin_edges) != len(comb_edges):
                raise ValueError("Hists have different hist bin edges: {} vs {}"
                                 .format(bin_edges, comb_edges))

            hist_bin_edges = hists_data["hist_bin_edges"]
            comb_hist_edges = combined_hists[bin_name]["hist_bin_edges"]
            if len(hist_bin_edges) != len(comb_hist_edges):
                raise ValueError("Hists have different bin edges: {} vs {}"
                                 .format(hist_bin_edges, comb_hist_edges))

            combined_hists[bin_name]["hist"] += hists_data["hist"]
            combined_hists[bin_name]["cut_off"] += hists_data["cut_off"]

    return combined_hists


def add_hists_to_h5file(hists, file):
    """
    Add the binning stats as groups to a h5 file.

    Parameters
    ----------
    hists : dict
        The histst from the BinningStatsMaker module.
    file : str
        Path to the h5 file.

    """
    with h5py.File(file, "a") as f:
        for bin_name, hists_data in hists.items():
            for key, val in hists_data.items():
                f.create_dataset(f"bin_stats/{bin_name}/{key}", data=val)


def read_hists_from_h5file(file):
    """ Read the binning stats from the file, put it in dicts. """
    data = {}
    with h5py.File(file, "r") as f:
        for bin_name, hists_data in f["bin_stats/"].items():
            data[bin_name] = {k: v[()] for k, v in hists_data.items()}
    return data


def plot_hist_of_files(save_as, files=None):
    """
    Plot the binning stats of all h5 files in the current directory.

    Parameters
    ----------
    save_as : str
        Where to save the plot to.
    files : List, optional
        Path of files to use instead.

    """
    if not files:
        files = get_all_h5_files()

    hists_list = []
    print("Plotting stats of {} file(s)".format(len(files)))

    for file in files:
        try:
            hists_list.append(read_hists_from_h5file(file))
        except KeyError:
            warnings.warn(f"ERROR: No bin_stats in {file}. Skipping ...")
            continue
    if len(hists_list) == 0:
        raise ValueError("No files with bin_stats found!")
    print("Plotting...")
    plot_hists(hists_list, save_as)


def get_all_h5_files():
    """ Get a list of all h5 files in the cwd. """
    files = []
    for file in os.listdir(os.getcwd()):
        if file.endswith(".h5"):
            files.append(file)
    return files


def main():
    parser = argparse.ArgumentParser(
        description='Generate a plot with statistics of the binning. '
                    'Can only be used on files generated with the FileBinner when '
                    'add_bin_stats was set to true (default). ')
    parser.add_argument(
        '--save_as', type=str, default="bin_stats_plot.pdf",
        help='Filename of the plot. Default: bin_stats_plot.pdf.')
    parser.add_argument(
        'files', type=str, nargs='*', default=None,
        help='File(s) to plot. Default: Plot for all h5 files in current dir.')

    plot_hist_of_files(**vars(parser.parse_args()))


if __name__ == "__main__":
    main()