diff --git a/orcasong_plag/core.py b/orcasong_plag/core.py index 30ba800e20dfa1474120a667101ce20473d66805..355a438b51c9c1543afb3045bfe3247893631df4 100644 --- a/orcasong_plag/core.py +++ b/orcasong_plag/core.py @@ -2,7 +2,7 @@ import km3pipe as kp import km3modules as km import os -from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter +from orcasong_plag.modules import TimePreproc, ImageMaker, McInfoMaker, BinningPlotter, plot_hists from orcasong_plag.mc_info_types import get_mc_info_extr @@ -59,8 +59,8 @@ class FileBinner: self.mc_info_extr = mc_info_extr self.bin_plot_freq = 20 - self.n_statusbar = 200 - self.n_memory_observer = 400 + self.n_statusbar = 1000 + self.n_memory_observer = 1000 self.do_time_preproc = True # self.data_cuts = None @@ -75,8 +75,8 @@ class FileBinner: Parameters ---------- - infile : str or List - Path to the input file(s). + infile : str + Path to the input file. outfile : str Path to the output file. @@ -84,34 +84,50 @@ class FileBinner: name, shape = self.get_names_and_shape() print("Generating {} images with shape {}".format(name, shape)) - pipe = kp.Pipeline() + plot_hists = self.bin_plot_freq is not None - if self.n_statusbar is not None: - pipe.attach(km.common.StatusBar, every=self.n_statusbar) - if self.n_memory_observer is not None: - pipe.attach(km.common.MemoryObserver, every=400) + pipe = self.build_pipe(infile, outfile, plot_hists=plot_hists) + pipe.drain() - if not isinstance(infile, list): - infile = [infile] + def run_multi(self, infiles, outfolder): + """ + Bin multiple files into their own output files each. - pipe.attach(kp.io.hdf5.HDF5Pump, filenames=infile) + Will also generate a summary binning plot for all of the files. - self.attach_binning_modules(pipe, outfile=outfile) + Parameters + ---------- + infiles : List + The path to infiles as str. + outfolder : str + The output folder to place them in. - pipe.attach(kp.io.HDF5Sink, - filename=outfile, - complib=self.complib, - complevel=self.complevel, - chunksize=self.chunksize, - flush_frequency=self.flush_frequency) + """ + hists = None + for infile in infiles: + outfile_name = os.path.splitext(os.path.basename(infile))[0] + "_hist.h5" + outfile = outfolder + outfile_name + + pipe = self.build_pipe(infile, outfile, + plot_hists=False, hists_start=hists) + smry = pipe.drain() + hists = smry["BinningPlotter"] + plot_hists(hists, pdf_path=outfolder + "binning_summary.pdf") + + def build_pipe(self, infile, outfile, plot_hists=True, hists_start=None): + """ + Build the pipe to generate images and mc_info for a file. + """ - pipe.drain() + pipe = kp.Pipeline() - def attach_binning_modules(self, pipe, outfile): - """ - Attach modules to a km3pipe which transform a blob to images and mc_info. + if self.n_statusbar is not None: + pipe.attach(km.common.StatusBar, every=self.n_statusbar) + if self.n_memory_observer is not None: + pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer) + + pipe.attach(kp.io.hdf5.HDF5Pump, filename=infile) - """ pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits', 'McHits']) if self.do_time_preproc: @@ -122,10 +138,14 @@ class FileBinner: # pipe.attach(EventSkipper, data_cuts=self.data_cuts) if self.bin_plot_freq is not None: - pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf" + if plot_hists: + pdf_name = os.path.splitext(outfile)[0] + "_hists.pdf" + else: + pdf_name = None pipe.attach(BinningPlotter, bin_plot_freq=self.bin_plot_freq, bin_edges_list=self.bin_edges_list, + hists_start=hists_start, pdf_path=pdf_name) pipe.attach(ImageMaker, @@ -144,6 +164,14 @@ class FileBinner: pipe.attach(km.common.Keep, keys=['histogram', 'mc_info']) + pipe.attach(kp.io.HDF5Sink, + filename=outfile, + complib=self.complib, + complevel=self.complevel, + chunksize=self.chunksize, + flush_frequency=self.flush_frequency) + return pipe + def get_names_and_shape(self): """ Get names and shape of the resulting x data, e.g. (pos_z, time), (18, 50). diff --git a/orcasong_plag/modules.py b/orcasong_plag/modules.py index 139bd4289188e46e87051b572ccecc815e24069b..9ddcb6558b6b8cf6d1e4002ec31256187040474b 100644 --- a/orcasong_plag/modules.py +++ b/orcasong_plag/modules.py @@ -137,7 +137,7 @@ class BinningPlotter(kp.Module): bin_edges_list : List List with the names of the fields to bin, and the respective bin edges, including the left- and right-most bin edge. - pdf_path : str + pdf_path : str, optional Where to save the hists to. This pdf will contain all the field names on their own page each. bin_plot_freq : int @@ -150,22 +150,35 @@ class BinningPlotter(kp.Module): plot_bin_edges : bool If true, will plot the bin edges as horizontal lines. Is never used for the time binning (field name "time"). + hists_start : dict, optional + Starting values for the statistics. """ def configure(self): self.bin_edges_list = self.require('bin_edges_list') - self.pdf_path = self.require('pdf_path') + self.pdf_path = self.get('pdf_path', default=None) self.bin_plot_freq = self.get("bin_plot_freq", default=20) self.res_increase = self.get('res_increase', default=5) self.plot_bin_edges = self.get('plot_bin_edges', default=True) + self.hists_start = self.get('hists_start', default=None) - self.hists = {} - for bin_name, bin_edges in self._yield_spaced_bin_edges(): - self.hists[bin_name] = { - "hist": np.zeros(len(bin_edges) - 1), - "out_pos": 0, - "out_neg": 0, - } + if self.hists_start is None: + self.hists = {} + for bin_name, org_bin_edges in self.bin_edges_list: + if bin_name == "time": + bin_edges = org_bin_edges + else: + bin_edges = self._space_bin_edges(org_bin_edges) + + self.hists[bin_name] = { + "hist": np.zeros(len(bin_edges) - 1), + "hist_bin_edges": bin_edges, + "bin_edges": org_bin_edges, + "out_pos": 0, + "out_neg": 0, + } + else: + self.hists = self.hists_start self.i = 0 @@ -179,23 +192,19 @@ class BinningPlotter(kp.Module): return bin_edges - def _yield_spaced_bin_edges(self): - for bin_name, bin_edges in self.bin_edges_list: - if bin_name != "time": - bin_edges = self._space_bin_edges(bin_edges) - yield bin_name, bin_edges - def process(self, blob): """ Extract data from blob for the hist plots. """ if self.i % self.bin_plot_freq == 0: - for bin_name, bin_edges in self._yield_spaced_bin_edges(): + for bin_name, hists_data in self.hists.items(): + hist_bin_edges = hists_data["hist_bin_edges"] + data = blob["Hits"][bin_name] - hist = np.histogram(data, bins=bin_edges)[0] + hist = np.histogram(data, bins=hist_bin_edges)[0] - out_pos = data[data > np.max(bin_edges)].size - out_neg = data[data < np.min(bin_edges)].size + out_pos = data[data > np.max(hist_bin_edges)].size + out_neg = data[data < np.min(hist_bin_edges)].size self.hists[bin_name]["hist"] += hist self.hists[bin_name]["out_pos"] += out_pos @@ -208,44 +217,54 @@ class BinningPlotter(kp.Module): """ Make and save the histograms to pdf. """ - with PdfPages(self.pdf_path) as pdf_file: - for bin_name, org_bin_edges in self.bin_edges_list: - hist = self.hists[bin_name]["hist"] - out_pos = self.hists[bin_name]["out_pos"] - out_neg = self.hists[bin_name]["out_neg"] - hist_frac = hist / (np.sum(hist) + out_pos + out_neg) + if self.pdf_path is not None: + plot_hists(self.hists, self.pdf_path, + plot_bin_edges=self.plot_bin_edges) - if bin_name != "time": - bin_edges = self._space_bin_edges(org_bin_edges) - else: - bin_edges = org_bin_edges + return self.hists - bin_spacing = bin_edges[1] - bin_edges[0] - fig, ax = plt.subplots() - plt.bar(bin_edges[:-1], - hist_frac, - align="edge", - width=0.9*bin_spacing, - ) - ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1)) - - if self.plot_bin_edges and bin_name != "time": - for bin_edge in org_bin_edges: - plt.axvline(x=bin_edge, color='grey', linestyle='-', - linewidth=1, alpha=0.9) - - # place a text box in upper left in axes coords - out_pos_rel = out_pos / np.sum(hist) - out_neg_rel = out_neg / np.sum(hist) - textstr = "Hits cut off:\n Left: {:.1%}\n" \ - " Right: {:.1%}".format(out_neg_rel, out_pos_rel) - props = dict(boxstyle='round', facecolor='white', alpha=0.9) - ax.text(0.05, 0.95, textstr, transform=ax.transAxes, - verticalalignment='top', bbox=props) - - plt.xlabel(bin_name) - plt.ylabel("Fraction of hits") - - pdf_file.savefig(fig) - print(bin_name, out_neg, out_pos) - print("Saved binning plot to " + self.pdf_path) + +def plot_hists(hists, pdf_path, plot_bin_edges=True): + """ + Plot histograms made by the BinningPlotter to the given pdf path. + + """ + with PdfPages(pdf_path) as pdf_file: + for bin_name, hists_data in hists.items(): + hist_bin_edges = hists_data["hist_bin_edges"] + bin_edges = hists_data["bin_edges"] + hist = hists_data["hist"] + out_pos = hists_data["out_pos"] + out_neg = hists_data["out_neg"] + + hist_frac = hist / (np.sum(hist) + out_pos + out_neg) + + bin_spacing = hist_bin_edges[1] - hist_bin_edges[0] + fig, ax = plt.subplots() + plt.bar(hist_bin_edges[:-1], + hist_frac, + align="edge", + width=0.9 * bin_spacing, + ) + ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1)) + + if plot_bin_edges and bin_name != "time": + for bin_edge in bin_edges: + plt.axvline(x=bin_edge, color='grey', linestyle='-', + linewidth=1, alpha=0.9) + + # place a text box in upper left in axes coords + out_pos_rel = out_pos / np.sum(hist) + out_neg_rel = out_neg / np.sum(hist) + textstr = "Hits cut off:\n Left: {:.1%}\n" \ + " Right: {:.1%}".format(out_neg_rel, out_pos_rel) + props = dict(boxstyle='round', facecolor='white', alpha=0.9) + ax.text(0.05, 0.95, textstr, transform=ax.transAxes, + verticalalignment='top', bbox=props) + + plt.xlabel(bin_name) + plt.ylabel("Fraction of hits") + + pdf_file.savefig(fig) + + print("Saved binning plot to " + pdf_path)