data_to_images.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Main OrcaSong code which takes raw simulated .h5 files as input in order to generate 2D/3D/4D histograms ('images') that can be used for CNNs."""

import os
import sys
import warnings
#from memory_profiler import profile # for memory profiling, call with @profile; myfunc()
#import line_profiler # call with kernprof -l -v file.py args
import km3pipe as kp
import matplotlib as mpl
mpl.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages

from orcasong.file_to_hits import *
from orcasong.histograms_to_files import *
from orcasong.hits_to_histograms import *

__author__ = 'Michael Moser'
__license__ = 'AGPL'
__version__ = '1.0'
__email__ = 'michael.m.moser@fau.de'
__status__ = 'Prototype'


def parse_input(do2d, do2d_pdf):
    """
    Handles input exceptions, warnings and helps.

    :param bool do2d: Boolean flag for creation of 2D histograms.
    :param do2d_pdf: Boolean flag for creation of 2D pdf images.
    :type do2d_pdf: tuple(bool, int)
    :rtype: str
    :return: fname: Parsed filename.
    """
    if len(sys.argv) < 2 or str(sys.argv[1]) == "-h":
        print("Usage: python " + str(sys.argv[0]) + " file.h5")
        sys.exit(1)

    if do2d==False and do2d_pdf==True:
        raise ValueError('The 2D pdf images cannot be created if do2d=False. Please try again.')

    if do2d_pdf[0] is True and do2d_pdf[1] > 100:
        warnings.warn('You declared do2d_pdf=(True, int) with int > 100. This will take more than two minutes.'
                      'Do you really want to create pdfs images for so many events?')

    if not os.path.isfile(str(sys.argv[1])):
        raise IOError('The file -' + str(sys.argv[1]) + '- does not exist.')

    fname = str(sys.argv[1])
    return fname


def calculate_bin_edges_test(geo, y_bin_edge, z_bin_edge):
    """
    Tests, if the bins in one direction don't accidentally have more than 'one' OM.
    For the x-direction, an overlapping can not be avoided in an orthogonal space.
    For y and z though, it can!
    For y, every bin should contain the number of lines per y-direction * 18 for 18 doms per line.
    For z, every bin should contain 115 entries, since every z bin contains one storey of the 115 ORCA lines.
    Not 100% accurate, since only the dom positions are used and not the individual pmt positions for a dom.
    """
    geo_y = geo[:, 2]
    geo_z = geo[:, 3]
    hist_y = np.histogram(geo_y, bins=y_bin_edge)
    hist_z = np.histogram(geo_z, bins=z_bin_edge)

    print('----------------------------------------------------------------------------------------------')
    print('Y-axis: Bin content: ' + str(hist_y[0]))
    print('It should be:        ' + str(np.array(
        [4 * 18, 7 * 18, 9 * 18, 10 * 18, 9 * 18, 10 * 18, 10 * 18, 10 * 18, 11 * 18, 10 * 18, 9 * 18, 8 * 18, 8 * 18])))
    print('Y-axis: Bin edges: ' + str(hist_y[1]))
    print('..............................................................................................')
    print('Z-axis: Bin content: ' + str(hist_z[0]))
    print('It should have 115 entries everywhere')
    print('Z-axis: Bin edges: ' + str(hist_z[1]))
    print('----------------------------------------------------------------------------------------------')


def calculate_bin_edges(n_bins, det_geo, fname_geo_limits, do4d):
    """
    Calculates the bin edges for the corresponding detector geometry (1 DOM/bin) based on the number of specified bins.
    Used later on for making the event "images" with the in the np.histogramdd funcs in hits_to_histograms.py.
    The bin edges are necessary in order to get the same bin size for each event regardless of the fact if all bins have a hit or not.

    :param tuple n_bins: contains the desired number of bins for each dimension. [n_bins_x, n_bins_y, n_bins_z]
    :param str det_geo: declares what detector geometry should be used for the binning.
    :param str fname_geo_limits: filepath of the .txt ORCA geometry file.
    :param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
    :return: ndarray(ndim=1) x_bin_edges, y_bin_edges, z_bin_edges: contains the resulting bin edges for each dimension.
    """
    print("Reading detector geometry in order to calculate the detector dimensions from file " + fname_geo_limits)
    geo = np.loadtxt(fname_geo_limits)

    # derive maximum and minimum x,y,z coordinates of the geometry input [[first_OM_id, xmin, ymin, zmin], [last_OM_id, xmax, ymax, zmax]]
    geo_limits = np.nanmin(geo, axis = 0), np.nanmax(geo, axis = 0)
    print('Detector dimensions [[first_OM_id, xmin, ymin, zmin], [last_OM_id, xmax, ymax, zmax]]: ' + str(geo_limits))

    if det_geo == 'Orca_115l_23m_h_9m_v' or det_geo == 'Orca_115l_23m_h_?m_v':
        x_bin_edges = np.linspace(geo_limits[0][1] - 9.95, geo_limits[1][1] + 9.95, num=n_bins[0] + 1) #try to get the lines in the bin center 9.95*2 = average x-separation of two lines
        y_bin_edges = np.linspace(geo_limits[0][2] - 9.75, geo_limits[1][2] + 9.75, num=n_bins[1] + 1) # Delta y = 19.483

        # Fitted offsets: x,y,factor: factor*(x+x_off), # Stefan's modifications:
        offset_x, offset_y, scale = [6.19, 0.064, 1.0128]
        x_bin_edges = (x_bin_edges + offset_x) * scale
        y_bin_edges = (y_bin_edges + offset_y) * scale

        if det_geo == 'Orca_115l_23m_h_?m_v':
            # ORCA denser detector study
            z_bin_edges = np.linspace(37.84 - 7.5, 292.84 + 7.5, num=n_bins[2] + 1)  # 15m vertical, 18 DOMs
            # z_bin_edges = np.linspace(37.84 - 6, 241.84 + 6, num=n_bins[2] + 1)  # 12m vertical, 18 DOMs
            # z_bin_edges = np.linspace(37.84 - 4.5, 190.84 + 4.5, num=n_bins[2] + 1)  # 9m vertical, 18 DOMs
            # z_bin_edges = np.linspace(37.84 - 3, 139.84 + 3, num=n_bins[2] + 1)  # 6m vertical, 18 DOMs
            # z_bin_edges = np.linspace(37.84 - 2.25, 114.34 + 2.25, num=n_bins[2] + 1)  # 4.5m vertical, 18 DOMs

        else:
            n_bins_z = n_bins[2] if do4d[1] != 'xzt-c' else n_bins[1] # n_bins = [xyz,t/c] or n_bins = [xzt,c]
            z_bin_edges = np.linspace(geo_limits[0][3] - 4.665, geo_limits[1][3] + 4.665, num=n_bins_z + 1)  # Delta z = 9.329

        # calculate_bin_edges_test(geo, y_bin_edges, z_bin_edges) # test disabled by default. Activate it, if you change the offsets in x/y/z-bin-edges

    else:
        raise ValueError('The specified detector geometry "' + str(det_geo) + '" is not available.')

    return x_bin_edges, y_bin_edges, z_bin_edges


def main(n_bins, det_geo, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'), prod_ident=None,
         timecut=('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=False, data_cuts=None):
    """
    Main code. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used for a CNN.

    :param tuple(int) n_bins: Declares the number of bins that should be used for each dimension (x,y,z,t).
    :param str det_geo: declares what detector geometry should be used for the binning. E.g. 'Orca_115l_23m_h_9m_v'.
    :param bool do2d: Declares if 2D histograms should be created.
    :param (bool, int) do2d_pdf: Declares if pdf visualizations of the 2D histograms should be created. Cannot be called if do2d=False.
                                 The event loop will be stopped after the integer specified in the second argument.
    :param bool do3d: Declares if 3D histograms should be created.
    :param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
                             Currently, only 'time' and 'channel_id' are available.
    :param int prod_ident: optional int identifier for the used mc production.
                           This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC.
                           In this case, the events are not fully distinguishable with only the run_id and the event_id!
                           In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable.
    :param (str, str/None) timecut: Tuple that defines what timecut should be used in hits_to_histograms.py.
                                    Currently available:
                                    ('timeslice_relative', None): Cuts out the central 30% of the snapshot.
                                    ('trigger_cluster', 'all' / 'tight-1' / 'tight-2'): Cuts based on the mean of the triggered hits.
                                    all: [-350ns, 850ns] -> 20ns / bin (60 bins)
                                    tight-1: [-250ns, 500ns] -> 12.5ns / bin , tight-2: [-150ns, 200ns] -> 5.8ns / bin
    :param bool do_mc_hits: Declares if hits (False, mc_hits + BG) or mc_hits (True) should be processed
    :param bool use_calibrated_file: Declares if the input file is already calibrated (pos_x/y/z, time) or not.
    :param dict data_cuts: Dictionary that contains information about any possible cuts that should be applied.
                           Supports the following cuts: 'triggered', 'energy_lower_limit'
    """
    if data_cuts is None: data_cuts={'triggered': False, 'energy_lower_limit': 0}
    np.random.seed(42) # set random seed

    filename_input = parse_input(do2d, do2d_pdf)
    filename = os.path.basename(os.path.splitext(filename_input)[0])
    filename_output = filename.replace('.','_')
    filename_geo_limits = 'ORCA_Geo_115lines.txt' # used for calculating the dimensions of the ORCA can

    geo = None
    if use_calibrated_file is False:
        filename_geometry = 'orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx'  # used for x/y/z calibration
        if os.path.isfile(filename_geometry) is True:
            geo = kp.Geometry(filename='/home/woody/capn/mppi033h/misc/orca_detectors/fixed/' + filename_geometry)
        else:
            raise IOError('The .detx file does not exist in the default path </home/woody/capn/mppi033h/misc/orca_detectors/fixed/>! '
                          'Change the path or add the .detx file to the default path.')

    x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(n_bins, det_geo, filename_geo_limits, do4d)

    all_4d_to_2d_hists, all_4d_to_3d_hists, all_4d_to_4d_hists, mc_infos = [], [], [], []

    pdf_2d_plots = PdfPages('Results/4dTo2d/' + filename_output + '_plots.pdf') if do2d_pdf[0] is True else None

    # Initialize HDF5Pump of the input file
    event_pump = kp.io.hdf5.HDF5Pump(filename=filename_input)
    print('Generating histograms from the hits in XYZT format for files based on ' + filename_input)
    for i, event_blob in enumerate(event_pump):
        if i % 10 == 0:
            print('Event No. ' + str(i))

        # filter out all hit and track information belonging that to this event
        event_hits, event_track = get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d, prod_ident)

        if event_track[2] < data_cuts['energy_lower_limit'] or event_track[2] > data_cuts['energy_upper_limit']:
            # Cutting events with energy < threshold (default=0) and with energy > threshold (default=200)
            continue

        if data_cuts['throw_away_prob'] > 0:
            throw_away_prob = data_cuts['throw_away_prob']
            throw_away = np.random.choice([False, True], p=[1-throw_away_prob, throw_away_prob])
            if throw_away is True:
                continue

        #     # TODO temporary, deprecated solution, we always need to throw away the same events if we have multiple inputs -> use fixed seed
        #     arr = np.load('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_elec-CC.npy')
        #     arr_list = arr.tolist()
        #     evt_id = event_track[0]
        #     run_id = event_track[9]
        #
        #     if [run_id, evt_id] not in arr_list:
        #         continue

        # event_track: [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
        mc_infos.append(event_track)

        if do2d:
            compute_4d_to_2d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_2d_hists, timecut, event_track, do2d_pdf[0], pdf_2d_plots)

        if do3d:
            compute_4d_to_3d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_3d_hists, timecut)

        if do4d[0]:
            compute_4d_to_4d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_4d_hists, timecut, do4d)

        if do2d_pdf[0] is True and i >= do2d_pdf[1]:
            pdf_2d_plots.close()
            break

    if do2d:
        store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xy/' + filename_output + '_xy.h5')
        store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xz/' + filename_output + '_xz.h5')
        store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yz/' + filename_output + '_yz.h5')
        store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xt/' + filename_output + '_xt.h5')
        store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yt/' + filename_output + '_yt.h5')
        store_histograms_as_hdf5(np.stack([hist_tuple[5] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/zt/' + filename_output + '_zt.h5')

    if do3d:
        store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyz/' + filename_output + '_xyz.h5', compression=('gzip', 1))
        store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyt/' + filename_output + '_xyt.h5', compression=('gzip', 1))
        store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xzt/' + filename_output + '_xzt.h5', compression=('gzip', 1))
        store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/yzt/' + filename_output + '_yzt.h5', compression=('gzip', 1))
        store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/rzt/' + filename_output + '_rzt.h5', compression=('gzip', 1))

    if do4d[0]:
        folder = ''
        if not os.path.exists('Results/4dTo4d/h5/xyzt/' + folder):
            os.makedirs('Results/4dTo4d/h5/xyzt/' + folder)
        if folder != '': folder += '/'

        if do4d[1] == 'channel_id':
            store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzc/' + folder + filename_output + '_xyzc.h5', compression=('gzip', 1))
        else:
            store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzt/' + folder + filename_output + '_xyzt.h5', compression=('gzip', 1))


if __name__ == '__main__':
    # 3-100GeV
    # main(n_bins=(11,13,18,60), det_geo='Orca_115l_23m_h_9m_v', do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})

    # 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
    # main(n_bins=(11,13,18,60), det_geo='Orca_115l_23m_h_9m_v', do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})

    # xyz-c
    # 3-100GeV
    # main(n_bins=(11,13,18,31), det_geo='Orca_115l_23m_h_9m_v', do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})

    # 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
    main(n_bins=(11,13,18,31), det_geo='Orca_115l_23m_h_9m_v', do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
         timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
         data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})

    # main(n_bins=(11,13,18,60), det_geo='Orca_115l_23m_h_9m_v', do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
    #      timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})