
Resolve "mc_info_extractor for neutrino reco chain"

Merged Daniel Guderian requested to merge 15-mc_info_extractor-for-neutrino-reco-chain into master
Files changed: 5  (+476 −20)
"""
Functions that extract info from a blob for the mc_info / y datafield
in the h5 files. Very much WIP.
in the h5 files.
These are made for the specific given runs. They might not be
applicable to other data, and could cause errors or produce unexpected
@@ -10,35 +10,491 @@ results when used on data other then the specified.
import warnings
import numpy as np
from km3pipe.io.hdf5 import HDF5Header
from h5py import File
__author__ = "Daniel Guderian"
def get_std_reco(blob):
"""
Get info present in real data.
Designed for the 2017 one line runs.
Function to extract std reco info. The implemented strategy is the following:
First, look for whether a rec stag has been reached and only then extract the reconstructed
paramater from it. If not, set it to a dummy value (for now 0). This means that for an analysis the events with
exactly zero have to be filtered out!
The 'best track' is the first (highest lik) while a certain rec stage has to be reached. This might
have to be adjusted for other recos than JMuonGandalf chain.
Members of the Tracks:
dtype([('E', '<f8'), ('JCOPY_Z_M', '<f4'), ('JENERGY_CHI2', '<f4'), ('JENERGY_ENERGY', '<f4'),
('JENERGY_MUON_RANGE_METRES', '<f4'), ('JENERGY_NDF', '<f4'), ('JENERGY_NOISE_LIKELIHOOD', '<f4'),
('JENERGY_NUMBER_OF_HITS', '<f4'), ('JGANDALF_BETA0_RAD', '<f4'), ('JGANDALF_BETA1_RAD', '<f4'),
('JGANDALF_CHI2', '<f4'), ('JGANDALF_LAMBDA', '<f4'), ('JGANDALF_NUMBER_OF_HITS', '<f4'),
('JGANDALF_NUMBER_OF_ITERATIONS', '<f4'), ('JSHOWERFIT_ENERGY', '<f4'), ('JSTART_LENGTH_METRES', '<f4'),
('JSTART_NPE_MIP', '<f4'), ('JSTART_NPE_MIP_TOTAL', '<f4'), ('JVETO_NPE', '<f4'), ('JVETO_NUMBER_OF_HITS', '<f4'),
('dir_x', '<f8'), ('dir_y', '<f8'), ('dir_z', '<f8'), ('id', '<i4'), ('idx', '<i8'), ('length', '<f8'),
('likelihood', '<f8'), ('pos_x', '<f8'), ('pos_y', '<f8'), ('pos_z', '<f8'), ('rec_type', '<i4'),
('t', '<f8'), ('group_id', '<i8')])
Members of the RecStages:
.idx (corresponding to the track id),
.rec_stage (rec stage identifier, for JMuonGandalf for example: 1=prefit, 2=simplex, 3=gandalf,
4=energy, 5=start),
.group_id (event id in file)
Parameters
----------
blob : dict
The blob from the pipeline, containing the reco info.
Returns
-------
std_reco_info : dict
Dict with the most common std reco params. Can be expanded.
"""
# use this later to identify not reconstructed events
dummy_value = 0
# if there was no std reco at all, this will not exist
# these are events that stopped at/before prefit
try:
rec_stages = blob["RecStages"]
# get the rec stages of the first track only
rec_stages_best_track = rec_stages.rec_stage[rec_stages.idx == 0]
# often enough: best track is the first
best_track = blob["Tracks"][0]
except KeyError:
rec_stages_best_track = []
print(
"An event didnt have any reco. Setting everything to"
+ str(dummy_value)
+ "."
)
# take the direction only if JGandalf was executed
if 3 in rec_stages_best_track:
std_dir_x = best_track["dir_x"]
std_dir_y = best_track["dir_y"]
std_dir_z = best_track["dir_z"]
std_beta0 = best_track["JGANDALF_BETA0_RAD"]
std_lik = best_track["likelihood"]
std_n_hits_gandalf = best_track["JGANDALF_NUMBER_OF_HITS"]
else:
std_dir_x = dummy_value
std_dir_y = dummy_value
std_dir_z = dummy_value
std_beta0 = dummy_value
std_lik = dummy_value
std_n_hits_gandalf = dummy_value
# energy fit from JEnergy
if 4 in rec_stages_best_track:
std_energy = best_track["E"]
lik_energy = best_track["JENERGY_CHI2"]
else:
std_energy = dummy_value
lik_energy = dummy_value
# vertex and length from JStart
if 5 in rec_stages_best_track:
std_pos_x = best_track["pos_x"]
std_pos_y = best_track["pos_y"]
std_pos_z = best_track["pos_z"]
std_length = best_track["JSTART_LENGTH_METRES"]
else:
std_pos_x = dummy_value
std_pos_y = dummy_value
std_pos_z = dummy_value
std_length = dummy_value
std_reco_info = {
"std_dir_x": std_dir_x,
"std_dir_y": std_dir_y,
"std_dir_z": std_dir_z,
"std_beta0": std_beta0,
"std_lik": std_lik,
"std_n_hits_gandalf": std_n_hits_gandalf,
"std_pos_x": std_pos_x,
"std_pos_y": std_pos_y,
"std_pos_z": std_pos_z,
"std_energy": std_energy,
"std_lik_energy": lik_energy,
"std_length": std_length,
}
return std_reco_info
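# A rough usage sketch for get_std_reco (hypothetical, not part of the module API;
# it only illustrates the dummy-value behaviour described in the docstring):
#
#     std_reco_info = get_std_reco(blob)
#     # if e.g. only prefit/simplex were reached, all entries are the dummy value:
#     # {"std_dir_x": 0, "std_dir_y": 0, ..., "std_length": 0}
#     # -> events with exactly zero have to be filtered out in an analysis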
def get_real_data_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for real data. There is no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this real data case it is only general event info, like the id.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_info = blob["EventInfo"][0]
# add n_hits info for the cut
n_hits = len(blob["Hits"])
track = {
"event_id": event_info.event_id,
"run_id": event_info.run_id,
"trigger_mask": event_info.trigger_mask,
"n_hits": n_hits,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob)
track.update(std_reco_info)
return track
return mc_info_extr
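# Hedged example of the factory pattern used here and below (the file name is made up):
#
#     mc_info_extr = get_real_data_info_extr("some_run.h5")
#     y = mc_info_extr(blob)  # {"event_id": ..., "run_id": ..., "trigger_mask": ..., "n_hits": ...}
#
# The wrapper opens the file once to check for std reco, so the per-blob call stays cheap.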
def get_random_noise_mc_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for random noise simulations. There is no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this random noise case it is only general event info, like the id.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_info = blob["EventInfo"]
track = {
"event_id": event_info.event_id[0],
"run_id": event_info.run_id[0],
"particle_type": 0,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob)
track.update(std_reco_info)
return track
return mc_info_extr
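# Usage is analogous to the real-data extractor above; the returned dicts just carry
# "particle_type": 0 so that noise events can be identified after concatenation.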
def get_neutrino_mc_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for neutrino simulations. The n_gen parameter, needed for neutrino weighting,
is extracted from the header of the file.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
# get the n_gen
header = HDF5Header.from_hdf5(input_file)
n_gen = header.genvol.numberOfEvents
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this neutrino case it is the full mc info for the primary neutrino; there are several
entries in "McTracks": check the simulation to find out which index "p" the neutrino has.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
# get general info about the event
event_id = blob["EventInfo"].event_id[0]
run_id = blob["EventInfo"].run_id[0]
# weights for neutrino analysis
weight_w1 = blob["EventInfo"].weight_w1[0]
weight_w2 = blob["EventInfo"].weight_w2[0]
weight_w3 = blob["EventInfo"].weight_w3[0]
# first, look for the particular neutrino index of the production
p = 0 # for ORCA4 (and probably subsequent productions)
mc_track = blob["McTracks"][p]
# some track mc truth info
particle_type = mc_track.type
energy = mc_track.energy
is_cc = mc_track.cc
bjorkeny = mc_track.by
dir_x, dir_y, dir_z = mc_track.dir_x, mc_track.dir_y, mc_track.dir_z
time_interaction = (
mc_track.time
) # actually always 0 for primary neutrino, measured in MC time
vertex_pos_x, vertex_pos_y, vertex_pos_z = (
mc_track.pos_x,
mc_track.pos_y,
mc_track.pos_z,
)
# add also the nhits info
n_hits = len(blob["Hits"])
track = {
"event_id": event_id,
"particle_type": particle_type,
"energy": energy,
"is_cc": is_cc,
"bjorkeny": bjorkeny,
"dir_x": dir_x,
"dir_y": dir_y,
"dir_z": dir_z,
"time_interaction": time_interaction,
"run_id": run_id,
"vertex_pos_x": vertex_pos_x,
"vertex_pos_y": vertex_pos_y,
"vertex_pos_z": vertex_pos_z,
"n_hits": n_hits,
"weight_w1": weight_w1,
"weight_w2": weight_w2,
"weight_w3": weight_w3,
"n_gen": n_gen,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob)
track.update(std_reco_info)
return track
return mc_info_extr
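# Sketch of how the neutrino extractor could be used (the file name is made up and the
# weighting is only indicative; the exact formula depends on the analysis):
#
#     mc_info_extr = get_neutrino_mc_info_extr("neutrino_sim.h5")
#     y = mc_info_extr(blob)
#     # a per-event weight then combines the stored fields, e.g. something like
#     # y["weight_w2"] * flux(energy, dir_z) / y["n_gen"]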
def get_muon_mc_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for atm. muon simulations. There is no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
# no n_gen here, but needed for concatenation
n_gen = 1
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this atm. muon case it is the full mc info for the primary; there are several entries
in "McTracks": check the simulation to understand which "p" you want. Muons come in bundles
that share the same direction.
For the energy: sum of all muons in a bundle;
for the vertex: energy-weighted mean of the individual vertices.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_id = blob["EventInfo"].event_id[0]
run_id = blob["EventInfo"].run_id[0]
p = 0 # for ORCA4 (and probably subsequent productions)
mc_track = blob["McTracks"][p]
particle_type = (
mc_track.type
) # assumed that this is the same for all muons in a bundle
is_cc = mc_track.cc # always 0 actually
bjorkeny = mc_track.by
time_interaction = mc_track.time # same for all muons in a bundle
# sum up the energy of all muons
energy = np.sum(blob["McTracks"].energy)
# all muons in a bundle are parallel, so just take dir of first muon
dir_x, dir_y, dir_z = mc_track.dir_x, mc_track.dir_y, mc_track.dir_z
# vertex is the weighted (energy) mean of the individual vertices
vertex_pos_x = np.average(
blob["McTracks"][p:].pos_x, weights=blob["McTracks"][p:].energy
)
vertex_pos_y = np.average(
blob["McTracks"][p:].pos_y, weights=blob["McTracks"][p:].energy
)
vertex_pos_z = np.average(
blob["McTracks"][p:].pos_z, weights=blob["McTracks"][p:].energy
)
# add also the nhits info
n_hits = len(blob["Hits"])
# this is only relevant for neutrinos, though dummy info is needed for the concatenation
weight_w1 = 10
weight_w2 = 10
weight_w3 = 10
track = {
"event_id": event_id,
"particle_type": particle_type,
"energy": energy,
"is_cc": is_cc,
"bjorkeny": bjorkeny,
"dir_x": dir_x,
"dir_y": dir_y,
"dir_z": dir_z,
"time_interaction": time_interaction,
"run_id": run_id,
"vertex_pos_x": vertex_pos_x,
"vertex_pos_y": vertex_pos_y,
"vertex_pos_z": vertex_pos_z,
"n_hits": n_hits,
"weight_w1": weight_w1,
"weight_w2": weight_w2,
"weight_w3": weight_w3,
"n_gen": n_gen,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob)
track.update(std_reco_info)
return track
return mc_info_extr
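# Tiny numerical illustration of the bundle-vertex logic above (made-up numbers):
# two muons with pos_x = [10., 30.] and energies = [100., 300.] give
# np.average([10., 30.], weights=[100., 300.]) == 25.0,
# i.e. the vertex is pulled towards the more energetic muon.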