1) Added a prod_ident parameter, which is useful if you use multiple mc productions in one dataset.

E.g. if you use the 1-5GeV & the 3-100GeV ORCA 2016 MC, it is not possible to distinguish the >3GeV events from the lowE prod from the events of the highE prod. 2) Now, always a fixed random seed is used. This is needed, if you use multiple input datasets and you want to throw away a fixed portion of events.

1) Added a prod_ident parameter, which is useful if you use multiple mc productions in one dataset.
1638a483 · ViaFerrata · e0b6925a · 1638a483 · 1638a483 · 1638a483
Commit 1638a483 authored 6 years ago by ViaFerrata
--- a/file_to_hits.py
+++ b/file_to_hits.py
@@ -34,7 +34,7 @@ def get_time_residual_nu_interaction_mean_triggered_hits(time_interaction, hits_
    return time_residual_vertex


-def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d):
+def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d, prod_ident):
    """
    Reads a km3pipe blob which contains the information for one event.
    Returns a hit array and a track array that contains all relevant information of the event.
@@ -48,6 +48,7 @@ def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts,
    :param dict data_cuts: specifies if cuts should be applied. Contains the keys 'triggered' and 'energy_lower_limit'.
    :param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
                             In the case of 'channel_id', this information needs to be included in the event_hits as well.
+    :param int prod_ident: optional int that identifies the used production, more documentation in the docs of the main function.
    :return: ndarray(ndim=2) event_hits: 2D array containing the hit information of the event [pos_xyz time (channel_id)].
    :return: ndarray(ndim=1) event_track: 1D array containing important MC information of the event.
                                          [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
@@ -85,9 +86,13 @@ def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts,

    time_residual_vertex = get_time_residual_nu_interaction_mean_triggered_hits(time_interaction, hits_time, triggered)

+
    # save collected information to arrays event_track and event_hits
-    event_track = np.array([event_id, particle_type, energy, is_cc, bjorkeny, dir_x, dir_y, dir_z, time_track,
-                            run_id, vertex_pos_x, vertex_pos_y, vertex_pos_z, time_residual_vertex], dtype=np.float64)
+    track = [event_id, particle_type, energy, is_cc, bjorkeny, dir_x, dir_y, dir_z, time_track, run_id,
+             vertex_pos_x, vertex_pos_y, vertex_pos_z, time_residual_vertex]
+    if prod_ident is not None: track.append(np.full(event_id.shape, prod_ident))
+
+    event_track = np.array(track, dtype=np.float64)

    ax = np.newaxis
    event_hits = np.concatenate([pos_x[:, ax], pos_y[:, ax], pos_z[:, ax], hits_time[:, ax], triggered[:, ax]], axis=1) # dtype: np.float64

--- a/h5_data_to_h5_input.py
+++ b/h5_data_to_h5_input.py
@@ -117,7 +117,7 @@ def calculate_bin_edges(n_bins, geo_fix, fname_geo_limits, do4d):
    return x_bin_edges, y_bin_edges, z_bin_edges


-def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
+def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'), prod_ident = None,
         timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=False, data_cuts=None):
    """
    Main code. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used for a CNN
@@ -129,6 +129,10 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
    :param bool do3d: Declares if 3D histograms should be created.
    :param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
                             Currently, only 'time' and 'channel_id' are available.
+    :param int prod_ident: optional int identifier for the used mc production.
+                           This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC.
+                           In this case, the events are not fully distinguishable with only the run_id and the event_id!
+                           In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable.
    :param (str, str/None) timecut: Tuple that defines what timecut should be used in hits_to_histograms.py.
                                    Currently available:
                                    ('timeslice_relative', None): Cuts out the central 30% of the snapshot.
@@ -141,6 +145,7 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
                           Supports the following cuts: 'triggered', 'energy_lower_limit'
    """
    if data_cuts is None: data_cuts={'triggered': False, 'energy_lower_limit': 0}
+    np.random.seed(42) # set random seed

    filename_input = parse_input(do2d, do2d_pdf)
    filename = os.path.basename(os.path.splitext(filename_input)[0])
@@ -170,7 +175,7 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
            print('Event No. ' + str(i))

        # filter out all hit and track information belonging that to this event
-        event_hits, event_track = get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d)
+        event_hits, event_track = get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d, prod_ident)

        if event_track[2] < data_cuts['energy_lower_limit'] or event_track[2] > data_cuts['energy_upper_limit']:
            # Cutting events with energy < threshold (default=0) and with energy > threshold (default=200)
@@ -182,6 +187,15 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
            if throw_away is True:
                continue

+        #     # TODO temporary, deprecated solution, we always need to throw away the same events if we have multiple inputs -> use fixed seed
+        #     arr = np.load('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_elec-CC.npy')
+        #     arr_list = arr.tolist()
+        #     evt_id = event_track[0]
+        #     run_id = event_track[9]
+        #
+        #     if [run_id, evt_id] not in arr_list:
+        #         continue
+
        # event_track: [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
        mc_infos.append(event_track)

@@ -227,14 +241,25 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4

 if __name__ == '__main__':
    # 3-100GeV
-    main(n_bins=(11,13,18,300), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
-         timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
-         data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})
+    # main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
+    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
+    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})

-    # 1-5GeV
-    # main(n_bins=(11,13,18,300), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
-    #      timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
-    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.25})
+    # 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
+    # main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
+    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
+    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})
+
+    # xyz-c
+    # 3-100GeV
+    # main(n_bins=(11,13,18,31), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
+    #      timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
+    #      data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})
+
+    # 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
+    main(n_bins=(11,13,18,31), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
+         timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
+         data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})

    # main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
    #      timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,

--- a/utilities/low_e_prod_get_surviving_events.py
+++ b/utilities/low_e_prod_get_surviving_events.py
+import h5py
+import numpy as np
+
+path = '/home/woody/capn/mppi033h/Data/ORCA_JTE_NEMOWATER/ip_images_1-100GeV/4dTo4d/time_-250+500_w_gf_60b'
+# JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_98_xyzt.h5
+ptypes = {'muon-CC': 'JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_',
+          'elec-CC': 'JTE_KM3Sim_gseagen_elec-CC_1-5GeV-2_7E5-1bin-1_0gspec_ORCA115_9m_2016_'}
+
+event_id, run_id = None, None
+for ptype in ptypes.keys():
+    for i in range(601):
+        if i % 100 == 0:
+            print(i)
+        if i == 0: continue
+
+        f = h5py.File(path + '/' + ptypes[ptype] + str(i) + '_xyzt.h5', 'r')
+        event_id_f = f['y'][:, 0]
+        run_id_f = f['y'][:, 9]
+
+        if event_id is None:
+            event_id = event_id_f
+            run_id = run_id_f
+        else:
+            event_id = np.concatenate([event_id, event_id_f], axis=0)
+            run_id = np.concatenate([run_id, run_id_f], axis=0)
+
+        f.close()
+
+    ax = np.newaxis
+    arr = np.concatenate([run_id[:, ax], event_id[:, ax]], axis=1)
+    np.save('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_' + ptype + '.npy', arr)
+    event_id, run_id = None, None
+
+
+
+