Skip to content
Snippets Groups Projects
Commit 1638a483 authored by ViaFerrata's avatar ViaFerrata
Browse files

1) Added a prod_ident parameter, which is useful if you use multiple mc productions in one dataset.

E.g. if you use the 1-5GeV & the 3-100GeV ORCA 2016 MC, it is not possible to distinguish the >3GeV events from the lowE prod from the events of the highE prod.

2) Now, always a fixed random seed is used. This is needed, if you use multiple input datasets and you want to throw away a fixed portion of events.
parent e0b6925a
No related branches found
No related tags found
No related merge requests found
......@@ -34,7 +34,7 @@ def get_time_residual_nu_interaction_mean_triggered_hits(time_interaction, hits_
return time_residual_vertex
def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d):
def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d, prod_ident):
"""
Reads a km3pipe blob which contains the information for one event.
Returns a hit array and a track array that contains all relevant information of the event.
......@@ -48,6 +48,7 @@ def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts,
:param dict data_cuts: specifies if cuts should be applied. Contains the keys 'triggered' and 'energy_lower_limit'.
:param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
In the case of 'channel_id', this information needs to be included in the event_hits as well.
:param int prod_ident: optional int that identifies the used production, more documentation in the docs of the main function.
:return: ndarray(ndim=2) event_hits: 2D array containing the hit information of the event [pos_xyz time (channel_id)].
:return: ndarray(ndim=1) event_track: 1D array containing important MC information of the event.
[event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
......@@ -85,9 +86,13 @@ def get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts,
time_residual_vertex = get_time_residual_nu_interaction_mean_triggered_hits(time_interaction, hits_time, triggered)
# save collected information to arrays event_track and event_hits
event_track = np.array([event_id, particle_type, energy, is_cc, bjorkeny, dir_x, dir_y, dir_z, time_track,
run_id, vertex_pos_x, vertex_pos_y, vertex_pos_z, time_residual_vertex], dtype=np.float64)
track = [event_id, particle_type, energy, is_cc, bjorkeny, dir_x, dir_y, dir_z, time_track, run_id,
vertex_pos_x, vertex_pos_y, vertex_pos_z, time_residual_vertex]
if prod_ident is not None: track.append(np.full(event_id.shape, prod_ident))
event_track = np.array(track, dtype=np.float64)
ax = np.newaxis
event_hits = np.concatenate([pos_x[:, ax], pos_y[:, ax], pos_z[:, ax], hits_time[:, ax], triggered[:, ax]], axis=1) # dtype: np.float64
......
......@@ -117,7 +117,7 @@ def calculate_bin_edges(n_bins, geo_fix, fname_geo_limits, do4d):
return x_bin_edges, y_bin_edges, z_bin_edges
def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'), prod_ident = None,
timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=False, data_cuts=None):
"""
Main code. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used for a CNN
......@@ -129,6 +129,10 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
:param bool do3d: Declares if 3D histograms should be created.
:param (bool, str) do4d: Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Currently, only 'time' and 'channel_id' are available.
:param int prod_ident: optional int identifier for the used mc production.
This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC.
In this case, the events are not fully distinguishable with only the run_id and the event_id!
In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable.
:param (str, str/None) timecut: Tuple that defines what timecut should be used in hits_to_histograms.py.
Currently available:
('timeslice_relative', None): Cuts out the central 30% of the snapshot.
......@@ -141,6 +145,7 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
Supports the following cuts: 'triggered', 'energy_lower_limit'
"""
if data_cuts is None: data_cuts={'triggered': False, 'energy_lower_limit': 0}
np.random.seed(42) # set random seed
filename_input = parse_input(do2d, do2d_pdf)
filename = os.path.basename(os.path.splitext(filename_input)[0])
......@@ -170,7 +175,7 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
print('Event No. ' + str(i))
# filter out all hit and track information belonging that to this event
event_hits, event_track = get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d)
event_hits, event_track = get_event_data(event_blob, geo, do_mc_hits, use_calibrated_file, data_cuts, do4d, prod_ident)
if event_track[2] < data_cuts['energy_lower_limit'] or event_track[2] > data_cuts['energy_upper_limit']:
# Cutting events with energy < threshold (default=0) and with energy > threshold (default=200)
......@@ -182,6 +187,15 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
if throw_away is True:
continue
# # TODO temporary, deprecated solution, we always need to throw away the same events if we have multiple inputs -> use fixed seed
# arr = np.load('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_elec-CC.npy')
# arr_list = arr.tolist()
# evt_id = event_track[0]
# run_id = event_track[9]
#
# if [run_id, evt_id] not in arr_list:
# continue
# event_track: [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
mc_infos.append(event_track)
......@@ -227,14 +241,25 @@ def main(n_bins, geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4
if __name__ == '__main__':
# 3-100GeV
main(n_bins=(11,13,18,300), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})
# main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
# timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
# data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})
# 1-5GeV
# main(n_bins=(11,13,18,300), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
# timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
# data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.25})
# 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
# main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'time'),
# timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
# data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})
# xyz-c
# 3-100GeV
# main(n_bins=(11,13,18,31), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
# timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
# data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 200, 'throw_away_prob': 0})
# 1-5GeV , info throw away: elec-CC 0.25, muon-CC 0.25, elec-NC: 0.00
main(n_bins=(11,13,18,31), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
timecut = ('trigger_cluster', 'tight_1'), do_mc_hits=False, use_calibrated_file=True,
data_cuts = {'triggered': False, 'energy_lower_limit': 0, 'energy_upper_limit': 3, 'throw_away_prob': 0.00})
# main(n_bins=(11,13,18,60), geo_fix=True, do2d=False, do2d_pdf=(False, 10), do3d=False, do4d=(True, 'channel_id'),
# timecut = ('trigger_cluster', 'all'), do_mc_hits=False, use_calibrated_file=True,
......
import h5py
import numpy as np
path = '/home/woody/capn/mppi033h/Data/ORCA_JTE_NEMOWATER/ip_images_1-100GeV/4dTo4d/time_-250+500_w_gf_60b'
# JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_98_xyzt.h5
ptypes = {'muon-CC': 'JTE_KM3Sim_gseagen_muon-CC_1-5GeV-9_2E5-1bin-1_0gspec_ORCA115_9m_2016_',
'elec-CC': 'JTE_KM3Sim_gseagen_elec-CC_1-5GeV-2_7E5-1bin-1_0gspec_ORCA115_9m_2016_'}
event_id, run_id = None, None
for ptype in ptypes.keys():
for i in range(601):
if i % 100 == 0:
print(i)
if i == 0: continue
f = h5py.File(path + '/' + ptypes[ptype] + str(i) + '_xyzt.h5', 'r')
event_id_f = f['y'][:, 0]
run_id_f = f['y'][:, 9]
if event_id is None:
event_id = event_id_f
run_id = run_id_f
else:
event_id = np.concatenate([event_id, event_id_f], axis=0)
run_id = np.concatenate([run_id, run_id_f], axis=0)
f.close()
ax = np.newaxis
arr = np.concatenate([run_id[:, ax], event_id[:, ax]], axis=1)
np.save('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_' + ptype + '.npy', arr)
event_id, run_id = None, None
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment