Newer
Older
Daniel Guderian
committed
"""
Functions that extract info from a blob for the mc_info / y datafield
in the h5 files.
These are made for the specific given runs. They might not be
applicable to other data, and could cause errors or produce unexpected
results when used on data other then the specified. Check for example the
primary position in the mc_tracks.
Daniel Guderian
committed
"""
import warnings
import numpy as np
from km3pipe.io.hdf5 import HDF5Header
from h5py import File
__author__ = "Daniel Guderian"
Daniel Guderian
committed
def get_std_reco(blob,rec_types,rec_parameters_names):
Daniel Guderian
committed
"""
Function to extract std reco info. This implementation requires h5 files
to be processed with the option "--best_tracks" which adds the selection
of best tracks for each reco type to the output using the km3io tools.
Daniel Guderian
committed
Returns
-------
std_reco_info : dict
Dict with the std reco info of the best tracks.
Daniel Guderian
committed
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
#this dict will be filled up
std_reco_info = {}
#all known reco types to iterate over
reco_type_dict = {
"BestJmuon" : ("jmuon_","best_jmuon"),
"BestJshower" : ("jshower_","best_jshower"),
"BestDusjshower" : ("dusjshower_","best_dusjshower"),
"BestAashower" : ("aashower_","best_aashower"),
}
for name_in_blob,(identifier,best_track_name) in reco_type_dict.items():
#always write out something for the generally present rec types
if best_track_name in rec_types:
#specific names are with the prefix from the rec type
specific_reco_names = np.core.defchararray.add(identifier,rec_parameters_names)
#extract actually present info
if name_in_blob in blob:
#get the previously identified best track
bt = blob[name_in_blob]
#get all its values
values = bt.item()
values_list = list(values)
#reco_names = bt.dtype.names #in case the fitinf and stuff will be tailored to the reco types
#at some point, get the names directly like this
#in case there is no reco for this event but the reco type was done in general
else:
#fill all values with nan's
values_array = np.empty(len(specific_reco_names))
values_array[:] = np.nan
values_list = values_array.tolist()
#create a dict out of them
keys_list = list(specific_reco_names)
zip_iterator = zip(keys_list, values_list)
reco_dict = dict(zip_iterator)
#add this dict to the complete std reco collection
std_reco_info.update(reco_dict)
return std_reco_info
def get_rec_types_in_file(file):
"""
Checks and returns which rec types are in the file and thus need to be present
in all best track and their fitinf information later.
"""
#the known rec types
rec_type_names = ["best_jmuon","best_jshower","best_dusjshower","best_aashower"]
#all reco related objects in the file
reco_objects_in_file = file["reco"].keys()
#check which ones are in there
rec_types_in_file = []
for rec_type in rec_type_names:
if rec_type in reco_objects_in_file:
rec_types_in_file.append(rec_type)
#also get from here the list of dtype names that is share for all recos
rec_parameters_names = file["reco"][rec_type].dtype.names
return rec_types_in_file,rec_parameters_names
Daniel Guderian
committed
def get_real_data_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for real data. There are no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
#also check, which rec types are present
rec_types,rec_parameters_names = get_rec_types_in_file(f)
Daniel Guderian
committed
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this real data case it is only general event info, like the id.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_info = blob["EventInfo"][0]
# add n_hits info for the cut
n_hits = len(blob["Hits"])
track = {
"event_id": event_info.event_id,
"run_id": event_info.run_id,
"trigger_mask": event_info.trigger_mask,
"n_hits": n_hits,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob,rec_types,rec_parameters_names)
Daniel Guderian
committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
track.update(std_reco_info)
return track
return mc_info_extr
def get_random_noise_mc_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for random noise simulations. There are no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
#also check, which rec types are present
rec_types,rec_parameters_names = get_rec_types_in_file(f)
Daniel Guderian
committed
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this random noise case it is only general event info, like the id.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_info = blob["EventInfo"]
track = {
"event_id": event_info.event_id[0],
"run_id": event_info.run_id[0],
"particle_type": 0,
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob,rec_types,rec_parameters_names)
Daniel Guderian
committed
track.update(std_reco_info)
return track
return mc_info_extr
Daniel Guderian
committed
def get_neutrino_mc_info_extr(input_file):
"""
Wrapper function that includes the actual mc_info_extr
for neutrino simulations. The n_gen parameter, needed for neutrino weighting
is extracted from the header of the file.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
rec_types,rec_parameters_names = get_rec_types_in_file(f)
Daniel Guderian
committed
# get the n_gen
header = HDF5Header.from_hdf5(input_file)
n_gen = header.genvol.numberOfEvents
Daniel Guderian
committed
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this neutrino case it is the full mc info for the primary neutrino; there are the several "McTracks":
check the simulation which index "p" the neutrino has.
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
# get general info about the event
event_id = blob["EventInfo"].event_id[0]
run_id = blob["EventInfo"].run_id[0]
# weights for neutrino analysis
weight_w1 = blob["EventInfo"].weight_w1[0]
weight_w2 = blob["EventInfo"].weight_w2[0]
weight_w3 = blob["EventInfo"].weight_w3[0]
# first, look for the particular neutrino index of the production
p = 0 # for ORCA4 (and probably subsequent productions)
mc_track = blob["McTracks"][p]
# some track mc truth info
particle_type = mc_track.pdgid #sometimes type, sometimes pdgid
Daniel Guderian
committed
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
energy = mc_track.energy
is_cc = mc_track.cc
bjorkeny = mc_track.by
dir_x, dir_y, dir_z = mc_track.dir_x, mc_track.dir_y, mc_track.dir_z
time_interaction = (
mc_track.time
) # actually always 0 for primary neutrino, measured in MC time
vertex_pos_x, vertex_pos_y, vertex_pos_z = (
mc_track.pos_x,
mc_track.pos_y,
mc_track.pos_z,
)
# add also the nhits info
n_hits = len(blob["Hits"])
track = {
"event_id": event_id,
"particle_type": particle_type,
"energy": energy,
"is_cc": is_cc,
"bjorkeny": bjorkeny,
"dir_x": dir_x,
"dir_y": dir_y,
"dir_z": dir_z,
"time_interaction": time_interaction,
"run_id": run_id,
"vertex_pos_x": vertex_pos_x,
"vertex_pos_y": vertex_pos_y,
"vertex_pos_z": vertex_pos_z,
"n_hits": n_hits,
"weight_w1": weight_w1,
"weight_w2": weight_w2,
"weight_w3": weight_w3,
"n_gen": n_gen,
}
Daniel Guderian
committed
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob,rec_types,rec_parameters_names)
Daniel Guderian
committed
track.update(std_reco_info)
return track
return mc_info_extr
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#function used by Stefan to identify which muons leave how many mc hits in the (active) detector.
def get_mchits_per_muon(blob, inactive_du=None):
"""
For each muon in McTracks, get the number of McHits.
Parameters
----------
blob
The blob.
inactive_du : int, optional
McHits in this DU will not be counted.
Returns
-------
np.array
n_mchits, len = number of muons
"""
ids = blob["McTracks"]["id"]
# Origin of each mchit (as int) in the active line
origin = blob["McHits"]["origin"]
if inactive_du:
# only hits in active line
origin = origin[blob["McHits"]["du"] != inactive_du]
# get how many mchits were produced per muon in the bundle
origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
return np.array([origin_dict.get(i, 0) for i in ids])
def get_muon_mc_info_extr(input_file,prod_identifier=2,inactive_du=None):
Daniel Guderian
committed
"""
Wrapper function that includes the actual mc_info_extr
for atm. muon simulations. There are no n_gen like in the neutrino case.
Parameters
----------
input_file : km3net data file
Can be online or offline format.
prod_identifier : int
Solotion for now: just give a 1 for km3sim and a 2 for JSerine production.
They have different simulated run times, shich are needed for the correct scaling.
Daniel Guderian
committed
Returns
-------
mc_info_extr : function
The actual mc_info_extr function that holds the extractions.
"""
# check if std reco is present
f = File(input_file, "r")
has_std_reco = "reco" in f.keys()
#also check, which rec types are present
rec_types,rec_parameters_names = get_rec_types_in_file(f)
Daniel Guderian
committed
# no n_gen here, but needed for concatenation
n_gen = 1
Daniel Guderian
committed
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
def mc_info_extr(blob):
"""
Processes a blob and creates the y with mc_info and, if existing, std reco.
For this atm. muon case it is the full mc info for the primary; there are the several "McTracks":
check the simulation to understand what "p" you want. Muons come in bundles that have the same direction.
For energy: sum of all muons in a bundle,
for vertex: weighted (energy) mean of the individual vertices .
Parameters
----------
blob : dict
The blob from the pipeline.
Returns
-------
track : dict
Containing all the specified info the y should have.
"""
event_id = blob["EventInfo"].event_id[0]
run_id = blob["EventInfo"].run_id[0]
p = 0 # for ORCA4 (and probably subsequent productions)
mc_track = blob["McTracks"][p]
particle_type = mc_track.pdgid # assumed that this is the same for all muons in a bundle, new: pdgid, old: type
is_cc = 0 #set to 0
bjorkeny = 0 #set to zero
Daniel Guderian
committed
time_interaction = mc_track.time # same for all muons in a bundle
# sum up the energy from all muons that have at least x mc hits
n_hits_per_muon = get_mchits_per_muon(blob, inactive_du=inactive_du) #DU1 in ORCA4 is in the detx but not powered
#dont consider muons with less than 10 mc hits
suficient_hits_mask = n_hits_per_muon >= 15
energy = np.sum(blob["McTracks"][suficient_hits_mask].energy)
Daniel Guderian
committed
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
# all muons in a bundle are parallel, so just take dir of first muon
dir_x, dir_y, dir_z = mc_track.dir_x, mc_track.dir_y, mc_track.dir_z
# vertex is the weighted (energy) mean of the individual vertices
vertex_pos_x = np.average(
blob["McTracks"][p:].pos_x, weights=blob["McTracks"][p:].energy
)
vertex_pos_y = np.average(
blob["McTracks"][p:].pos_y, weights=blob["McTracks"][p:].energy
)
vertex_pos_z = np.average(
blob["McTracks"][p:].pos_z, weights=blob["McTracks"][p:].energy
)
# add also the nhits info
n_hits = len(blob["Hits"])
# this is only relevant for neutrinos, though dummy info is needed for the concatenation
weight_w1 = 10
weight_w2 = 10
weight_w3 = 10
track = {
"event_id": event_id,
"particle_type": particle_type,
"energy": energy,
"is_cc": is_cc,
"bjorkeny": bjorkeny,
"dir_x": dir_x,
"dir_y": dir_y,
"dir_z": dir_z,
"time_interaction": time_interaction,
"run_id": run_id,
"vertex_pos_x": vertex_pos_x,
"vertex_pos_y": vertex_pos_y,
"vertex_pos_z": vertex_pos_z,
"n_hits": n_hits,
"weight_w1": weight_w1,
"weight_w2": weight_w2,
"weight_w3": weight_w3,
"n_gen": n_gen,
"prod_identifier": prod_identifier,
Daniel Guderian
committed
}
# get all the std reco info
if has_std_reco:
std_reco_info = get_std_reco(blob,rec_types,rec_parameters_names)
Daniel Guderian
committed
track.update(std_reco_info)
return track
return mc_info_extr