Newer
Older
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# coding=utf-8
# Filename: data_to_images.py
"""
Main OrcaSong code which takes raw simulated .h5 files and the corresponding .detx detector file as input in
order to generate 2D/3D/4D histograms ('images') that can be used for CNNs.
The input file can be calibrated or not (e.g. contains pos_xyz of the hits).
Makes only 4D histograms ('images') by default.
Usage:
data_to_images.py [options] FILENAME DETXFILE
data_to_images.py (-h | --help)
Options:
-h --help Show this screen.
-c CONFIGFILE Load all options from a config file (.toml format).
--n_bins N_BINS Number of bins that are used in the image making for each dimension, e.g. (x,y,z,t).
[default: 11,13,18,60]
--det_geo DET_GEO Which detector geometry to use for the binning, e.g. 'Orca_115l_23m_h_9m_v'.
[default: Orca_115l_23m_h_9m_v]
--do2d If 2D histograms, 'images', should be created.
--do2d_plots If 2D pdf plot visualizations of the 2D histograms should be created, cannot be called if do2d=False.
--do2d_plots_n N For how many events the 2D plot visualizations should be made.
OrcaSong will exit after reaching N events. [default: 10]
--do3d If 3D histograms, 'images', should be created.
--dont_do4d If 4D histograms, 'images', should NOT be created.
--do4d_mode MODE What dimension should be used in the 4D histograms as the 4th dim.
Available: 'time', 'channel_id'. [default: time]
--timecut_mode MODE Defines what timecut mode should be used in hits_to_histograms.py.
At the moment, these cuts are only optimized for ORCA 115l neutrino events!
Currently available:
'timeslice_relative': Cuts out the central 30% of the snapshot.
'trigger_cluster': Cuts based on the mean of the triggered hits.
The timespan for this cut can be chosen in --timecut_timespan.
'None': No timecut.
[default: trigger_cluster]
--timecut_timespan TIMESPAN Only used with timecut_mode 'trigger_cluster'.
Defines the timespan of the trigger_cluster cut.
Currently available:
'all': [-350ns, 850ns] -> 20ns / bin (60 bins)
'tight-1': [-250ns, 500ns] -> 12.5ns / bin
'tight-2': [-150ns, 200ns] -> 5.8ns / bin
[default: tight-1]
--do_mc_hits If only the mc_hits (no BG hits!) should be used for the image processing.
--data_cut_triggered If non-triggered hits should be thrown away for the images.
--data_cut_e_low E_LOW Cut events that are lower than the specified threshold value in GeV.
--data_cut_e_high E_HIGH Cut events that are higher than the specified threshold value in GeV.
--data_cut_throw_away FRACTION Throw away a random fraction (percentage) of events. [default: 0.00]
--prod_ident PROD_IDENT Optional int identifier for the used mc production.
This is useful, if you use events from two different mc productions,
e.g. the 1-5GeV & 3-100GeV Orca 2016 MC. The prod_ident int will be saved in
the 'y' dataset of the output file of OrcaSong. [default: 1]
"""
__author__ = 'Michael Moser'
__license__ = 'AGPL'
__version__ = '1.0'
__email__ = 'michael.m.moser@fau.de'
__status__ = 'Prototype'
import os
import sys
import warnings
import toml
from docopt import docopt
#from memory_profiler import profile # for memory profiling, call with @profile; myfunc()
#import line_profiler # call with kernprof -l -v file.py args
import matplotlib as mpl
mpl.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages
from orcasong.file_to_hits import *
from orcasong.histograms_to_files import *
from orcasong.hits_to_histograms import *
def parse_input():
"""
Parses and returns all necessary input options for the data_to_images function.
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
Check the data_to_images function to get docs about the individual parameters.
"""
args = docopt(__doc__)
if args['-c']:
config = toml.load(args['-c'])
args.update(config)
fname = args['FILENAME']
detx_filepath = args['DETXFILE']
n_bins = tuple(map(int, args['--n_bins'].split(',')))
det_geo = args['--det_geo']
do2d = args['--do2d']
do2d_plots = (args['--do2d_plots'], int(args['--do2d_plots_n']))
do3d = args['--do3d']
do4d = (not bool(args['--dont_do4d']), args['--do4d_mode'])
timecut = (args['--timecut_mode'], args['--timecut_timespan'])
do_mc_hits = args['--do_mc_hits']
data_cuts = dict()
data_cuts['triggered'] = args['--data_cut_triggered']
data_cuts['energy_lower_limit'] = float(args['--data_cut_e_low']) if args['--data_cut_e_low'] is not None else None
data_cuts['energy_upper_limit'] = float(args['--data_cut_e_high']) if args['--data_cut_e_high'] is not None else None
data_cuts['throw_away_prob'] = float(args['--data_cut_throw_away'])
prod_ident = int(args['--prod_ident'])
return fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut,\
do_mc_hits, data_cuts
def parser_check_input(args):
Sanity check of the user input. Only necessary for options that are not boolean.
Parameters
----------
args : dict
Docopt parser element that contains all input information.
#---- Checks input types ----#
# Check for options with a single, non-boolean element
single_args = {'--det_geo': str, '--do2d_plots_n': int, '--do4d_mode': str, '--timecut_mode': str,
'timecut_timespan': str, '--data_cut_e_low ': float, '--data_cut_e_high': float,
'--data_cut_throw_away': float, '--prod_ident': int}
for key in single_args:
expected_arg_type = single_args[key]
parsed_arg = args[key]
if parsed_arg is None: # we don't want to check args when there has been no user input
continue
try:
map(expected_arg_type, parsed_arg)
except ValueError:
raise TypeError('The argument option ', key, ' only accepts ', str(expected_arg_type),
' values as an input.')
# Checks the n_bins tuple input
try:
map(int, args['--n_bins'].split(','))
except ValueError:
raise TypeError('The argument option n_bins only accepts integer values as an input'
' (Format: --n_bins 11,13,18,60).')
# ---- Checks input types ----#
# ---- Check other things ----#
if not os.path.isfile(args['FILENAME']):
raise IOError('The file -' + args['FILENAME'] + '- does not exist.')
if not os.path.isfile(args['DETXFILE']):
raise IOError('The file -' + args['DETXFILE'] + '- does not exist.')
if bool(args['--do2d']) == False and bool(args['--do2d_plots']) == True:
raise ValueError('The 2D pdf images cannot be created if do2d=False!')
if bool(args['--do2d_plots']) == True and int(args['--do2d_plots_n']) > 100:
warnings.warn('You declared do2d_pdf=(True, int) with int > 100. This will take more than two minutes.'
'Do you really want to create pdfs images for so many events?')
def calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d):
Calculates the bin edges for the corresponding detector geometry (1 DOM/bin) based on the number of specified bins.
Used later on for making the event "images" with the in the np.histogramdd funcs in hits_to_histograms.py.
The bin edges are necessary in order to get the same bin size for each event regardless of the fact if all bins have a hit or not.
Parameters
----------
n_bins : tuple
Contains the desired number of bins for each dimension, [n_bins_x, n_bins_y, n_bins_z].
det_geo : str
Declares what detector geometry should be used for the binning.
detx_filepath : str
Filepath of a .detx detector file which contains the geometry of the detector.
do4d : tuple(boo, str)
Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Returns
-------
x_bin_edges, y_bin_edges, z_bin_edges : ndarray(ndim=1)
The bin edges for the x,y,z direction.
# Loads a kp.Geometry object based on the filepath of a .detx file
print("Reading detector geometry in order to calculate the detector dimensions from file " + detx_filepath)
geo = kp.calib.Calibration(filename=detx_filepath)
# derive maximum and minimum x,y,z coordinates of the geometry input [[xmin, ymin, zmin], [xmax, ymax, zmax]]
dom_position_values = geo.get_detector().dom_positions.values()
dom_pos_max = np.amax([pos for pos in dom_position_values], axis=0)
dom_pos_min = np.amin([pos for pos in dom_position_values], axis=0)
geo_limits = dom_pos_min, dom_pos_max
print('Detector dimensions [[xmin, ymin, zmin], [xmax, ymax, zmax]]: ' + str(geo_limits))
if det_geo == 'Orca_115l_23m_h_9m_v' or det_geo == 'Orca_115l_23m_h_?m_v':
x_bin_edges = np.linspace(geo_limits[0][0] - 9.95, geo_limits[1][0] + 9.95, num=n_bins[0] + 1) #try to get the lines in the bin center 9.95*2 = average x-separation of two lines
y_bin_edges = np.linspace(geo_limits[0][1] - 9.75, geo_limits[1][1] + 9.75, num=n_bins[1] + 1) # Delta y = 19.483
# Fitted offsets: x,y,factor: factor*(x+x_off), # Stefan's modifications:
offset_x, offset_y, scale = [6.19, 0.064, 1.0128]
x_bin_edges = (x_bin_edges + offset_x) * scale
y_bin_edges = (y_bin_edges + offset_y) * scale
if det_geo == 'Orca_115l_23m_h_?m_v':
# ORCA denser detector study
z_bin_edges = np.linspace(37.84 - 7.5, 292.84 + 7.5, num=n_bins[2] + 1) # 15m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 6, 241.84 + 6, num=n_bins[2] + 1) # 12m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 4.5, 190.84 + 4.5, num=n_bins[2] + 1) # 9m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 3, 139.84 + 3, num=n_bins[2] + 1) # 6m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 2.25, 114.34 + 2.25, num=n_bins[2] + 1) # 4.5m vertical, 18 DOMs
else:
n_bins_z = n_bins[2] if do4d[1] != 'xzt-c' else n_bins[1] # n_bins = [xyz,t/c] or n_bins = [xzt,c]
z_bin_edges = np.linspace(geo_limits[0][2] - 4.665, geo_limits[1][2] + 4.665, num=n_bins_z + 1) # Delta z = 9.329
# calculate_bin_edges_test(dom_positions, y_bin_edges, z_bin_edges) # test disabled by default. Activate it, if you change the offsets in x/y/z-bin-edges
else:
raise ValueError('The specified detector geometry "' + str(det_geo) + '" is not available.')
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
return geo, x_bin_edges, y_bin_edges, z_bin_edges
def calculate_bin_edges_test(geo, y_bin_edges, z_bin_edges):
"""
Tests, if the bins in one direction don't accidentally have more than 'one' OM.
For the x-direction, an overlapping can not be avoided in an orthogonal space.
For y and z though, it can!
For y, every bin should contain the number of lines per y-direction * 18 for 18 doms per line.
For z, every bin should contain 115 entries, since every z bin contains one storey of the 115 ORCA lines.
Not 100% accurate, since only the dom positions are used and not the individual pmt positions for a dom.
"""
dom_positions = np.stack(list(geo.get_detector().dom_positions.values()))
dom_y = dom_positions[:, 1]
dom_z = dom_positions[:, 2]
hist_y = np.histogram(dom_y, bins=y_bin_edges)
hist_z = np.histogram(dom_z, bins=z_bin_edges)
print('----------------------------------------------------------------------------------------------')
print('Y-axis: Bin content: ' + str(hist_y[0]))
print('It should be: ' + str(np.array(
[4 * 18, 7 * 18, 9 * 18, 10 * 18, 9 * 18, 10 * 18, 10 * 18, 10 * 18, 11 * 18, 10 * 18, 9 * 18, 8 * 18, 8 * 18])))
print('Y-axis: Bin edges: ' + str(hist_y[1]))
print('..............................................................................................')
print('Z-axis: Bin content: ' + str(hist_z[0]))
print('It should have 115 entries everywhere')
print('Z-axis: Bin edges: ' + str(hist_z[1]))
print('----------------------------------------------------------------------------------------------')
ViaFerrata
committed
def get_file_particle_type(event_pump):
"""
Returns a string that specifies the type of the particles that are contained in the input file.
Parameters
----------
event_pump : kp.io.hdf5.HDF5Pump
Km3pipe HDF5Pump instance which is linked to the input file.
Returns
-------
file_particle_type : str
String that specifies the type of particles that are contained in the file: ['undefined', 'muon', 'neutrino'].
"""
if 'McTracks' not in event_pump[0]:
file_particle_type = 'undefined'
else:
particle_type = event_pump[0]['McTracks'].type
# if mupage file: first mc_track is an empty neutrino track, second track is the first muon
if particle_type[0] == 0 and np.abs(particle_type[1]) == 13:
ViaFerrata
committed
file_particle_type = 'muon'
# the first mc_track is the primary neutrino if the input file is containing neutrino events
elif np.abs(particle_type[0]) in [12, 14, 16]:
ViaFerrata
committed
file_particle_type = 'neutrino'
else:
raise ValueError('The type of the particles in the "McTracks" folder, <', str(particle_type), '> is not known.')
return file_particle_type
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
def skip_event(event_track, data_cuts):
"""
Function that checks if an event should be skipped, depending on the data_cuts input.
Parameters
----------
event_track : ndarray(ndim=1)
1D array containing important MC information of the event, only the energy of the event (pos_2) is used here.
data_cuts : dict
Dictionary that contains information about any possible cuts that should be applied.
Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'.
Returns
-------
continue_bool : bool
boolean flag to specify, if this event should be skipped or not.
"""
continue_bool = False
if data_cuts['energy_lower_limit'] is not None:
continue_bool = event_track[2] < data_cuts['energy_lower_limit'] # True if E < lower limit
if data_cuts['energy_upper_limit'] is not None:
continue_bool = event_track[2] > data_cuts['energy_upper_limit'] # True if E > upper limit
if data_cuts['throw_away_prob'] > 0:
throw_away_prob = data_cuts['throw_away_prob']
throw_away = np.random.choice([False, True], p=[1 - throw_away_prob, throw_away_prob])
if throw_away == True: continue_bool = True
# # TODO temporary, deprecated solution, we always need to throw away the same events if we have multiple inputs -> use fixed seed
# arr = np.load('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_elec-CC.npy')
# arr_list = arr.tolist()
# evt_id = event_track[0]
# run_id = event_track[9]
#
# if [run_id, evt_id] not in arr_list:
# continue
return continue_bool
def data_to_images(fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut,
do_mc_hits, data_cuts):
Main code with config parameters. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used
for a CNN.
Parameters
----------
fname : str
Filename (full path!) of the input file.
detx_filepath : str
String with the full filepath to the corresponding .detx file of the input file.
Used for the binning and for the hits calibration if the input file is not calibrated yet
(e.g. hits do not contain pos_x/y/z, time, ...).
n_bins : tuple of int
Declares the number of bins that should be used for each dimension, e.g. (x,y,z,t).
det_geo : str
Declares what detector geometry should be used for the binning. E.g. 'Orca_115l_23m_h_9m_v'.
do2d : bool
Declares if 2D histograms, 'images', should be created.
do2d_plots : tuple(bool, int)
Declares if pdf visualizations of the 2D histograms should be created, cannot be called if do2d=False.
The event loop will be stopped after the integer specified in the second argument.
do3d : bool
Declares if 3D histograms should be created.
do4d : tuple(bool, str)
Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Currently, only 'time' and 'channel_id' are available.
prod_ident : int
Optional int identifier for the used mc production.
This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC.
In this case, the events are not fully distinguishable with only the run_id and the event_id!
In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable.
timecut : tuple(str, str/None)
Tuple that defines what timecut should be used in hits_to_histograms.py.
Currently available:
('timeslice_relative', None): Cuts out the central 30% of the snapshot.
('trigger_cluster', 'all' / 'tight-1' / 'tight-2'): Cuts based on the mean of the triggered hits.
(None, ...): No timecut.
all: [-350ns, 850ns] -> 20ns / bin (60 bins)
tight-1: [-250ns, 500ns] -> 12.5ns / bin , tight-2: [-150ns, 200ns] -> 5.8ns / bin
do_mc_hits : bool
Declares if hits (False, mc_hits + BG) or mc_hits (True) should be processed.
data_cuts : dict
Dictionary that contains information about any possible cuts that should be applied.
Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'.
ViaFerrata
committed
np.random.seed(42) # set random seed
filename = os.path.basename(os.path.splitext(fname)[0])
filename_output = filename.replace('.','_')
geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d)
all_4d_to_2d_hists, all_4d_to_3d_hists, all_4d_to_4d_hists, mc_infos = [], [], [], []
pdf_2d_plots = PdfPages('Results/4dTo2d/' + filename_output + '_plots.pdf') if do2d_plots[0] is True else None
# Initialize HDF5Pump of the input file
event_pump = kp.io.hdf5.HDF5Pump(filename=fname)
ViaFerrata
committed
file_particle_type = get_file_particle_type(event_pump)
print('Generating histograms from the hits in XYZT format for files based on ' + fname)
for i, event_blob in enumerate(event_pump):
if i % 10 == 0:
# filter out all hit and track information belonging that to this event
ViaFerrata
committed
event_hits, event_track = get_event_data(event_blob, file_particle_type, geo, do_mc_hits, data_cuts, do4d, prod_ident)
continue_bool = skip_event(event_track, data_cuts)
if continue_bool: continue
ViaFerrata
committed
# event_track: [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time]
mc_infos.append(event_track)
if do2d:
compute_4d_to_2d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_2d_hists, timecut, event_track, do2d_plots[0], pdf_2d_plots)
if do3d:
compute_4d_to_3d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_3d_hists, timecut)
if do4d[0]:
compute_4d_to_4d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_4d_hists, timecut, do4d)
if do2d_plots[0] is True and i >= do2d_plots[1]:
pdf_2d_plots.close()
break
if do2d:
store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xy/' + filename_output + '_xy.h5')
store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xz/' + filename_output + '_xz.h5')
store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yz/' + filename_output + '_yz.h5')
store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xt/' + filename_output + '_xt.h5')
store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yt/' + filename_output + '_yt.h5')
store_histograms_as_hdf5(np.stack([hist_tuple[5] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/zt/' + filename_output + '_zt.h5')
if do3d:
store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyz/' + filename_output + '_xyz.h5', compression=('gzip', 1))
store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyt/' + filename_output + '_xyt.h5', compression=('gzip', 1))
store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xzt/' + filename_output + '_xzt.h5', compression=('gzip', 1))
store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/yzt/' + filename_output + '_yzt.h5', compression=('gzip', 1))
store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/rzt/' + filename_output + '_rzt.h5', compression=('gzip', 1))
if do4d[0]:
folder = ''
if not os.path.exists('Results/4dTo4d/h5/xyzt/' + folder):
os.makedirs('Results/4dTo4d/h5/xyzt/' + folder)
if folder != '': folder += '/'
if do4d[1] == 'channel_id':
store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzc/' + folder + filename_output + '_xyzc.h5', compression=('gzip', 1))
else:
store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzt/' + folder + filename_output + '_xyzt.h5', compression=('gzip', 1))
def main():
"""
Parses the input to the main data_to_images function.
"""
fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts = parse_input()
data_to_images(fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut,
do_mc_hits, data_cuts)