-
ViaFerrata authoredViaFerrata authored
data_to_images.py 24.53 KiB
#!/usr/bin/env python
# coding=utf-8
# Filename: data_to_images.py
"""
Main OrcaSong code which takes raw simulated .h5 files and the corresponding .detx detector file as input in
order to generate 2D/3D/4D histograms ('images') that can be used for CNNs.
First argument: KM3NeT hdf5 simfile at JTE level.
Second argument: a .detx file that is associated with the hdf5 file.
The input file can be calibrated or not (e.g. contains pos_xyz of the hits) and the OrcaSong output is written
to the current folder by default (otherwise use --o option).
Makes only 4D histograms ('images') by default.
Usage:
data_to_images.py [options] FILENAME DETXFILE
data_to_images.py (-h | --help)
Options:
-h --help Show this screen.
-c CONFIGFILE Load all options from a config file (.toml format).
--o OUTPUTPATH Path for the directory, where the OrcaSong output should be stored. [default: ./]
--chunksize CHUNKSIZE Chunksize (axis_0) that should be used for the hdf5 output of OrcaSong. [default: 32]
--complib COMPLIB Compression library that should be used for the OrcaSong output.
All PyTables compression filters are available. [default: zlib]
--complevel COMPLEVEL Compression level that should be used for the OrcaSong output. [default: 1]
--n_bins N_BINS Number of bins that are used in the image making for each dimension, e.g. (x,y,z,t).
[default: 11,13,18,60]
--det_geo DET_GEO Which detector geometry to use for the binning, e.g. 'Orca_115l_23m_h_9m_v'.
[default: Orca_115l_23m_h_9m_v]
--do2d If 2D histograms, 'images', should be created.
--do2d_plots If 2D pdf plot visualizations of the 2D histograms should be created, cannot be called if do2d=False.
--do2d_plots_n N For how many events the 2D plot visualizations should be made.
OrcaSong will exit after reaching N events. [default: 10]
--do3d If 3D histograms, 'images', should be created.
--dont_do4d If 4D histograms, 'images', should NOT be created.
--do4d_mode MODE What dimension should be used in the 4D histograms as the 4th dim.
Available: 'time', 'channel_id'. [default: time]
--timecut_mode MODE Defines what timecut mode should be used in hits_to_histograms.py.
At the moment, these cuts are only optimized for ORCA 115l neutrino events!
Currently available:
'timeslice_relative': Cuts out the central 30% of the snapshot.
'trigger_cluster': Cuts based on the mean of the triggered hits.
The timespan for this cut can be chosen in --timecut_timespan.
'None': No timecut.
[default: trigger_cluster]
--timecut_timespan TIMESPAN Only used with timecut_mode 'trigger_cluster'.
Defines the timespan of the trigger_cluster cut.
Currently available:
'all': [-350ns, 850ns] -> 20ns / bin (60 bins)
'tight-1': [-250ns, 500ns] -> 12.5ns / bin
'tight-2': [-150ns, 200ns] -> 5.8ns / bin
[default: tight-1]
--do_mc_hits If only the mc_hits (no BG hits!) should be used for the image processing.
--data_cut_triggered If non-triggered hits should be thrown away for the images.
--data_cut_e_low E_LOW Cut events that are lower than the specified threshold value in GeV.
--data_cut_e_high E_HIGH Cut events that are higher than the specified threshold value in GeV.
--data_cut_throw_away FRACTION Throw away a random fraction (percentage) of events. [default: 0.00]
--prod_ident PROD_IDENT Optional int identifier for the used mc production.
This is useful, if you use events from two different mc productions,
e.g. the 1-5GeV & 3-100GeV Orca 2016 MC. The prod_ident int will be saved in
the 'y' dataset of the output file of OrcaSong. [default: 1]
"""
__author__ = 'Michael Moser'
__license__ = 'AGPL'
__version__ = '1.0'
__email__ = 'michael.m.moser@fau.de'
__status__ = 'Prototype'
import os
import sys
import warnings
import toml
from docopt import docopt
#from memory_profiler import profile # for memory profiling, call with @profile; myfunc()
#import line_profiler # call with kernprof -l -v file.py args
import numpy as np
import km3pipe as kp
import km3modules as km
import matplotlib as mpl
mpl.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages
from orcasong.file_to_hits import EventDataExtractor
from orcasong.hits_to_histograms import HistogramMaker
def parse_input():
"""
Parses and returns all necessary input options for the data_to_images function.
Check the data_to_images function to get docs about the individual parameters.
"""
args = docopt(__doc__)
if args['-c']:
config = toml.load(args['-c'])
args.update(config)
fname = args['FILENAME']
detx_filepath = args['DETXFILE']
output_dirpath = args['--o']
chunksize = int(args['--chunksize'])
complib = args['--complib']
complevel = int(args['--complevel'])
n_bins = tuple(map(int, args['--n_bins'].split(',')))
det_geo = args['--det_geo']
do2d = args['--do2d']
do2d_plots = (args['--do2d_plots'], int(args['--do2d_plots_n']))
do3d = args['--do3d']
do4d = (not bool(args['--dont_do4d']), args['--do4d_mode'])
timecut = (args['--timecut_mode'], args['--timecut_timespan'])
do_mc_hits = args['--do_mc_hits']
data_cuts = dict()
data_cuts['triggered'] = args['--data_cut_triggered']
data_cuts['energy_lower_limit'] = float(args['--data_cut_e_low']) if args['--data_cut_e_low'] is not None else None
data_cuts['energy_upper_limit'] = float(args['--data_cut_e_high']) if args['--data_cut_e_high'] is not None else None
data_cuts['throw_away_prob'] = float(args['--data_cut_throw_away'])
prod_ident = int(args['--prod_ident'])
return fname, detx_filepath, output_dirpath, chunksize, complib, complevel, n_bins, det_geo, do2d,\
do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts
def parser_check_input(args):
"""
Sanity check of the user input. Only necessary for options that are not boolean.
Parameters
----------
args : dict
Docopt parser element that contains all input information.
"""
#---- Checks input types ----#
# Check for options with a single, non-boolean element
single_args = {'--det_geo': str, '--do2d_plots_n': int, '--do4d_mode': str, '--timecut_mode': str,
'timecut_timespan': str, '--data_cut_e_low ': float, '--data_cut_e_high': float,
'--data_cut_throw_away': float, '--prod_ident': int}
for key in single_args:
expected_arg_type = single_args[key]
parsed_arg = args[key]
if parsed_arg is None: # we don't want to check args when there has been no user input
continue
try:
map(expected_arg_type, parsed_arg)
except ValueError:
raise TypeError('The argument option ', key, ' only accepts ', str(expected_arg_type),
' values as an input.')
# Checks the n_bins tuple input
try:
map(int, args['--n_bins'].split(','))
except ValueError:
raise TypeError('The argument option n_bins only accepts integer values as an input'
' (Format: --n_bins 11,13,18,60).')
# ---- Checks input types ----#
# ---- Check other things ----#
if not os.path.isfile(args['FILENAME']):
raise IOError('The file -' + args['FILENAME'] + '- does not exist.')
if not os.path.isfile(args['DETXFILE']):
raise IOError('The file -' + args['DETXFILE'] + '- does not exist.')
if bool(args['--do2d']) == False and bool(args['--do2d_plots']) == True:
raise ValueError('The 2D pdf images cannot be created if do2d=False!')
if bool(args['--do2d_plots']) == True and int(args['--do2d_plots_n']) > 100:
warnings.warn('You declared do2d_pdf=(True, int) with int > 100. This will take more than two minutes.'
'Do you really want to create pdfs images for so many events?')
def make_output_dirs(output_dirpath, do2d, do3d, do4d):
"""
Function that creates all output directories if they don't exist already.
Parameters
----------
output_dirpath : str
Full path to the directory, where the orcasong output should be stored.
do2d : bool
Declares if 2D histograms, are to be created.
do3d : bool
Declares if 3D histograms are to be created.
do4d : tuple(bool, str)
Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Currently, only 'time' and 'channel_id' are available.
"""
if do2d:
projections = ['xy', 'xz', 'yz', 'xt', 'yt', 'zt']
for proj in projections:
if not os.path.exists(output_dirpath + '/orcasong_output/4dTo2d/' + proj):
os.makedirs(output_dirpath + '/orcasong_output/4dTo2d/' + proj)
if do3d:
projections = ['xyz', 'xyt', 'xzt', 'yzt', 'rzt']
for proj in projections:
if not os.path.exists(output_dirpath + '/orcasong_output/4dTo3d/' + proj):
os.makedirs(output_dirpath + '/orcasong_output/4dTo3d/' + proj)
if do4d[0]:
proj = 'xyzt' if not do4d[1] == 'channel_id' else 'xyzc'
if not os.path.exists(output_dirpath + '/orcasong_output/4dTo4d/' + proj):
os.makedirs(output_dirpath + '/orcasong_output/4dTo4d/' + proj)
def calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d):
"""
Calculates the bin edges for the corresponding detector geometry (1 DOM/bin) based on the number of specified bins.
Used later on for making the event "images" with the in the np.histogramdd funcs in hits_to_histograms.py.
The bin edges are necessary in order to get the same bin size for each event regardless of the fact if all bins have a hit or not.
Parameters
----------
n_bins : tuple
Contains the desired number of bins for each dimension, [n_bins_x, n_bins_y, n_bins_z].
det_geo : str
Declares what detector geometry should be used for the binning.
detx_filepath : str
Filepath of a .detx detector file which contains the geometry of the detector.
do4d : tuple(boo, str)
Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Returns
-------
x_bin_edges, y_bin_edges, z_bin_edges : ndarray(ndim=1)
The bin edges for the x,y,z direction.
"""
# Loads a kp.Geometry object based on the filepath of a .detx file
print("Reading detector geometry in order to calculate the detector dimensions from file " + detx_filepath)
geo = kp.calib.Calibration(filename=detx_filepath)
# derive maximum and minimum x,y,z coordinates of the geometry input [[xmin, ymin, zmin], [xmax, ymax, zmax]]
dom_position_values = geo.get_detector().dom_positions.values()
dom_pos_max = np.amax([pos for pos in dom_position_values], axis=0)
dom_pos_min = np.amin([pos for pos in dom_position_values], axis=0)
geo_limits = dom_pos_min, dom_pos_max
print('Detector dimensions [[xmin, ymin, zmin], [xmax, ymax, zmax]]: ' + str(geo_limits))
if det_geo == 'Orca_115l_23m_h_9m_v' or det_geo == 'Orca_115l_23m_h_?m_v':
x_bin_edges = np.linspace(geo_limits[0][0] - 9.95, geo_limits[1][0] + 9.95, num=n_bins[0] + 1) #try to get the lines in the bin center 9.95*2 = average x-separation of two lines
y_bin_edges = np.linspace(geo_limits[0][1] - 9.75, geo_limits[1][1] + 9.75, num=n_bins[1] + 1) # Delta y = 19.483
# Fitted offsets: x,y,factor: factor*(x+x_off), # Stefan's modifications:
offset_x, offset_y, scale = [6.19, 0.064, 1.0128]
x_bin_edges = (x_bin_edges + offset_x) * scale
y_bin_edges = (y_bin_edges + offset_y) * scale
if det_geo == 'Orca_115l_23m_h_?m_v':
# ORCA denser detector study
z_bin_edges = np.linspace(37.84 - 7.5, 292.84 + 7.5, num=n_bins[2] + 1) # 15m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 6, 241.84 + 6, num=n_bins[2] + 1) # 12m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 4.5, 190.84 + 4.5, num=n_bins[2] + 1) # 9m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 3, 139.84 + 3, num=n_bins[2] + 1) # 6m vertical, 18 DOMs
# z_bin_edges = np.linspace(37.84 - 2.25, 114.34 + 2.25, num=n_bins[2] + 1) # 4.5m vertical, 18 DOMs
else:
n_bins_z = n_bins[2] if do4d[1] != 'xzt-c' else n_bins[1] # n_bins = [xyz,t/c] or n_bins = [xzt,c]
z_bin_edges = np.linspace(geo_limits[0][2] - 4.665, geo_limits[1][2] + 4.665, num=n_bins_z + 1) # Delta z = 9.329
# calculate_bin_edges_test(dom_positions, y_bin_edges, z_bin_edges) # test disabled by default. Activate it, if you change the offsets in x/y/z-bin-edges
else:
raise ValueError('The specified detector geometry "' + str(det_geo) + '" is not available.')
return geo, x_bin_edges, y_bin_edges, z_bin_edges
def calculate_bin_edges_test(geo, y_bin_edges, z_bin_edges):
"""
Tests, if the bins in one direction don't accidentally have more than 'one' OM.
For the x-direction, an overlapping can not be avoided in an orthogonal space.
For y and z though, it can!
For y, every bin should contain the number of lines per y-direction * 18 for 18 doms per line.
For z, every bin should contain 115 entries, since every z bin contains one storey of the 115 ORCA lines.
Not 100% accurate, since only the dom positions are used and not the individual pmt positions for a dom.
"""
dom_positions = np.stack(list(geo.get_detector().dom_positions.values()))
dom_y = dom_positions[:, 1]
dom_z = dom_positions[:, 2]
hist_y = np.histogram(dom_y, bins=y_bin_edges)
hist_z = np.histogram(dom_z, bins=z_bin_edges)
print('----------------------------------------------------------------------------------------------')
print('Y-axis: Bin content: ' + str(hist_y[0]))
print('It should be: ' + str(np.array(
[4 * 18, 7 * 18, 9 * 18, 10 * 18, 9 * 18, 10 * 18, 10 * 18, 10 * 18, 11 * 18, 10 * 18, 9 * 18, 8 * 18, 8 * 18])))
print('Y-axis: Bin edges: ' + str(hist_y[1]))
print('..............................................................................................')
print('Z-axis: Bin content: ' + str(hist_z[0]))
print('It should have 115 entries everywhere')
print('Z-axis: Bin edges: ' + str(hist_z[1]))
print('----------------------------------------------------------------------------------------------')
def get_file_particle_type(fname):
"""
Returns a string that specifies the type of the particles that are contained in the input file.
Parameters
----------
fname : str
Filename (full path!) of the input file.
Returns
-------
file_particle_type : str
String that specifies the type of particles that are contained in the file: ['undefined', 'muon', 'neutrino'].
"""
event_pump = kp.io.hdf5.HDF5Pump(filename=fname) # TODO suppress print of hdf5pump and close pump afterwards
if 'McTracks' not in event_pump[0]:
file_particle_type = 'undefined'
else:
particle_type = event_pump[0]['McTracks'].type
# if mupage file: first mc_track is an empty neutrino track, second track is the first muon
if particle_type[0] == 0 and np.abs(particle_type[1]) == 13:
file_particle_type = 'muon'
# the first mc_track is the primary neutrino if the input file is containing neutrino events
elif np.abs(particle_type[0]) in [12, 14, 16]:
file_particle_type = 'neutrino'
else:
raise ValueError('The type of the particles in the "McTracks" folder, <', str(particle_type), '> is not known.')
event_pump.close_file()
return file_particle_type
class EventSkipper(kp.Module):
"""
KM3Pipe module that skips events based on some data_cuts.
"""
def configure(self):
self.data_cuts = self.require('data_cuts')
def process(self, blob):
continue_bool = skip_event(blob['event_track'], self.data_cuts)
if continue_bool:
return
else:
return blob
def skip_event(event_track, data_cuts):
"""
Function that checks if an event should be skipped, depending on the data_cuts input.
Parameters
----------
event_track : ndarray(ndim=1)
1D array containing important MC information of the event, only the energy of the event (pos_2) is used here.
data_cuts : dict
Dictionary that contains information about any possible cuts that should be applied.
Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'.
Returns
-------
continue_bool : bool
boolean flag to specify, if this event should be skipped or not.
"""
continue_bool = False
if data_cuts['energy_lower_limit'] is not None:
continue_bool = event_track.energy[0] < data_cuts['energy_lower_limit'] # True if E < lower limit
if data_cuts['energy_upper_limit'] is not None and continue_bool == False:
continue_bool = event_track.energy[0] > data_cuts['energy_upper_limit'] # True if E > upper limit
if data_cuts['throw_away_prob'] > 0 and continue_bool == False:
throw_away_prob = data_cuts['throw_away_prob']
throw_away = np.random.choice([False, True], p=[1 - throw_away_prob, throw_away_prob])
if throw_away: continue_bool = True
return continue_bool
def data_to_images(fname, detx_filepath, output_dirpath, chunksize, complib, complevel, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut,
do_mc_hits, data_cuts):
"""
Main code with config parameters. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used
for a CNN.
Parameters
----------
fname : str
Filename (full path!) of the input file.
detx_filepath : str
String with the full filepath to the corresponding .detx file of the input file.
Used for the binning and for the hits calibration if the input file is not calibrated yet
(e.g. hits do not contain pos_x/y/z, time, ...).
output_dirpath : str
Full path to the directory, where the orcasong output should be stored.
chunksize : int
Chunksize (along axis_0) that is used for saving the OrcaSong output to a .h5 file.
complib : str
Compression library that is used for saving the OrcaSong output to a .h5 file.
All PyTables compression filters are available, e.g. 'zlib', 'lzf', 'blosc', ... .
complevel : int
Compression level for the compression filter that is used for saving the OrcaSong output to a .h5 file.
n_bins : tuple of int
Declares the number of bins that should be used for each dimension, e.g. (x,y,z,t).
det_geo : str
Declares what detector geometry should be used for the binning. E.g. 'Orca_115l_23m_h_9m_v'.
do2d : bool
Declares if 2D histograms, 'images', should be created.
do2d_plots : tuple(bool, int)
Declares if pdf visualizations of the 2D histograms should be created, cannot be called if do2d=False.
The event loop will be stopped after the integer specified in the second argument.
do3d : bool
Declares if 3D histograms should be created.
do4d : tuple(bool, str)
Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz.
Currently, only 'time' and 'channel_id' are available.
prod_ident : int
Optional int identifier for the used mc production.
This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC.
In this case, the events are not fully distinguishable with only the run_id and the event_id!
In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable.
timecut : tuple(str, str/None)
Tuple that defines what timecut should be used in hits_to_histograms.py.
Currently available:
('timeslice_relative', None): Cuts out the central 30% of the snapshot.
('trigger_cluster', 'all' / 'tight-1' / 'tight-2'): Cuts based on the mean of the triggered hits.
(None, ...): No timecut.
all: [-350ns, 850ns] -> 20ns / bin (60 bins)
tight-1: [-250ns, 500ns] -> 12.5ns / bin , tight-2: [-150ns, 200ns] -> 5.8ns / bin
do_mc_hits : bool
Declares if hits (False, mc_hits + BG) or mc_hits (True) should be processed.
data_cuts : dict
Dictionary that contains information about any possible cuts that should be applied.
Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'.
"""
make_output_dirs(output_dirpath, do2d, do3d, do4d)
filename = os.path.basename(os.path.splitext(fname)[0])
filename_output = filename.replace('.','_')
km.GlobalRandomState(seed=42) # set random km3pipe (=numpy) seed
geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d)
pdf_2d_plots = PdfPages(output_dirpath + '/orcasong_output/4dTo2d/' + filename_output + '_plots.pdf') if do2d_plots[0] is True else None
file_particle_type = get_file_particle_type(fname)
print('Generating histograms from the hits for files based on ' + fname)
# Initialize OrcaSong Event Pipeline
pipe = kp.Pipeline()
pipe.attach(km.common.StatusBar, every=50)
pipe.attach(kp.io.hdf5.HDF5Pump, filename=fname)
pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits', 'McHits'])
pipe.attach(EventDataExtractor,
file_particle_type=file_particle_type, geo=geo, do_mc_hits=do_mc_hits,
data_cuts=data_cuts, do4d=do4d, prod_ident=prod_ident)
pipe.attach(km.common.Keep, keys=['event_hits', 'event_track'])
pipe.attach(EventSkipper, data_cuts=data_cuts)
pipe.attach(HistogramMaker,
x_bin_edges=x_bin_edges, y_bin_edges=y_bin_edges, z_bin_edges=z_bin_edges,
n_bins=n_bins, timecut=timecut, do2d=do2d, do2d_plots=do2d_plots, pdf_2d_plots=pdf_2d_plots,
do3d=do3d, do4d=do4d)
pipe.attach(km.common.Delete, keys=['event_hits'])
if do2d:
for proj in ['xy', 'xz', 'yz', 'xt', 'yt', 'zt']:
savestr = output_dirpath + '/orcasong_output/4dTo2d/' + proj + '/' + filename_output + '_' + proj + '.h5'
pipe.attach(kp.io.HDF5Sink, filename=savestr, blob_keys=[proj, 'event_track'], complib=complib, complevel=complevel, chunksize=chunksize)
if do3d:
for proj in ['xyz', 'xyt', 'xzt', 'yzt', 'rzt']:
savestr = output_dirpath + '/orcasong_output/4dTo3d/' + proj + '/' + filename_output + '_' + proj + '.h5'
pipe.attach(kp.io.HDF5Sink, filename=savestr, blob_keys=[proj, 'event_track'], complib=complib, complevel=complevel, chunksize=chunksize)
if do4d[0]:
proj = 'xyzt' if not do4d[1] == 'channel_id' else 'xyzc'
savestr = output_dirpath + '/orcasong_output/4dTo4d/' + proj + '/' + filename_output + '_' + proj + '.h5'
pipe.attach(kp.io.HDF5Sink, filename=savestr, blob_keys=[proj, 'event_track'], complib=complib, complevel=complevel, chunksize=chunksize)
# Execute Pipeline
pipe.drain()
def main():
"""
Parses the input to the main data_to_images function.
"""
fname, detx_filepath, output_dirpath, chunksize, complib, complevel, n_bins, det_geo, do2d, do2d_plots, do3d,\
do4d, prod_ident, timecut, do_mc_hits, data_cuts = parse_input()
data_to_images(fname, detx_filepath, output_dirpath, chunksize, complib, complevel, n_bins, det_geo, do2d,
do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts)
if __name__ == '__main__':
main()