#!/usr/bin/env python # coding=utf-8 # Filename: data_to_images.py """ Main OrcaSong code which takes raw simulated .h5 files and the corresponding .detx detector file as input in order to generate 2D/3D/4D histograms ('images') that can be used for CNNs. The input file can be calibrated or not (e.g. contains pos_xyz of the hits). Makes only 4D histograms ('images') by default. Usage: data_to_images.py [options] FILENAME DETXFILE data_to_images.py (-h | --help) Options: -h --help Show this screen. -c CONFIGFILE Load all options from a config file (.toml format). --n_bins N_BINS Number of bins that are used in the image making for each dimension, e.g. (x,y,z,t). [default: 11,13,18,60] --det_geo DET_GEO Which detector geometry to use for the binning, e.g. 'Orca_115l_23m_h_9m_v'. [default: Orca_115l_23m_h_9m_v] --do2d If 2D histograms, 'images', should be created. --do2d_plots If 2D pdf plot visualizations of the 2D histograms should be created, cannot be called if do2d=False. --do2d_plots_n N For how many events the 2D plot visualizations should be made. OrcaSong will exit after reaching N events. [default: 10] --do3d If 3D histograms, 'images', should be created. --dont_do4d If 4D histograms, 'images', should NOT be created. --do4d_mode MODE What dimension should be used in the 4D histograms as the 4th dim. Available: 'time', 'channel_id'. [default: time] --timecut_mode MODE Defines what timecut mode should be used in hits_to_histograms.py. At the moment, these cuts are only optimized for ORCA 115l neutrino events! Currently available: 'timeslice_relative': Cuts out the central 30% of the snapshot. 'trigger_cluster': Cuts based on the mean of the triggered hits. The timespan for this cut can be chosen in --timecut_timespan. 'None': No timecut. [default: trigger_cluster] --timecut_timespan TIMESPAN Only used with timecut_mode 'trigger_cluster'. Defines the timespan of the trigger_cluster cut. Currently available: 'all': [-350ns, 850ns] -> 20ns / bin (60 bins) 'tight-1': [-250ns, 500ns] -> 12.5ns / bin 'tight-2': [-150ns, 200ns] -> 5.8ns / bin [default: tight-1] --do_mc_hits If only the mc_hits (no BG hits!) should be used for the image processing. --data_cut_triggered If non-triggered hits should be thrown away for the images. --data_cut_e_low E_LOW Cut events that are lower than the specified threshold value in GeV. --data_cut_e_high E_HIGH Cut events that are higher than the specified threshold value in GeV. --data_cut_throw_away FRACTION Throw away a random fraction (percentage) of events. [default: 0.00] --prod_ident PROD_IDENT Optional int identifier for the used mc production. This is useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC. The prod_ident int will be saved in the 'y' dataset of the output file of OrcaSong. [default: 1] """ __author__ = 'Michael Moser' __license__ = 'AGPL' __version__ = '1.0' __email__ = 'michael.m.moser@fau.de' __status__ = 'Prototype' import os import sys import warnings import toml from docopt import docopt #from memory_profiler import profile # for memory profiling, call with @profile; myfunc() #import line_profiler # call with kernprof -l -v file.py args import km3pipe as kp import matplotlib as mpl mpl.use('Agg') from matplotlib.backends.backend_pdf import PdfPages from orcasong.file_to_hits import * from orcasong.histograms_to_files import * from orcasong.hits_to_histograms import * def parse_input(): """ Parses and returns all necessary input options for the data_to_images function. Check the data_to_images function to get docs about the individual parameters. """ args = docopt(__doc__) if args['-c']: config = toml.load(args['-c']) args.update(config) fname = args['FILENAME'] detx_filepath = args['DETXFILE'] n_bins = tuple(map(int, args['--n_bins'].split(','))) det_geo = args['--det_geo'] do2d = args['--do2d'] do2d_plots = (args['--do2d_plots'], int(args['--do2d_plots_n'])) do3d = args['--do3d'] do4d = (not bool(args['--dont_do4d']), args['--do4d_mode']) timecut = (args['--timecut_mode'], args['--timecut_timespan']) do_mc_hits = args['--do_mc_hits'] data_cuts = dict() data_cuts['triggered'] = args['--data_cut_triggered'] data_cuts['energy_lower_limit'] = float(args['--data_cut_e_low']) if args['--data_cut_e_low'] is not None else None data_cuts['energy_upper_limit'] = float(args['--data_cut_e_high']) if args['--data_cut_e_high'] is not None else None data_cuts['throw_away_prob'] = float(args['--data_cut_throw_away']) prod_ident = int(args['--prod_ident']) return fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut,\ do_mc_hits, data_cuts def parser_check_input(args): """ Sanity check of the user input. Only necessary for options that are not boolean. Parameters ---------- args : dict Docopt parser element that contains all input information. """ #---- Checks input types ----# # Check for options with a single, non-boolean element single_args = {'--det_geo': str, '--do2d_plots_n': int, '--do4d_mode': str, '--timecut_mode': str, 'timecut_timespan': str, '--data_cut_e_low ': float, '--data_cut_e_high': float, '--data_cut_throw_away': float, '--prod_ident': int} for key in single_args: expected_arg_type = single_args[key] parsed_arg = args[key] if parsed_arg is None: # we don't want to check args when there has been no user input continue try: map(expected_arg_type, parsed_arg) except ValueError: raise TypeError('The argument option ', key, ' only accepts ', str(expected_arg_type), ' values as an input.') # Checks the n_bins tuple input try: map(int, args['--n_bins'].split(',')) except ValueError: raise TypeError('The argument option n_bins only accepts integer values as an input' ' (Format: --n_bins 11,13,18,60).') # ---- Checks input types ----# # ---- Check other things ----# if not os.path.isfile(args['FILENAME']): raise IOError('The file -' + args['FILENAME'] + '- does not exist.') if not os.path.isfile(args['DETXFILE']): raise IOError('The file -' + args['DETXFILE'] + '- does not exist.') if bool(args['--do2d']) == False and bool(args['--do2d_plots']) == True: raise ValueError('The 2D pdf images cannot be created if do2d=False!') if bool(args['--do2d_plots']) == True and int(args['--do2d_plots_n']) > 100: warnings.warn('You declared do2d_pdf=(True, int) with int > 100. This will take more than two minutes.' 'Do you really want to create pdfs images for so many events?') def calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d): """ Calculates the bin edges for the corresponding detector geometry (1 DOM/bin) based on the number of specified bins. Used later on for making the event "images" with the in the np.histogramdd funcs in hits_to_histograms.py. The bin edges are necessary in order to get the same bin size for each event regardless of the fact if all bins have a hit or not. Parameters ---------- n_bins : tuple Contains the desired number of bins for each dimension, [n_bins_x, n_bins_y, n_bins_z]. det_geo : str Declares what detector geometry should be used for the binning. detx_filepath : str Filepath of a .detx detector file which contains the geometry of the detector. do4d : tuple(boo, str) Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz. Returns ------- x_bin_edges, y_bin_edges, z_bin_edges : ndarray(ndim=1) The bin edges for the x,y,z direction. """ # Loads a kp.Geometry object based on the filepath of a .detx file print("Reading detector geometry in order to calculate the detector dimensions from file " + detx_filepath) geo = kp.calib.Calibration(filename=detx_filepath) # derive maximum and minimum x,y,z coordinates of the geometry input [[xmin, ymin, zmin], [xmax, ymax, zmax]] dom_position_values = geo.get_detector().dom_positions.values() dom_pos_max = np.amax([pos for pos in dom_position_values], axis=0) dom_pos_min = np.amin([pos for pos in dom_position_values], axis=0) geo_limits = dom_pos_min, dom_pos_max print('Detector dimensions [[xmin, ymin, zmin], [xmax, ymax, zmax]]: ' + str(geo_limits)) if det_geo == 'Orca_115l_23m_h_9m_v' or det_geo == 'Orca_115l_23m_h_?m_v': x_bin_edges = np.linspace(geo_limits[0][0] - 9.95, geo_limits[1][0] + 9.95, num=n_bins[0] + 1) #try to get the lines in the bin center 9.95*2 = average x-separation of two lines y_bin_edges = np.linspace(geo_limits[0][1] - 9.75, geo_limits[1][1] + 9.75, num=n_bins[1] + 1) # Delta y = 19.483 # Fitted offsets: x,y,factor: factor*(x+x_off), # Stefan's modifications: offset_x, offset_y, scale = [6.19, 0.064, 1.0128] x_bin_edges = (x_bin_edges + offset_x) * scale y_bin_edges = (y_bin_edges + offset_y) * scale if det_geo == 'Orca_115l_23m_h_?m_v': # ORCA denser detector study z_bin_edges = np.linspace(37.84 - 7.5, 292.84 + 7.5, num=n_bins[2] + 1) # 15m vertical, 18 DOMs # z_bin_edges = np.linspace(37.84 - 6, 241.84 + 6, num=n_bins[2] + 1) # 12m vertical, 18 DOMs # z_bin_edges = np.linspace(37.84 - 4.5, 190.84 + 4.5, num=n_bins[2] + 1) # 9m vertical, 18 DOMs # z_bin_edges = np.linspace(37.84 - 3, 139.84 + 3, num=n_bins[2] + 1) # 6m vertical, 18 DOMs # z_bin_edges = np.linspace(37.84 - 2.25, 114.34 + 2.25, num=n_bins[2] + 1) # 4.5m vertical, 18 DOMs else: n_bins_z = n_bins[2] if do4d[1] != 'xzt-c' else n_bins[1] # n_bins = [xyz,t/c] or n_bins = [xzt,c] z_bin_edges = np.linspace(geo_limits[0][2] - 4.665, geo_limits[1][2] + 4.665, num=n_bins_z + 1) # Delta z = 9.329 # calculate_bin_edges_test(dom_positions, y_bin_edges, z_bin_edges) # test disabled by default. Activate it, if you change the offsets in x/y/z-bin-edges else: raise ValueError('The specified detector geometry "' + str(det_geo) + '" is not available.') return geo, x_bin_edges, y_bin_edges, z_bin_edges def calculate_bin_edges_test(geo, y_bin_edges, z_bin_edges): """ Tests, if the bins in one direction don't accidentally have more than 'one' OM. For the x-direction, an overlapping can not be avoided in an orthogonal space. For y and z though, it can! For y, every bin should contain the number of lines per y-direction * 18 for 18 doms per line. For z, every bin should contain 115 entries, since every z bin contains one storey of the 115 ORCA lines. Not 100% accurate, since only the dom positions are used and not the individual pmt positions for a dom. """ dom_positions = np.stack(list(geo.get_detector().dom_positions.values())) dom_y = dom_positions[:, 1] dom_z = dom_positions[:, 2] hist_y = np.histogram(dom_y, bins=y_bin_edges) hist_z = np.histogram(dom_z, bins=z_bin_edges) print('----------------------------------------------------------------------------------------------') print('Y-axis: Bin content: ' + str(hist_y[0])) print('It should be: ' + str(np.array( [4 * 18, 7 * 18, 9 * 18, 10 * 18, 9 * 18, 10 * 18, 10 * 18, 10 * 18, 11 * 18, 10 * 18, 9 * 18, 8 * 18, 8 * 18]))) print('Y-axis: Bin edges: ' + str(hist_y[1])) print('..............................................................................................') print('Z-axis: Bin content: ' + str(hist_z[0])) print('It should have 115 entries everywhere') print('Z-axis: Bin edges: ' + str(hist_z[1])) print('----------------------------------------------------------------------------------------------') def get_file_particle_type(event_pump): """ Returns a string that specifies the type of the particles that are contained in the input file. Parameters ---------- event_pump : kp.io.hdf5.HDF5Pump Km3pipe HDF5Pump instance which is linked to the input file. Returns ------- file_particle_type : str String that specifies the type of particles that are contained in the file: ['undefined', 'muon', 'neutrino']. """ if 'McTracks' not in event_pump[0]: file_particle_type = 'undefined' else: particle_type = event_pump[0]['McTracks'].type # if mupage file: first mc_track is an empty neutrino track, second track is the first muon if particle_type[0] == 0 and np.abs(particle_type[1]) == 13: file_particle_type = 'muon' # the first mc_track is the primary neutrino if the input file is containing neutrino events elif np.abs(particle_type[0]) in [12, 14, 16]: file_particle_type = 'neutrino' else: raise ValueError('The type of the particles in the "McTracks" folder, <', str(particle_type), '> is not known.') return file_particle_type def skip_event(event_track, data_cuts): """ Function that checks if an event should be skipped, depending on the data_cuts input. Parameters ---------- event_track : ndarray(ndim=1) 1D array containing important MC information of the event, only the energy of the event (pos_2) is used here. data_cuts : dict Dictionary that contains information about any possible cuts that should be applied. Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'. Returns ------- continue_bool : bool boolean flag to specify, if this event should be skipped or not. """ continue_bool = False if data_cuts['energy_lower_limit'] is not None: continue_bool = event_track[2] < data_cuts['energy_lower_limit'] # True if E < lower limit if data_cuts['energy_upper_limit'] is not None: continue_bool = event_track[2] > data_cuts['energy_upper_limit'] # True if E > upper limit if data_cuts['throw_away_prob'] > 0: throw_away_prob = data_cuts['throw_away_prob'] throw_away = np.random.choice([False, True], p=[1 - throw_away_prob, throw_away_prob]) if throw_away == True: continue_bool = True # # TODO temporary, deprecated solution, we always need to throw away the same events if we have multiple inputs -> use fixed seed # arr = np.load('/home/woody/capn/mppi033h/Code/OrcaSong/utilities/low_e_prod_surviving_evts_elec-CC.npy') # arr_list = arr.tolist() # evt_id = event_track[0] # run_id = event_track[9] # # if [run_id, evt_id] not in arr_list: # continue return continue_bool def data_to_images(fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts): """ Main code with config parameters. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used for a CNN. Parameters ---------- fname : str Filename (full path!) of the input file. detx_filepath : str String with the full filepath to the corresponding .detx file of the input file. Used for the binning and for the hits calibration if the input file is not calibrated yet (e.g. hits do not contain pos_x/y/z, time, ...). n_bins : tuple of int Declares the number of bins that should be used for each dimension, e.g. (x,y,z,t). det_geo : str Declares what detector geometry should be used for the binning. E.g. 'Orca_115l_23m_h_9m_v'. do2d : bool Declares if 2D histograms, 'images', should be created. do2d_plots : tuple(bool, int) Declares if pdf visualizations of the 2D histograms should be created, cannot be called if do2d=False. The event loop will be stopped after the integer specified in the second argument. do3d : bool Declares if 3D histograms should be created. do4d : tuple(bool, str) Tuple that declares if 4D histograms should be created [0] and if yes, what should be used as the 4th dim after xyz. Currently, only 'time' and 'channel_id' are available. prod_ident : int Optional int identifier for the used mc production. This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC. In this case, the events are not fully distinguishable with only the run_id and the event_id! In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable. timecut : tuple(str, str/None) Tuple that defines what timecut should be used in hits_to_histograms.py. Currently available: ('timeslice_relative', None): Cuts out the central 30% of the snapshot. ('trigger_cluster', 'all' / 'tight-1' / 'tight-2'): Cuts based on the mean of the triggered hits. (None, ...): No timecut. all: [-350ns, 850ns] -> 20ns / bin (60 bins) tight-1: [-250ns, 500ns] -> 12.5ns / bin , tight-2: [-150ns, 200ns] -> 5.8ns / bin do_mc_hits : bool Declares if hits (False, mc_hits + BG) or mc_hits (True) should be processed. data_cuts : dict Dictionary that contains information about any possible cuts that should be applied. Supports the following cuts: 'triggered', 'energy_lower_limit', 'energy_upper_limit', 'throw_away_prob'. """ np.random.seed(42) # set random seed filename = os.path.basename(os.path.splitext(fname)[0]) filename_output = filename.replace('.','_') geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(n_bins, det_geo, detx_filepath, do4d) all_4d_to_2d_hists, all_4d_to_3d_hists, all_4d_to_4d_hists, mc_infos = [], [], [], [] pdf_2d_plots = PdfPages('Results/4dTo2d/' + filename_output + '_plots.pdf') if do2d_plots[0] is True else None # Initialize HDF5Pump of the input file event_pump = kp.io.hdf5.HDF5Pump(filename=fname) file_particle_type = get_file_particle_type(event_pump) print('Generating histograms from the hits in XYZT format for files based on ' + fname) for i, event_blob in enumerate(event_pump): if i % 10 == 0: print('Event No. ' + str(i)) # filter out all hit and track information belonging that to this event event_hits, event_track = get_event_data(event_blob, file_particle_type, geo, do_mc_hits, data_cuts, do4d, prod_ident) continue_bool = skip_event(event_track, data_cuts) if continue_bool: continue # event_track: [event_id, particle_type, energy, isCC, bjorkeny, dir_x/y/z, time] mc_infos.append(event_track) if do2d: compute_4d_to_2d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_2d_hists, timecut, event_track, do2d_plots[0], pdf_2d_plots) if do3d: compute_4d_to_3d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_3d_hists, timecut) if do4d[0]: compute_4d_to_4d_histograms(event_hits, x_bin_edges, y_bin_edges, z_bin_edges, n_bins, all_4d_to_4d_hists, timecut, do4d) if do2d_plots[0] is True and i >= do2d_plots[1]: pdf_2d_plots.close() break if do2d: store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xy/' + filename_output + '_xy.h5') store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xz/' + filename_output + '_xz.h5') store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yz/' + filename_output + '_yz.h5') store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/xt/' + filename_output + '_xt.h5') store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/yt/' + filename_output + '_yt.h5') store_histograms_as_hdf5(np.stack([hist_tuple[5] for hist_tuple in all_4d_to_2d_hists]), np.array(mc_infos), 'Results/4dTo2d/h5/zt/' + filename_output + '_zt.h5') if do3d: store_histograms_as_hdf5(np.stack([hist_tuple[0] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyz/' + filename_output + '_xyz.h5', compression=('gzip', 1)) store_histograms_as_hdf5(np.stack([hist_tuple[1] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xyt/' + filename_output + '_xyt.h5', compression=('gzip', 1)) store_histograms_as_hdf5(np.stack([hist_tuple[2] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/xzt/' + filename_output + '_xzt.h5', compression=('gzip', 1)) store_histograms_as_hdf5(np.stack([hist_tuple[3] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/yzt/' + filename_output + '_yzt.h5', compression=('gzip', 1)) store_histograms_as_hdf5(np.stack([hist_tuple[4] for hist_tuple in all_4d_to_3d_hists]), np.array(mc_infos), 'Results/4dTo3d/h5/rzt/' + filename_output + '_rzt.h5', compression=('gzip', 1)) if do4d[0]: folder = '' if not os.path.exists('Results/4dTo4d/h5/xyzt/' + folder): os.makedirs('Results/4dTo4d/h5/xyzt/' + folder) if folder != '': folder += '/' if do4d[1] == 'channel_id': store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzc/' + folder + filename_output + '_xyzc.h5', compression=('gzip', 1)) else: store_histograms_as_hdf5(np.array(all_4d_to_4d_hists), np.array(mc_infos), 'Results/4dTo4d/h5/xyzt/' + folder + filename_output + '_xyzt.h5', compression=('gzip', 1)) def main(): """ Parses the input to the main data_to_images function. """ fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts = parse_input() data_to_images(fname, detx_filepath, n_bins, det_geo, do2d, do2d_plots, do3d, do4d, prod_ident, timecut, do_mc_hits, data_cuts) if __name__ == '__main__': main()