diff --git a/docs/orcasong_plag.rst b/docs/orcasong_plag.rst
index aa0e9475e32c02a2564a81800a36be2afb7d343e..ba34f18e8011de8540610b921203ad2811c62575 100644
--- a/docs/orcasong_plag.rst
+++ b/docs/orcasong_plag.rst
@@ -35,8 +35,8 @@ Calling the object like this will show you the binning:

 .. code-block:: python

-    fb
-    >>> <FileBinner: ('pos_z', 'time') (10, 100)>
+    >>> fb
+    <FileBinner: ('pos_z', 'time') (10, 100)>

 As you can see, the FileBinner will produce zt data, with 10 and 100 bins,
 respectively.
diff --git a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
index 7580f22eca825df11db7e60eb43f22b798f2e9dd..05f302b66bc2bebb445e7ba449f8a91a78f3bfd5 100644
--- a/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
+++ b/orcasong_contrib/data_tools/concatenate/concatenate_h5.py
@@ -202,7 +202,8 @@ def get_f_compression_and_chunking(filepath):


 def concatenate_h5_files(output_filepath, file_list,
-                         chunksize=None, complib=None, complevel=None):
+                         chunksize=None, complib=None, complevel=None,
+                         event_skipper=None):
     """
     Function that concatenates hdf5 files based on an output_filepath and a
     file_list of input files.
@@ -229,6 +230,9 @@ def concatenate_h5_files(output_filepath, file_list,
         A compression level is only available for gzip compression, not lzf!
         If None, the compression level is read from the first input file.
         Else, a custom compression level will be used.
+    event_skipper : function, optional
+        Function that takes the "y" dataset and returns a boolean array
+        indicating which events to skip (i.e. not include in the output).

     """
     cum_rows_list = get_cum_number_of_rows(file_list)
@@ -251,6 +255,10 @@ def concatenate_h5_files(output_filepath, file_list,
         if 'format_version' in list(input_file.attrs.keys()) and n == 0:
             file_output.attrs['format_version'] = input_file.attrs['format_version']

+        if event_skipper is not None:
+            y_dataset = input_file["y"]
+            skips = event_skipper(y_dataset)
+
         for folder_name in input_file:

             if is_folder_ignored(folder_name):
@@ -270,6 +278,12 @@ def concatenate_h5_files(output_filepath, file_list,
             print('Shape and dtype of dataset ' + folder_name + ': '
                   + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))

+            if event_skipper is not None:
+                folder_data = folder_data[np.invert(skips)]
+                print('Event Skipper: Shape and dtype of dataset '
+                      + folder_name + ': ' + str(folder_data.shape)
+                      + ' ; ' + str(folder_data.dtype))
+
             if n == 0:
                 # first file; create the dummy dataset with no max shape
                 maxshape = (None,) + folder_data.shape[1:]  # change shape of axis zero to None
@@ -293,6 +307,9 @@ def concatenate_h5_files(output_filepath, file_list,
                   ' is the chunksize in axis_0): \n' + str(file_output[folder_name].shape) + ' ; ' +
                   str(file_output[folder_name].dtype) + ' ; ' + str(file_output[folder_name].chunks))

+    file_list_ascii = [n.encode("ascii", "ignore") for n in file_list]
+    file_output.create_dataset("used_files", data=file_list_ascii)
+
     file_output.close()


diff --git a/orcasong_plag/mc_info_types.py b/orcasong_plag/mc_info_types.py
index 4f21cb6acddefca83c27e81703500690c96a4a69..8398cd171481b559b44cf2bf2c41b9596dcb96a4 100644
--- a/orcasong_plag/mc_info_types.py
+++ b/orcasong_plag/mc_info_types.py
@@ -4,6 +4,7 @@ in the h5 files.
""" import numpy as np +import warnings def get_mc_info_extr(mc_info_extr): @@ -24,7 +25,7 @@ def get_mc_info_extr(mc_info_extr): mc_info_extr = get_event_and_run_id else: - raise ValueError("Unknown mc_info_type " + mc_info_extr) + raise NameError("Unknown mc_info_type " + mc_info_extr) return mc_info_extr @@ -46,6 +47,9 @@ def get_mupage_mc(blob): """ For mupage muon simulations. + Will only take into account muons with at least 1 McHit in the active + line of the detector. + e.g. mcv5.1_r3.mupage_10G.km3_AAv1.jterbr00002800.5103.root.h5 Parameters @@ -64,12 +68,37 @@ def get_mupage_mc(blob): track = dict() + # total number of simulated muons, not all might deposit counts + n_muons_sim = blob['McTracks'].shape[0] + track["n_muons_sim"] = n_muons_sim + + # Origin of each mchit (as int) in the active line + in_active_du = blob["McHits"]["du"] == active_du + origin = blob["McHits"]["origin"][in_active_du] + # get how many mchits were produced per muon in the bundle + origin_dict = dict(zip(*np.unique(origin, return_counts=True))) + origin_list = [] + for i in range(1, n_muons_sim + 1): + origin_list.append(origin_dict.get(i, 0)) + # origin_list[i] is num of mc_hits of muon i in active du + origin_list = np.array(origin_list) + + visible_mc_tracks = blob["McTracks"][origin_list > 0] + visible_origin_list = origin_list[origin_list > 0] + + # only muons with at least one mchit in active line + n_muons = len(visible_origin_list) + if n_muons == 0: + warnings.warn("No visible muons in blob!") + + track["n_muons"] = n_muons + track["event_id"] = blob['EventInfo'].event_id[0] track["run_id"] = blob["EventInfo"].run_id[0] # run_id = blob['Header'].start_run.run_id.astype('float32') # take 0: assumed that this is the same for all muons in a bundle - track["particle_type"] = blob['McTracks'][0].type + track["particle_type"] = visible_mc_tracks[0].type if n_muons != 0 else 0 # always 1 actually # track["is_cc"] = blob['McTracks'][0].is_cc @@ -77,33 +106,19 @@ def get_mupage_mc(blob): # always 0 actually # track["bjorkeny"] = blob['McTracks'][0].bjorkeny - # same for all muons in a bundle #TODO not? - track["time_interaction"] = blob['McTracks'][0].time - - # takes position of time_residual_vertex in 'neutrino' case - n_muons = blob['McTracks'].shape[0] - track["n_muons"] = n_muons + # same for all muons in a bundle TODO not? 
+ track["time_interaction"] = visible_mc_tracks[0].time if n_muons != 0 else 0 - # sum up the energy of all muons - energy = blob['McTracks'].energy + # sum up the energy of all visible muons + energy = visible_mc_tracks.energy if n_muons != 0 else 0 track["energy"] = np.sum(energy) - # Origin of each mchit (as int) in the active line - in_active_du = blob["McHits"]["du"] == active_du - origin = blob["McHits"]["origin"][in_active_du] - - # get how many mchits were produced per muon in the bundle - origin_dict = dict(zip(*np.unique(origin, return_counts=True))) - origin_list = [] - for i in range(1, n_muons+1): - origin_list.append(origin_dict.get(i, 0)) - origin_list = np.array(origin_list) - track["n_mc_hits"] = np.sum(origin_list) - desc_order = np.argsort(-origin_list) + track["n_mc_hits"] = np.sum(visible_origin_list) + desc_order = np.argsort(-visible_origin_list) # Sort by energy, highest first - sorted_energy = energy[desc_order] - sorted_mc_hits = origin_list[desc_order] + sorted_energy = energy[desc_order] if n_muons != 0 else [] + sorted_mc_hits = visible_origin_list[desc_order] if n_muons != 0 else [] # Store number of mchits of the 10 highest mchits muons (-1 if it has less) for i in range(10): @@ -127,24 +142,27 @@ def get_mupage_mc(blob): track[field_name] = field_data - # only muons with at least one mchit in active line - track["n_muons_visible"] = len(origin_list[origin_list > 0]) # only muons with at least 5 mchits in active line track["n_muons_5_mchits"] = len(origin_list[origin_list > 4]) # only muons with at least 10 mchits in active line track["n_muons_10_mchits"] = len(origin_list[origin_list > 9]) # all muons in a bundle are parallel, so just take dir of first muon - track["dir_x"] = blob['McTracks'][0].dir_x - track["dir_y"] = blob['McTracks'][0].dir_y - track["dir_z"] = blob['McTracks'][0].dir_z - - # vertex is the weighted (energy) mean of the individual vertices - track["vertex_pos_x"] = np.average(blob['McTracks'][:].pos_x, - weights=blob['McTracks'][:].energy) - track["vertex_pos_y"] = np.average(blob['McTracks'][:].pos_y, - weights=blob['McTracks'][:].energy) - track["vertex_pos_z"] = np.average(blob['McTracks'][:].pos_z, - weights=blob['McTracks'][:].energy) + track["dir_x"] = visible_mc_tracks[0].dir_x if n_muons != 0 else 0 + track["dir_y"] = visible_mc_tracks[0].dir_y if n_muons != 0 else 0 + track["dir_z"] = visible_mc_tracks[0].dir_z if n_muons != 0 else 0 + + if n_muons != 0: + # vertex is the weighted (energy) mean of the individual vertices + track["vertex_pos_x"] = np.average(visible_mc_tracks[:].pos_x, + weights=visible_mc_tracks[:].energy) + track["vertex_pos_y"] = np.average(visible_mc_tracks[:].pos_y, + weights=visible_mc_tracks[:].energy) + track["vertex_pos_z"] = np.average(visible_mc_tracks[:].pos_z, + weights=visible_mc_tracks[:].energy) + else: + track["vertex_pos_x"] = 0 + track["vertex_pos_y"] = 0 + track["vertex_pos_z"] = 0 return track diff --git a/orcasong_plag/util/split_conc.py b/orcasong_plag/util/split_conc.py new file mode 100644 index 0000000000000000000000000000000000000000..35613d7986a8232b4e8633c7ca97c6cc4c8e1c56 --- /dev/null +++ b/orcasong_plag/util/split_conc.py @@ -0,0 +1,119 @@ +import os +import numpy as np + + +def get_files(folder): + """ + Get pathes of all h5 files in given folder. 
+ """ + infiles = os.listdir(folder) + infiles.sort() + + infile_paths = [] + for infile in infiles: + if infile.endswith(".h5"): + infile_paths.append(os.path.join(folder, infile)) + + return np.array(infile_paths) + + +def split_path_list(files, train_frac, n_train_files, n_val_files): + """ + Get train and val files split according to given fraction, and + distributed over given number of files. + + Parameters + ---------- + files : List + The files. + train_frac : float + The fraction of files. + n_train_files : int + Total number of resulting train files. + n_val_files : int + Total number of resulting val files. + + Returns + ------- + job_files_train : ndarray + The train files. They are chosen randomly from the files list. + The total number of files is the given fraction of the input files. + job_files_val : ndarray + The val files, similar to the train files. + + """ + if n_train_files < 1 or n_val_files < 1: + raise ValueError("Need at least 1 file for train and val.") + + order = np.arange(len(files)) + np.random.shuffle(order) + + len_train_files = int(len(files) * train_frac) + train_files = files[order[:len_train_files]] + val_files = files[order[len_train_files:]] + + job_files_train = np.array_split(train_files, n_train_files) + job_files_val = np.array_split(val_files, n_val_files) + + for fs in job_files_train: + if len(fs) == 0: + raise ValueError("No files for an output train file!") + + for fs in job_files_val: + if len(fs) == 0: + raise ValueError("No files for an output val file!") + + return job_files_train, job_files_val + + +def get_split(folder, outfile_basestr, n_train_files=1, n_val_files=1, + train_frac=0.8): + """ + Prepare to concatentate binned .h5 files to training and validation files. + + The files in each train or val file will be drawn randomly from the + available files. Each train or val files will be created by its own + seperately submitted job. + + Parameters + ---------- + folder : str + Containing the files to concat. + n_train_files : int + Total number of resulting train files. + n_val_files : int + Total number of resulting val files. + outfile_basestr : str + Path and a base for the name. "train"/"val" and a file number will + get automatically attached to the name. + train_frac : float + The fraction of files in the train set. + + Returns + ------- + jobs : dict + Contains the arguments for the concatenate script. + + """ + files = get_files(folder) + job_files_train, job_files_val = split_path_list(files, train_frac, n_train_files, n_val_files) + + jobs = [] + + for i, job_files in enumerate(job_files_train): + output_filepath = "{}_train_{}.h5".format(outfile_basestr, i) + job_dict = { + "file_list": job_files, + "output_filepath": output_filepath + } + jobs.append(job_dict) + + for i, job_files in enumerate(job_files_val): + output_filepath = "{}_val_{}.h5".format(outfile_basestr, i) + job_dict = { + "file_list": job_files, + "output_filepath": output_filepath + } + jobs.append(job_dict) + + return jobs