Commit 95bcd75e authored by Stefan Reck

Some small additions.

parent b0d29102
@@ -35,8 +35,8 @@ Calling the object like this will show you the binning:
.. code-block:: python
fb
>>> <FileBinner: ('pos_z', 'time') (10, 100)>
>>> fb
<FileBinner: ('pos_z', 'time') (10, 100)>
As you can see, the FileBinner will produce zt data, with 10 and 100 bins,
respectively.
@@ -202,7 +202,8 @@ def get_f_compression_and_chunking(filepath):
def concatenate_h5_files(output_filepath, file_list,
chunksize=None, complib=None, complevel=None):
chunksize=None, complib=None, complevel=None,
event_skipper=None):
"""
Function that concatenates hdf5 files based on an output_filepath and a file_list of input files.
@@ -229,6 +230,9 @@ def concatenate_h5_files(output_filepath, file_list,
A compression level is only available for gzip compression, not lzf!
If None, the compression level is read from the first input file.
Else, a custom compression level will be used.
event_skipper : function, optional
Function that takes the "y" dataset and returns a boolean array
indicating which events to skip (i.e. not include in the output).
"""
cum_rows_list = get_cum_number_of_rows(file_list)
@@ -251,6 +255,10 @@ def concatenate_h5_files(output_filepath, file_list,
if 'format_version' in list(input_file.attrs.keys()) and n == 0:
file_output.attrs['format_version'] = input_file.attrs['format_version']
if event_skipper is not None:
y_dataset = input_file["y"]
skips = event_skipper(y_dataset)
for folder_name in input_file:
if is_folder_ignored(folder_name):
@@ -270,6 +278,12 @@ def concatenate_h5_files(output_filepath, file_list,
print('Shape and dtype of dataset ' + folder_name + ': ' + str(folder_data.shape) + ' ; ' + str(folder_data.dtype))
if event_skipper is not None:
folder_data = folder_data[skips]
print('Event Skipper: Shape and dtype of dataset ' +
folder_name + ': ' + str(folder_data.shape) +
' ; ' + str(folder_data.dtype))
if n == 0:
# first file; create the dummy dataset with no max shape
maxshape = (None,) + folder_data.shape[1:] # change shape of axis zero to None
@@ -293,6 +307,9 @@ def concatenate_h5_files(output_filepath, file_list,
' is the chunksize in axis_0): \n' + str(file_output[folder_name].shape) + ' ; ' +
str(file_output[folder_name].dtype) + ' ; ' + str(file_output[folder_name].chunks))
file_list_ascii = [n.encode("ascii", "ignore") for n in file_list]
file_output.create_dataset("used_files", data=file_list_ascii)
file_output.close()
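As a usage note for the new event_skipper argument, the following is a minimal sketch of a skipper function. The energy threshold and the assumption that the "y" dataset exposes an "energy" column are hypothetical and not part of this commit:

.. code-block:: python

    import numpy as np

    def skip_low_energy(y_dataset):
        """Return a bool array marking which events to skip.

        Hypothetical example: skip every event with an energy below 1 GeV.
        Assumes the "y" dataset has an "energy" column; adjust to the
        actual mc_info fields of the input files.
        """
        return np.asarray(y_dataset["energy"]) < 1.0

    # concatenate_h5_files("out.h5", ["in_0.h5", "in_1.h5"],
    #                      event_skipper=skip_low_energy)

Depending on whether the returned array is meant as a skip mask (as the docstring says) or is applied directly as an index mask (folder_data = folder_data[skips] above), the condition in such a sketch may need to be inverted.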
@@ -4,6 +4,7 @@ in the h5 files.
"""
import numpy as np
import warnings
def get_mc_info_extr(mc_info_extr):
@@ -24,7 +25,7 @@ def get_mc_info_extr(mc_info_extr):
mc_info_extr = get_event_and_run_id
else:
raise ValueError("Unknown mc_info_type " + mc_info_extr)
raise NameError("Unknown mc_info_type " + mc_info_extr)
return mc_info_extr
@@ -46,6 +47,9 @@ def get_mupage_mc(blob):
"""
For mupage muon simulations.
Will only take into account muons with at least 1 McHit in the active
line of the detector.
e.g. mcv5.1_r3.mupage_10G.km3_AAv1.jterbr00002800.5103.root.h5
Parameters
@@ -64,12 +68,37 @@ def get_mupage_mc(blob):
track = dict()
# total number of simulated muons; not all of them may deposit hits
n_muons_sim = blob['McTracks'].shape[0]
track["n_muons_sim"] = n_muons_sim
# Origin of each mchit (as int) in the active line
in_active_du = blob["McHits"]["du"] == active_du
origin = blob["McHits"]["origin"][in_active_du]
# get how many mchits were produced per muon in the bundle
origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
origin_list = []
for i in range(1, n_muons_sim + 1):
origin_list.append(origin_dict.get(i, 0))
# origin_list holds the number of mchits per simulated muon in the active du
origin_list = np.array(origin_list)
visible_mc_tracks = blob["McTracks"][origin_list > 0]
visible_origin_list = origin_list[origin_list > 0]
# only muons with at least one mchit in active line
n_muons = len(visible_origin_list)
if n_muons == 0:
warnings.warn("No visible muons in blob!")
track["n_muons"] = n_muons
track["event_id"] = blob['EventInfo'].event_id[0]
track["run_id"] = blob["EventInfo"].run_id[0]
# run_id = blob['Header'].start_run.run_id.astype('float32')
# take 0: assumed that this is the same for all muons in a bundle
track["particle_type"] = blob['McTracks'][0].type
track["particle_type"] = visible_mc_tracks[0].type if n_muons != 0 else 0
# always 1 actually
# track["is_cc"] = blob['McTracks'][0].is_cc
@@ -77,33 +106,19 @@ def get_mupage_mc(blob):
# always 0 actually
# track["bjorkeny"] = blob['McTracks'][0].bjorkeny
# same for all muons in a bundle #TODO not?
track["time_interaction"] = blob['McTracks'][0].time
# takes position of time_residual_vertex in 'neutrino' case
n_muons = blob['McTracks'].shape[0]
track["n_muons"] = n_muons
# same for all muons in a bundle TODO not?
track["time_interaction"] = visible_mc_tracks[0].time if n_muons != 0 else 0
# sum up the energy of all muons
energy = blob['McTracks'].energy
# sum up the energy of all visible muons
energy = visible_mc_tracks.energy if n_muons != 0 else 0
track["energy"] = np.sum(energy)
# Origin of each mchit (as int) in the active line
in_active_du = blob["McHits"]["du"] == active_du
origin = blob["McHits"]["origin"][in_active_du]
# get how many mchits were produced per muon in the bundle
origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
origin_list = []
for i in range(1, n_muons+1):
origin_list.append(origin_dict.get(i, 0))
origin_list = np.array(origin_list)
track["n_mc_hits"] = np.sum(origin_list)
desc_order = np.argsort(-origin_list)
track["n_mc_hits"] = np.sum(visible_origin_list)
desc_order = np.argsort(-visible_origin_list)
# Sort by energy, highest first
sorted_energy = energy[desc_order]
sorted_mc_hits = origin_list[desc_order]
sorted_energy = energy[desc_order] if n_muons != 0 else []
sorted_mc_hits = visible_origin_list[desc_order] if n_muons != 0 else []
# Store the number of mchits for the 10 muons with the most mchits (-1 if there are fewer)
for i in range(10):
@@ -127,24 +142,27 @@ def get_mupage_mc(blob):
track[field_name] = field_data
# only muons with at least one mchit in active line
track["n_muons_visible"] = len(origin_list[origin_list > 0])
# only muons with at least 5 mchits in active line
track["n_muons_5_mchits"] = len(origin_list[origin_list > 4])
# only muons with at least 10 mchits in active line
track["n_muons_10_mchits"] = len(origin_list[origin_list > 9])
# all muons in a bundle are parallel, so just take dir of first muon
track["dir_x"] = blob['McTracks'][0].dir_x
track["dir_y"] = blob['McTracks'][0].dir_y
track["dir_z"] = blob['McTracks'][0].dir_z
# vertex is the weighted (energy) mean of the individual vertices
track["vertex_pos_x"] = np.average(blob['McTracks'][:].pos_x,
weights=blob['McTracks'][:].energy)
track["vertex_pos_y"] = np.average(blob['McTracks'][:].pos_y,
weights=blob['McTracks'][:].energy)
track["vertex_pos_z"] = np.average(blob['McTracks'][:].pos_z,
weights=blob['McTracks'][:].energy)
track["dir_x"] = visible_mc_tracks[0].dir_x if n_muons != 0 else 0
track["dir_y"] = visible_mc_tracks[0].dir_y if n_muons != 0 else 0
track["dir_z"] = visible_mc_tracks[0].dir_z if n_muons != 0 else 0
if n_muons != 0:
# vertex is the weighted (energy) mean of the individual vertices
track["vertex_pos_x"] = np.average(visible_mc_tracks[:].pos_x,
weights=visible_mc_tracks[:].energy)
track["vertex_pos_y"] = np.average(visible_mc_tracks[:].pos_y,
weights=visible_mc_tracks[:].energy)
track["vertex_pos_z"] = np.average(visible_mc_tracks[:].pos_z,
weights=visible_mc_tracks[:].energy)
else:
track["vertex_pos_x"] = 0
track["vertex_pos_y"] = 0
track["vertex_pos_z"] = 0
return track
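The bookkeeping above hinges on counting McHits per muon via np.unique over the origin column. A small, self-contained sketch of that counting step, with made-up origin ids (independent of km3pipe and of this commit):

.. code-block:: python

    import numpy as np

    # Hypothetical origins of 7 mchits in the active DU; muon ids start at 1.
    origin = np.array([1, 1, 3, 3, 3, 5, 5])
    n_muons_sim = 5  # muons 2 and 4 left no mchits in the active DU

    # Count mchits per muon id, defaulting to 0 for muons without hits.
    origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
    origin_list = np.array([origin_dict.get(i, 0)
                            for i in range(1, n_muons_sim + 1)])
    # origin_list is now [2, 0, 3, 0, 2]

    # Muons with at least one mchit count as visible.
    visible = origin_list > 0   # 3 of the 5 simulated muons are visible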
import os
import numpy as np
def get_files(folder):
"""
Get paths of all h5 files in the given folder.
"""
infiles = os.listdir(folder)
infiles.sort()
infile_paths = []
for infile in infiles:
if infile.endswith(".h5"):
infile_paths.append(os.path.join(folder, infile))
return np.array(infile_paths)
def split_path_list(files, train_frac, n_train_files, n_val_files):
"""
Split the files into train and val sets according to the given fraction,
and distribute them over the given number of output files.
Parameters
----------
files : List
The files.
train_frac : float
The fraction of files to use for the train set.
n_train_files : int
Total number of resulting train files.
n_val_files : int
Total number of resulting val files.
Returns
-------
job_files_train : list of ndarray
The train files, one array per output train file. They are chosen randomly
from the files list; in total they make up the given fraction of the input files.
job_files_val : list of ndarray
The val files, split analogously to the train files.
"""
if n_train_files < 1 or n_val_files < 1:
raise ValueError("Need at least 1 file for train and val.")
order = np.arange(len(files))
np.random.shuffle(order)
len_train_files = int(len(files) * train_frac)
train_files = files[order[:len_train_files]]
val_files = files[order[len_train_files:]]
job_files_train = np.array_split(train_files, n_train_files)
job_files_val = np.array_split(val_files, n_val_files)
for fs in job_files_train:
if len(fs) == 0:
raise ValueError("No files for an output train file!")
for fs in job_files_val:
if len(fs) == 0:
raise ValueError("No files for an output val file!")
return job_files_train, job_files_val
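A short usage sketch of split_path_list with placeholder file names (the paths and the seed are made up for illustration):

.. code-block:: python

    import numpy as np

    files = np.array(["a.h5", "b.h5", "c.h5", "d.h5", "e.h5"])
    np.random.seed(42)  # only to make the shuffle reproducible in this sketch

    # 80 % of the files go to the train set, split over 2 train files;
    # the remaining files go into 1 val file.
    job_files_train, job_files_val = split_path_list(
        files, train_frac=0.8, n_train_files=2, n_val_files=1)

    for i, fs in enumerate(job_files_train):
        print("train file", i, "gets", list(fs))
    print("val file 0 gets", list(job_files_val[0]))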
def get_split(folder, outfile_basestr, n_train_files=1, n_val_files=1,
train_frac=0.8):
"""
Prepare to concatenate binned .h5 files into training and validation files.
The files in each train or val file will be drawn randomly from the
available files. Each train or val file will be created by its own
separately submitted job.
Parameters
----------
folder : str
Folder containing the files to concatenate.
n_train_files : int
Total number of resulting train files.
n_val_files : int
Total number of resulting val files.
outfile_basestr : str
Path and a base for the name. "train"/"val" and a file number will
get automatically attached to the name.
train_frac : float
The fraction of files in the train set.
Returns
-------
jobs : list of dict
Each dict contains the arguments for the concatenate script.
"""
files = get_files(folder)
job_files_train, job_files_val = split_path_list(files, train_frac, n_train_files, n_val_files)
jobs = []
for i, job_files in enumerate(job_files_train):
output_filepath = "{}_train_{}.h5".format(outfile_basestr, i)
job_dict = {
"file_list": job_files,
"output_filepath": output_filepath
}
jobs.append(job_dict)
for i, job_files in enumerate(job_files_val):
output_filepath = "{}_val_{}.h5".format(outfile_basestr, i)
job_dict = {
"file_list": job_files,
"output_filepath": output_filepath
}
jobs.append(job_dict)
return jobs
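Finally, a sketch of how the returned job list could be consumed. The folder and output base path are placeholders, and feeding each entry to concatenate_h5_files is one possible way to run the jobs, not something this commit does itself:

.. code-block:: python

    jobs = get_split("/path/to/binned_files", "/path/to/output/dataset",
                     n_train_files=2, n_val_files=1, train_frac=0.8)

    for job in jobs:
        # Each entry carries exactly the arguments prepared above.
        print(job["output_filepath"], ":", len(job["file_list"]), "input files")
        # e.g. one concatenation job per entry:
        # concatenate_h5_files(job["output_filepath"], job["file_list"])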