diff --git a/docs/getting_started.rst b/docs/getting_started.rst index d95a8accfa49fc98fe1e6a46da1cb5a85a9ebc84..b4de5da1a06d9ca40a68f43f1784e63876f0862f 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -29,6 +29,7 @@ If you have an orcasong config file, you can use it via the command line like th orcasong run aanet_file.h5 orcasong_config.toml --detx_file detector.detx +Check out the git repo here https://git.km3net.de/ml/OrcaSong/-/tree/master/examples for some examples of config files. Alternatively, you can use the python frontend of orcasong. See :ref:`orcasong_page` for instructions on how to do this. diff --git a/examples/orcasong_bundle_corsika.toml b/examples/orcasong_bundle_corsika.toml new file mode 100644 index 0000000000000000000000000000000000000000..aface2a4ddf4303c0ad929874e0bbc9aff28496b --- /dev/null +++ b/examples/orcasong_bundle_corsika.toml @@ -0,0 +1,13 @@ +# for atmospheric muon bundles, ORCA4, corsika sibyll 2.3c, graph network +mode = "graph" +max_n_hits = 2000 +time_window = [-250, 1000] +extractor = "bundle_mc" +# center of mupage detector: +center_hits_to = [0.020, -0.040, 111.186] + +[extractor_config] +inactive_du = 1 +plane_point = [17, 17, 111] +only_downgoing_tracks = true +is_corsika = true diff --git a/examples/orcasong_bundle_data.toml b/examples/orcasong_bundle_data.toml new file mode 100644 index 0000000000000000000000000000000000000000..16ad7390cf8e4d503be8a704519d53dcaded769a --- /dev/null +++ b/examples/orcasong_bundle_data.toml @@ -0,0 +1,8 @@ +# for atmospheric muon bundles, data ORCA4, graph network +mode = "graph" +max_n_hits = 2000 +time_window = [-250, 1000] +extractor = "bundle_data" + +[extractor_config] +only_downgoing_tracks = true diff --git a/examples/orcasong_bundle_mupage.toml b/examples/orcasong_bundle_mupage.toml new file mode 100644 index 0000000000000000000000000000000000000000..63d30d10c22cbdc5e37056b774390cbae94c4874 --- /dev/null +++ b/examples/orcasong_bundle_mupage.toml @@ -0,0 +1,10 @@ +# for atmospheric muon bundles, mupage ORCA4, graph network +mode = "graph" +max_n_hits = 2000 +time_window = [-250, 1000] +extractor = "bundle_mc" + +[extractor_config] +inactive_du = 1 +plane_point = [17, 17, 111] +only_downgoing_tracks = true diff --git a/orcasong/core.py b/orcasong/core.py index 67bb916498db498950e25109b8c1c9e22d12cab0..57de103db300fc6a98e787092e4f0c3366e0de3d 100644 --- a/orcasong/core.py +++ b/orcasong/core.py @@ -59,9 +59,11 @@ class BaseProcessor: overwrite : bool If True, overwrite the output file if it exists already. If False, throw an error instead. - mc_info_to_float64 : bool - Convert everything in the mcinfo array to float 64 (Default: True). - Hint: int dtypes can not store nan! + sort_y : bool + Sort the columns in the y dataset alphabetically. + y_to_float64 : bool + Convert everything in the y dataset to float 64 (Default: True). + Hint: Not all other dtypes can store nan! Attributes ---------- @@ -97,7 +99,8 @@ class BaseProcessor: chunksize=32, keep_event_info=False, overwrite=True, - mc_info_to_float64=True): + sort_y=True, + y_to_float64=True): self.extractor = extractor self.det_file = det_file self.correct_mc_time = correct_mc_time @@ -109,7 +112,8 @@ class BaseProcessor: self.chunksize = chunksize self.keep_event_info = keep_event_info self.overwrite = overwrite - self.mc_info_to_float64 = mc_info_to_float64 + self.sort_y = sort_y + self.y_to_float64 = y_to_float64 self.n_statusbar = 1000 self.n_memory_observer = 1000 @@ -212,7 +216,8 @@ class BaseProcessor: if self.extractor is not None: cmpts.append((modules.McInfoMaker, { "extractor": self.extractor, - "to_float64": self.mc_info_to_float64, + "to_float64": self.y_to_float64, + "sort_y": self.sort_y, "store_as": "mc_info"})) if self.event_skipper is not None: diff --git a/orcasong/extractors/__init__.py b/orcasong/extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f8df4e7591dc68d2fffe6c246bd0357577cdf1be --- /dev/null +++ b/orcasong/extractors/__init__.py @@ -0,0 +1,2 @@ +from .neutrino_chain import get_neutrino_mc_info_extr, get_muon_mc_info_extr, get_real_data_info_extr, get_random_noise_mc_info_extr +from .bundles import BundleMCExtractor, BundleDataExtractor diff --git a/orcasong/extractors/bundles.py b/orcasong/extractors/bundles.py new file mode 100644 index 0000000000000000000000000000000000000000..8bbab3893740b9a6172d1eae8a2c930060811283 --- /dev/null +++ b/orcasong/extractors/bundles.py @@ -0,0 +1,349 @@ +import warnings +import numpy as np + + +class BundleDataExtractor: + """ Get info present in real data. """ + def __init__(self, infile, only_downgoing_tracks=False): + self.only_downgoing_tracks = only_downgoing_tracks + + def __call__(self, blob): + # just take everything from event info + if not len(blob['EventInfo']) == 1: + warnings.warn(f"Event info has length {len(blob['EventInfo'])}, not 1") + track = dict(zip(blob['EventInfo'].dtype.names, blob['EventInfo'][0])) + track.update(**get_best_track( + blob, only_downgoing_tracks=self.only_downgoing_tracks)) + + track["n_hits"] = len(blob["Hits"]) + track["n_triggered_hits"] = blob["Hits"]["triggered"].sum() + is_triggered = blob["Hits"]["triggered"].astype(bool) + track["n_triggered_doms"] = len(np.unique(blob["Hits"]["dom_id"][is_triggered])) + track["t_last_triggered"] = blob["Hits"]["time"][is_triggered].max() + + unique_hits = get_only_first_hit_per_pmt(blob["Hits"]) + track["n_pmts"] = len(unique_hits) + track["n_triggered_pmts"] = unique_hits["triggered"].sum() + + if "n_hits_intime" in blob["EventInfo"]: + n_hits_intime = blob["EventInfo"]["n_hits_intime"] + else: + n_hits_intime = np.nan + track["n_hits_intime"] = n_hits_intime + return track + + +def get_only_first_hit_per_pmt(hits): + """ Keep only the first hit of each pmt. """ + idents = np.stack((hits["dom_id"], hits["channel_id"]), axis=-1) + sorted_time_indices = np.argsort(hits["time"]) + # indices of first hit per pmt in time sorted array: + indices = np.unique(idents[sorted_time_indices], axis=0, return_index=True)[1] + # indices of first hit per pmt in original array: + first_hit_indices = np.sort(sorted_time_indices[indices]) + return hits[first_hit_indices] + + +def get_best_track(blob, missing_value=np.nan, only_downgoing_tracks=False): + """ + I mean first track, i.e. the one with longest chain and highest lkl/nhits. + Can also take the best track only of those that are downgoing. + """ + # hardcode names here since the first blob might not have Tracks + names = ('E', + 'JCOPY_Z_M', + 'JENERGY_CHI2', + 'JENERGY_ENERGY', + 'JENERGY_MUON_RANGE_METRES', + 'JENERGY_NDF', + 'JENERGY_NOISE_LIKELIHOOD', + 'JENERGY_NUMBER_OF_HITS', + 'JGANDALF_BETA0_RAD', + 'JGANDALF_BETA1_RAD', + 'JGANDALF_CHI2', + 'JGANDALF_LAMBDA', + 'JGANDALF_NUMBER_OF_HITS', + 'JGANDALF_NUMBER_OF_ITERATIONS', + 'JSHOWERFIT_ENERGY', + 'JSTART_LENGTH_METRES', + 'JSTART_NPE_MIP', + 'JSTART_NPE_MIP_TOTAL', + 'JVETO_NPE', + 'JVETO_NUMBER_OF_HITS', + 'dir_x', + 'dir_y', + 'dir_z', + 'id', + 'idx', + 'length', + 'likelihood', + 'pos_x', + 'pos_y', + 'pos_z', + 'rec_type', + 't', + 'group_id') + index = None + if "Tracks" in blob: + if only_downgoing_tracks: + downs = np.where(blob["Tracks"].dir_z < 0)[0] + if len(downs) != 0: + index = downs[0] + else: + index = 0 + + if index is not None: + track = blob["Tracks"][index] + return {f"jg_{name}_reco": track[name] for name in names} + else: + return {f"jg_{name}_reco": missing_value for name in names} + + +class BundleMCExtractor: + """ + For atmospheric muon studies on mupage or corsika simulations. + + Parameters + ---------- + inactive_du : int or None + Don't count mchits in this du. + min_n_mchits_list : tuple + How many mchits does a muon have to produce to be counted? + Create a seperate set of entries for each number in the tuple. + plane_point : tuple + For bundle diameter: XYZ coordinates of where the center of the + plane is in which the muon positions get calculated. Should be set + to the center of the detector! + with_mc_index : bool + Add a column called mc_index containing the mc run number, + which is attempted to be read from the filename. This is for + when the same run id/event id combination appears in mc files, + which can happend e.g. in run by run simulations when there are + multiplie mc runs per data run. + is_corsika : bool + Use this when using Corsika!!! + only_downgoing_tracks : bool + For the best track (JG reco), consider only the ones that are downgoing. + missing_value : float + If a value is missing, use this value instead. + + """ + def __init__(self, + infile, + inactive_du=1, + min_n_mchits_list=(0, 1, 10), + plane_point=(17, 17, 111), + with_mc_index=True, + is_corsika=False, + only_downgoing_tracks=False, + missing_value=np.nan, + ): + self.inactive_du = inactive_du + self.min_n_mchits_list = min_n_mchits_list + self.plane_point = plane_point + self.with_mc_index = with_mc_index + self.missing_value = missing_value + self.is_corsika = is_corsika + self.only_downgoing_tracks = only_downgoing_tracks + + self.data_extractor = BundleDataExtractor( + infile, only_downgoing_tracks=only_downgoing_tracks) + + if self.with_mc_index: + self.mc_index = get_mc_index(infile) + print(f"Using mc_index {self.mc_index}") + else: + self.mc_index = None + + def __call__(self, blob): + mc_info = self.data_extractor(blob) + + if self.is_corsika: + # Corsika has a primary particle. Store infos about it + prim_track = blob["McTracks"][0] + + # primary should be track 0 with id 0 + if prim_track["id"] != 0: + raise ValueError("Error finding primary: mc_tracks[0]['id'] != 0") + + # direction of the primary + mc_info["dir_x"] = prim_track.dir_x + mc_info["dir_y"] = prim_track.dir_y + mc_info["dir_z"] = prim_track.dir_z + # use primary direction as plane normal + plane_normal = np.array(prim_track[["dir_x", "dir_y", "dir_z"]]) + + for fld in ("pos_x", "pos_y", "pos_z", "pdgid", "energy", "time"): + mc_info[f"primary_{fld}"] = prim_track[fld] + + # remove primary for the following, since it's not a muon + blob["McTracks"] = blob["McTracks"][1:] + else: + # In mupage, all muons in a bundle are parallel. So just take dir of first muon + mc_info["dir_x"] = blob["McTracks"].dir_x[0] + mc_info["dir_y"] = blob["McTracks"].dir_y[0] + mc_info["dir_z"] = blob["McTracks"].dir_z[0] + plane_normal = None + + # n_mc_hits of each muon in active dus + mchits_per_muon = get_mchits_per_muon(blob, inactive_du=self.inactive_du) + + for min_n_mchits in self.min_n_mchits_list: + if min_n_mchits == 0: + mc_tracks_sel = blob["McTracks"] + suffix = "sim" + else: + mc_tracks_sel = blob["McTracks"][mchits_per_muon >= min_n_mchits] + suffix = f"{min_n_mchits}_mchits" + + # total number of mchits of all muons + mc_info[f"n_mc_hits_{suffix}"] = np.sum( + mchits_per_muon[mchits_per_muon >= min_n_mchits]) + + # number of muons with at least the given number of mchits + mc_info[f"n_muons_{suffix}"] = len(mc_tracks_sel) + + # summed up energy of all muons + mc_info[f"energy_{suffix}"] = np.sum(mc_tracks_sel.energy) + mc_info[f"energy_lost_in_can_{suffix}"] = np.sum( + mc_tracks_sel.energy_lost_in_can) + + # bundle diameter; only makes sense for 2+ muons + if len(mc_tracks_sel) >= 2: + positions_plane = get_plane_positions( + positions=mc_tracks_sel[["pos_x", "pos_y", "pos_z"]].to_dataframe().to_numpy(), + directions=mc_tracks_sel[["dir_x", "dir_y", "dir_z"]].to_dataframe().to_numpy(), + plane_point=self.plane_point, + plane_normal=plane_normal, + ) + pairwise_distances = get_pairwise_distances(positions_plane) + mc_info[f"max_pair_dist_{suffix}"] = pairwise_distances.max() + mc_info[f"mean_pair_dist_{suffix}"] = pairwise_distances.mean() + else: + mc_info[f"max_pair_dist_{suffix}"] = self.missing_value + mc_info[f"mean_pair_dist_{suffix}"] = self.missing_value + + if self.with_mc_index: + mc_info["mc_index"] = self.mc_index + + return mc_info + + +def get_plane_positions(positions, directions, plane_point, plane_normal=None): + """ + Get the position of each muon in a 2d plane. + Length will be preserved, i.e. 1m in 3d space is also 1m in plane space. + + Parameters + ---------- + positions : np.array + The position of each muon in 3d cartesian space, shape (n_muons, 3). + directions : np.array + The direction of each muon as a cartesian unit vector, shape (n_muons, 3). + plane_point : np.array + A 3d cartesian point on the plane. This will be (0, 0) in the plane + coordinate system. Shape (3, ). + plane_normal : np.array, optional + A 3d cartesian vector perpendicular to the plane, shape (3, ). + Default: Use directions if all muons are parallel, otherwise raise. + + Returns + ------- + positions_plane : np.array + The 2d position of each muon in the plane, shape (n_muons, 2). + + """ + if plane_normal is None: + if not np.all(directions == directions[0]): + raise ValueError( + "Muon tracks are not all parallel: plane_normal has to be specified!") + plane_normal = directions[0] + + # get the 3d points where each muon collides with the plane + points = [] + for i in range(len(directions)): + ndotu = np.dot(plane_normal, directions[i]) + if abs(ndotu) < 1e-6: + raise ValueError("no intersection or line is within plane") + + w = positions[i] - plane_point + si = -np.dot(plane_normal, w) / ndotu + psi = w + si * directions[i] + plane_point + points.append(psi) + points = np.array(points) + + # Get the unit vectors of the plane. u is 0 in x, v is 0 in y. + u = np.array([1, 0, -plane_normal[0] / plane_normal[2]]) + v = np.array([0, 1, -plane_normal[1] / plane_normal[2]]) + # norm: + u = u / np.linalg.norm(u) + v = v / np.linalg.norm(v) + + # xy coordinates in plane + x_dash = (points[:, 0] - plane_point[0]) / u[0] + y_dash = (points[:, 1] - plane_point[1]) / v[1] + position_plane = np.array([x_dash, y_dash]).T + + return position_plane + + +def get_pairwise_distances(positions_plane, as_matrix=False): + """ + Get the perpendicular distance between each muon pair. + + Parameters + ---------- + positions_plane : np.array + The 2d position of each muon in a plane, shape (n_muons, 2). + as_matrix : bool + Return the whole 2D distance matrix. + + Returns + ------- + np.array + The distances between each pair of muons. + 1D if as_matrix is False (default), else 2D. + + """ + pos_x, pos_y = positions_plane[:, 0], positions_plane[:, 1] + + dists_x = np.expand_dims(pos_x, -2) - np.expand_dims(pos_x, -1) + dists_y = np.expand_dims(pos_y, -2) - np.expand_dims(pos_y, -1) + l2_dists = np.sqrt(dists_x**2 + dists_y**2) + if as_matrix: + return l2_dists + else: + return l2_dists[np.triu_indices_from(l2_dists, k=1)] + + +def get_mchits_per_muon(blob, inactive_du=None): + """ + For each muon in McTracks, get the number of McHits. + + Parameters + ---------- + blob + The blob. + inactive_du : int, optional + McHits in this DU will not be counted. + + Returns + ------- + np.array + n_mchits, len = number of muons --> blob["McTracks"]["id"] + + """ + ids = blob["McTracks"]["id"] + # Origin of each mchit (as int) in the active line + origin = blob["McHits"]["origin"] + if inactive_du: + # only hits in active line + origin = origin[blob["McHits"]["du"] != inactive_du] + # get how many mchits were produced per muon in the bundle + origin_dict = dict(zip(*np.unique(origin, return_counts=True))) + return np.array([origin_dict.get(i, 0) for i in ids]) + + +def get_mc_index(aanet_filename): + # e.g. mcv5.40.mupage_10G.sirene.jterbr00005782.jorcarec.aanet.365.h5 + return int(aanet_filename.split(".")[-2]) diff --git a/orcasong/extractors.py b/orcasong/extractors/neutrino_chain.py similarity index 100% rename from orcasong/extractors.py rename to orcasong/extractors/neutrino_chain.py diff --git a/orcasong/from_toml.py b/orcasong/from_toml.py index 8f795a93a8d42ea961cd1d8e53948094c611a1dc..0bf6e723dd204b1b11299fd5eabe2a08f1101206 100644 --- a/orcasong/from_toml.py +++ b/orcasong/from_toml.py @@ -9,8 +9,8 @@ EXTRACTORS = { "nu_chain_muon": extractors.get_muon_mc_info_extr, "nu_chain_noise": extractors.get_random_noise_mc_info_extr, "nu_chain_data": extractors.get_real_data_info_extr, - # TODO "bundle_mc": ???, - # TODO "bundle_data": ???, + "bundle_mc": extractors.bundles.BundleMCExtractor, + "bundle_data": extractors.bundles.BundleDataExtractor, } MODES = { diff --git a/orcasong/modules.py b/orcasong/modules.py index 4cf4f4d4cd646ba917d078a4afa5ad31dc4c8b96..30d8f0c07b173f49e9a058560a5ad83b49b1aba9 100644 --- a/orcasong/modules.py +++ b/orcasong/modules.py @@ -28,9 +28,12 @@ class McInfoMaker(kp.Module): self.extractor = self.require('extractor') self.store_as = self.require('store_as') self.to_float64 = self.get("to_float64", default=True) + self.sort_y = self.get("sort_y", default=True) def process(self, blob): track = self.extractor(blob) + if self.sort_y: + track = {k: track[k] for k in sorted(track)} if self.to_float64: dtypes = [] for key, v in track.items(): diff --git a/tests/test_extractor_bundle.py b/tests/test_extractor_bundle.py new file mode 100644 index 0000000000000000000000000000000000000000..58daf8e41b17683cfdb35f35435596f4e0080229 --- /dev/null +++ b/tests/test_extractor_bundle.py @@ -0,0 +1,105 @@ +from unittest import TestCase +import numpy as np +import orcasong.extractors.bundles as bundles + + +class TestGetPlanePositions(TestCase): + def setUp(self) -> None: + self.positions = np.array([ + [0, 0, 0], + [0, 0, 5], + [5, 3, 2], + [0, 0, 2], + ]) + self.directions = np.array([ + [0, 0, 1], + [0, 0, 1], + [0, 0, 1], + [0, 1, -1], + ]) + + def test_positions_flat_plane(self): + plane_point = np.array([0, 0, 0]) + plane_normal = np.array([0, 0, 1]) + result = bundles.get_plane_positions( + self.positions, self.directions, plane_point, plane_normal, + ) + target = np.array([ + [0, 0], + [0, 0], + [5, 3], + [0, 2], + ]) + np.testing.assert_array_equal(result, target) + + def test_positions_flat_plane_no_plane_normal(self): + plane_point = np.array([0, 0, 0]) + with self.assertRaises(ValueError): + bundles.get_plane_positions( + self.positions, self.directions, plane_point, + ) + + def test_positions_shifted_plane(self): + plane_point = np.array([1, 0, 0]) + plane_normal = np.array([0, 0, 1]) + result = bundles.get_plane_positions( + self.positions, self.directions, plane_point, plane_normal, + ) + target = np.array([ + [-1, 0], + [-1, 0], + [4, 3], + [-1, 2], + ]) + np.testing.assert_array_equal(result, target) + + def test_positions_angled_plane(self): + plane_point = np.array([0, 0, 0]) + plane_normal = np.array([1, 0, 1]) + result = bundles.get_plane_positions( + self.positions, self.directions, plane_point, plane_normal, + ) + target = np.array([ + [0, 0], + [0, 0], + [np.sqrt(50), 3], + [0, 2], + ]) + np.testing.assert_array_equal(result, target) + + def test_positions_angled_plane_2(self): + result = bundles.get_plane_positions( + positions=np.array([[-1, 0, 1]]), + directions=np.array([[1, 0, -1]]), + plane_point=np.array([0, 0, 0]), + plane_normal=np.array([-1, 0, 1]), + ) + np.testing.assert_array_equal(result, np.array([[0, 0]])) + + +class TestPairwiseDistances(TestCase): + def setUp(self) -> None: + self.positions_plane = np.array([ + [0, -3], + [4, 0], + [0, 3], + ]) + + def test_matrix(self): + result = bundles.get_pairwise_distances( + positions_plane=self.positions_plane, + as_matrix=True, + ) + target = np.array([ + [0, 5, 6], + [5, 0, 5], + [6, 5, 0], + ]) + np.testing.assert_array_equal(result, target) + + def test_flat(self): + result = bundles.get_pairwise_distances( + positions_plane=self.positions_plane, + ) + target = np.array([5, 6, 5]) + np.testing.assert_array_equal(result, target)