diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5a77c7a03c352e153c5820a926e5e9297e7cc6eb..c881c2e3f2c92fc55249f7b4749d4f0c00b5b313 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,6 @@ Unreleased changes ------------------ - +* update of reco data from offline files Version 0 --------- @@ -44,7 +44,7 @@ Version 0 0.3.0 / 2019-11-19 ~~~~~~~~~~~~~~~~~~~ * Preliminary Jpp timeslice reader prototype -* Updated ``AaetReader`` +* Updated ``AanetReader`` * Updated docs 0.2.1 / 2019-11-15 diff --git a/README.rst b/README.rst index 1653a02523cbed1d6f6c5e646baf54493c50c00a..5c74862bc8900a86c58872552b9fa37693a5a5c2 100644 --- a/README.rst +++ b/README.rst @@ -645,6 +645,90 @@ to get a specific value from track 0 in event 0, let's say for example the likli >>>r[0].tracks[0].lik 294.6407542676734 +to get the reconstruction parameters, first take a look at the available reconstruction keys: + +.. code-block:: python3 + + >>>r.best_reco.dtype.names + ['JGANDALF_BETA0_RAD', + 'JGANDALF_BETA1_RAD', + 'JGANDALF_CHI2', + 'JGANDALF_NUMBER_OF_HITS', + 'JENERGY_ENERGY', + 'JENERGY_CHI2', + 'JGANDALF_LAMBDA', + 'JGANDALF_NUMBER_OF_ITERATIONS', + 'JSTART_NPE_MIP', + 'JSTART_NPE_MIP_TOTAL', + 'JSTART_LENGTH_METRES', + 'JVETO_NPE', + 'JVETO_NUMBER_OF_HITS', + 'JENERGY_MUON_RANGE_METRES', + 'JENERGY_NOISE_LIKELIHOOD', + 'JENERGY_NDF', + 'JENERGY_NUMBER_OF_HITS'] + +the keys above can also be accessed with a tab completion: + +.. image:: https://git.km3net.de/km3py/km3io/raw/master/examples/pictures/reco.png + +to get a numpy `recarray <https://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html>`__ of all fit data of the best reconstructed track: + +.. code-block:: python3 + + >>>r.best_reco + +to get an array of a parameter of interest, let's say `'JENERGY_ENERGY'`: + +.. code-block:: python3 + + >>>r.best_reco['JENERGY_ENERGY'] + array([1141.87137899, 4708.16378575, 499.7243005 , 103.54680875, + 208.6103912 , 1336.52338666, 998.87632267, 1206.54345674, + 16.28973662]) + +**Note**: In km3io, the best fit is defined as the track fit with the maximum reconstruction stages. When "nan" is returned, it means that the reconstruction parameter of interest is not found. for example, in the case of muon simulations: if `[1, 2]` are the reconstruction stages, then only the fit parameters corresponding to the stages `[1, 2]` are found in the Offline files, the remaining fit parameters corresponding to the stages `[3, 4, 5]` are all filled with nan. + +to get a numpy recarray of the fit data of tracks with specific reconstruction stages, let's say `[1, 2, 3, 4, 5]` in the case of a muon track reconstruction: + +.. code-block:: python3 + + >>>r.get_reco_fit([1, 2, 3, 4, 5]) + +again, to get the reconstruction parameters names: + +.. code-block:: python3 + + >>>r.get_reco_fit([1, 2, 3, 4, 5]).dtype.names + ('JGANDALF_BETA0_RAD', + 'JGANDALF_BETA1_RAD', + 'JGANDALF_CHI2', + 'JGANDALF_NUMBER_OF_HITS', + 'JENERGY_ENERGY', + 'JENERGY_CHI2', + 'JGANDALF_LAMBDA', + 'JGANDALF_NUMBER_OF_ITERATIONS', + 'JSTART_NPE_MIP', + 'JSTART_NPE_MIP_TOTAL', + 'JSTART_LENGTH_METRES', + 'JVETO_NPE', + 'JVETO_NUMBER_OF_HITS', + 'JENERGY_MUON_RANGE_METRES', + 'JENERGY_NOISE_LIKELIHOOD', + 'JENERGY_NDF', + 'JENERGY_NUMBER_OF_HITS') + +to get the reconstruction data of interest, for example ['JENERGY_ENERGY']: + +.. code-block:: python3 + + >>>r.get_reco_fit([1, 2, 3, 4, 5])['JENERGY_ENERGY'] + array([1141.87137899, 4708.16378575, 499.7243005 , 103.54680875, + 208.6103912 , 1336.52338666, 998.87632267, 1206.54345674, + 16.28973662]) + +**Note**: When the reconstruction stages of interest are not found in all your data file, an error is raised. + reading mc hits data """""""""""""""""""" diff --git a/examples/pictures/reco.png b/examples/pictures/reco.png new file mode 100644 index 0000000000000000000000000000000000000000..ce5fa3c6ebef1d541d93ef5a8cf563e024c9d9b7 Binary files /dev/null and b/examples/pictures/reco.png differ diff --git a/km3io/offline.py b/km3io/offline.py index 8902bc929e3c21ad35838d23b8651933e8f2ec7e..335342a51efff565099863c7243724b317465356 100644 --- a/km3io/offline.py +++ b/km3io/offline.py @@ -1,4 +1,5 @@ import uproot +import numpy as np import warnings # 110 MB based on the size of the largest basket found so far in km3net @@ -179,8 +180,7 @@ class OfflineKeys: 'JSTART_NPE_MIP', 'JSTART_NPE_MIP_TOTAL', 'JSTART_LENGTH_METRES', 'JVETO_NPE', 'JVETO_NUMBER_OF_HITS', 'JENERGY_MUON_RANGE_METRES', 'JENERGY_NOISE_LIKELIHOOD', - 'JENERGY_NDF', 'JENERGY_NUMBER_OF_HITS', 'JCOPY_Z_M' - ] + 'JENERGY_NDF', 'JENERGY_NUMBER_OF_HITS', 'JCOPY_Z_M'] return self._fit_keys @property @@ -320,6 +320,7 @@ class OfflineReader: self._mc_hits = None self._mc_tracks = None self._keys = None + self._best_reco = None self._header = None def __getitem__(self, item): @@ -431,6 +432,157 @@ class OfflineReader: [self._data[key] for key in self.keys.mc_tracks_keys]) return self._mc_tracks + @property + def best_reco(self): + """returns the best reconstructed track fit data. The best fit is defined + as the track fit with the maximum reconstruction stages. When "nan" is + returned, it means that the reconstruction parameter of interest is not + found. for example, in the case of muon simulations: if [1, 2] are the + reconstruction stages, then only the fit parameters corresponding to the + stages [1, 2] are found in the Offline files, the remaining fit parameters + corresponding to the stages 3, 4, 5 are all filled with nan. + + Returns + ------- + numpy recarray + a recarray of the best track fit data (reconstruction data). + """ + if self._best_reco is None: + keys = ", ".join(self.keys.fit_keys[:-1]) + empty_fit_info = np.array([match for match in + self._find_empty(self.tracks.fitinf)]) + fit_info = [i for i,j in zip(self.tracks.fitinf, + empty_fit_info[:,1]) if j is not None] + stages = self._get_max_reco_stages(self.tracks.rec_stages) + fit_data = np.array([i[j] for i,j in zip(fit_info, stages[:,2])]) + rows_size = len(max(fit_data, key=len)) + equal_size_data = np.vstack([np.hstack([i, np.zeros(rows_size-len(i)) + + np.nan]) for i in fit_data]) + self._best_reco = np.core.records.fromarrays(equal_size_data.transpose(), + names=keys) + return self._best_reco + + def _get_max_reco_stages(self, reco_stages): + """find the longest reconstructed track based on the maximum size of + reconstructed stages. + + Parameters + ---------- + reco_stages : chunked array + chunked array of all the reconstruction stages of all tracks. + In km3io, it is accessed with + km3io.OfflineReader(my_file).tracks.rec_stages . + + Returns + ------- + numpy array + array with 3 columns: *list of the maximum reco_stages + *lentgh of the maximum reco_stages + *position of the maximum reco_stages + """ + empty_reco_stages = np.array([match for match in + self._find_empty(reco_stages)]) + max_reco_stages = np.array([[max(i, key=len), len(max(i, key=len)), + i.index(max(i, key=len))] for i,j in + zip(reco_stages, empty_reco_stages[:,1]) + if j is not None]) + return max_reco_stages + + def get_reco_fit(self, stages): + """construct a numpy recarray of the fit information (reconstruction + data) of the tracks reconstructed following the reconstruction stages + of interest. + + Parameters + ---------- + stages : list + list of reconstruction stages of interest. for example + [1, 2, 3, 4, 5]. + + Returns + ------- + numpy recarray + a recarray of the fit information (reconstruction data) of + the tracks of interest. + + Raises + ------ + ValueError + ValueError raised when the reconstruction stages of interest + are not found in the file. + """ + keys = ", ".join(self.keys.fit_keys[:-1]) + fit_info = self.tracks.fitinf + rec_stages = np.array([match for match in + self._find_rec_stages(stages)]) + if np.all(rec_stages[:,1]==None): + raise ValueError("The stages {} are not found in your file." + .format(str(stages))) + else: + fit_data = np.array([i[k] for i,j,k in zip(fit_info, + rec_stages[:,0], rec_stages[:,1]) + if k is not None]) + rec_array = np.core.records.fromarrays(fit_data.transpose(), + names=keys) + return rec_array + + def _find_rec_stages(self, stages): + """find the index of reconstruction stages of interest in a + list of multiple reconstruction stages. + + Parameters + ---------- + stages : list + list of reconstruction stages of interest. for example + [1, 2, 3, 4, 5]. + + Yieldsma + ------ + generator + the track id and the index of the reconstruction stages of + interest if found. If the reconstruction stages of interest + are not found, None is returned as the stages index. + """ + for trk_index, rec_stages in enumerate(self.tracks.rec_stages): + try: + stages_index = rec_stages.index(stages) + except ValueError: + stages_index = None + yield trk_index, stages_index + continue + + yield trk_index, stages_index + + def _find_empty(self, array): + """finds empty lists/arrays in an awkward array + + Parameters + ---------- + array : awkward array + Awkward array of data of interest. For example: + km3io.OfflineReader(my_file).tracks.fitinf . + + Yields + ------ + generator + the empty list id and the index of the empty list. When + data structure (list) is simply empty, None is written in the + corresponding index. However, when data structure (list) is not + empty and does not contain an empty list, then False is written in the + corresponding index. + """ + for i, rs in enumerate(array): + try: + if len(rs)==0: + j = None + if len(rs)!=0: + j = rs.index([]) + except ValueError: + j = False # rs not empty but [] not found + yield i, j + continue + yield i, j + class OfflineEvents: """wrapper for offline events""" diff --git a/notebooks/OfflineReader_tutorial.ipynb b/notebooks/OfflineReader_tutorial.ipynb index c1df4f229672c4736ee59aa49721a9b0fa1aa2ca..2469cd0d51c3ee01b10ffcee88fad753beb15717 100644 --- a/notebooks/OfflineReader_tutorial.ipynb +++ b/notebooks/OfflineReader_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ { "data": { "text/plain": [ - "<km3io.aanet.OfflineReader at 0x7f87bd4ce310>" + "<km3io.offline.OfflineReader at 0x7fb60c2dc590>" ] }, "execution_count": 2, @@ -245,7 +245,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [1 2 3 ... 8 9 10] at 0x7f87906e86d0>" + "<ChunkedArray [1 2 3 ... 8 9 10] at 0x7fb5da350b50>" ] }, "execution_count": 6, @@ -265,7 +265,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [44 44 44 ... 44 44 44] at 0x7f87906e87d0>" + "<ChunkedArray [44 44 44 ... 44 44 44] at 0x7fb5da3506d0>" ] }, "execution_count": 7, @@ -285,7 +285,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [182 183 202 ... 185 185 204] at 0x7f87906e8c10>" + "<ChunkedArray [182 183 202 ... 185 185 204] at 0x7fb5da350950>" ] }, "execution_count": 8, @@ -305,7 +305,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [176 125 318 ... 84 255 105] at 0x7f87906f9210>" + "<ChunkedArray [176 125 318 ... 84 255 105] at 0x7fb5da350350>" ] }, "execution_count": 9, @@ -332,7 +332,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [5.170483995038151 4.8283137373023015 5.762051382780177 ... 4.430816798843313 5.541263545158426 4.653960350157523] at 0x7f879052d4d0>" + "<ChunkedArray [5.170483995038151 4.8283137373023015 5.762051382780177 ... 4.430816798843313 5.541263545158426 4.653960350157523] at 0x7fb60fcdd2d0>" ] }, "execution_count": 10, @@ -449,7 +449,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [[806451572 806451572 806451572 ... 809544061 809544061 809544061] [806451572 806451572 806451572 ... 809524432 809526097 809544061] [806451572 806451572 806451572 ... 809544061 809544061 809544061] ... [806451572 806455814 806465101 ... 809526097 809544058 809544061] [806455814 806455814 806455814 ... 809544061 809544061 809544061] [806455814 806455814 806455814 ... 809544058 809544058 809544061]] at 0x7f87906f9450>" + "<ChunkedArray [[806451572 806451572 806451572 ... 809544061 809544061 809544061] [806451572 806451572 806451572 ... 809524432 809526097 809544061] [806451572 806451572 806451572 ... 809544061 809544061 809544061] ... [806451572 806455814 806465101 ... 809526097 809544058 809544061] [806455814 806455814 806455814 ... 809544061 809544061 809544061] [806455814 806455814 806455814 ... 809544058 809544058 809544061]] at 0x7fb5da350fd0>" ] }, "execution_count": 14, @@ -469,7 +469,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [[24 30 22 ... 38 26 23] [29 26 22 ... 26 28 24] [27 19 13 ... 27 24 16] ... [22 22 9 ... 27 32 27] [30 32 17 ... 30 24 29] [27 41 36 ... 29 24 28]] at 0x7f87906f9810>" + "<ChunkedArray [[24 30 22 ... 38 26 23] [29 26 22 ... 26 28 24] [27 19 13 ... 27 24 16] ... [22 22 9 ... 27 32 27] [30 32 17 ... 30 24 29] [27 41 36 ... 29 24 28]] at 0x7fb5da3a3310>" ] }, "execution_count": 15, @@ -645,7 +645,7 @@ { "data": { "text/plain": [ - "<ChunkedArray [[294.6407542676734 294.6407542676734 294.6407542676734 ... 67.81221253265059 67.7756405143316 67.77250505700384] [96.75133289411137 96.75133289411137 96.75133289411137 ... 39.21916536442286 39.184645826013806 38.870325146341884] [560.2775306614813 560.2775306614813 560.2775306614813 ... 118.88577278801066 118.72271313687405 117.80785995187605] ... [71.03251451148226 71.03251451148226 71.03251451148226 ... 16.714140573909347 16.444395245214945 16.34639241716669] [326.440133294878 326.440133294878 326.440133294878 ... 87.79818671079849 87.75488082571873 87.74839444768625] [159.77779654216795 159.77779654216795 159.77779654216795 ... 33.8669134999348 33.821631538334984 33.77240735670646]] at 0x7f8790683d50>" + "<ChunkedArray [[294.6407542676734 294.6407542676734 294.6407542676734 ... 67.81221253265059 67.7756405143316 67.77250505700384] [96.75133289411137 96.75133289411137 96.75133289411137 ... 39.21916536442286 39.184645826013806 38.870325146341884] [560.2775306614813 560.2775306614813 560.2775306614813 ... 118.88577278801066 118.72271313687405 117.80785995187605] ... [71.03251451148226 71.03251451148226 71.03251451148226 ... 16.714140573909347 16.444395245214945 16.34639241716669] [326.440133294878 326.440133294878 326.440133294878 ... 87.79818671079849 87.75488082571873 87.74839444768625] [159.77779654216795 159.77779654216795 159.77779654216795 ... 33.8669134999348 33.821631538334984 33.77240735670646]] at 0x7fb5da302b50>" ] }, "execution_count": 21, @@ -788,6 +788,63 @@ "r[0].tracks[0].lik" ] }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('JGANDALF_BETA0_RAD',\n", + " 'JGANDALF_BETA1_RAD',\n", + " 'JGANDALF_CHI2',\n", + " 'JGANDALF_NUMBER_OF_HITS',\n", + " 'JENERGY_ENERGY',\n", + " 'JENERGY_CHI2',\n", + " 'JGANDALF_LAMBDA',\n", + " 'JGANDALF_NUMBER_OF_ITERATIONS',\n", + " 'JSTART_NPE_MIP',\n", + " 'JSTART_NPE_MIP_TOTAL',\n", + " 'JSTART_LENGTH_METRES',\n", + " 'JVETO_NPE',\n", + " 'JVETO_NUMBER_OF_HITS',\n", + " 'JENERGY_MUON_RANGE_METRES',\n", + " 'JENERGY_NOISE_LIKELIHOOD',\n", + " 'JENERGY_NDF',\n", + " 'JENERGY_NUMBER_OF_HITS')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r.tracks.reco.dtype.names" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([99.10458562, 99.10458562, 99.10458562, 37.85515249, 99.10458562,\n", + " 7.16916787, 99.10458562, 99.10458562, 49.13672986, 20.35137468])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r.tracks.reco[\"JENERGY_ENERGY\"]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -797,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -806,7 +863,7 @@ "<OfflineHits: 10 parsed elements>" ] }, - "execution_count": 26, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -824,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -833,7 +890,7 @@ "<OfflineTracks: 10 parsed elements>" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -842,13 +899,6 @@ "r.mc_tracks" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/tests/test_offline.py b/tests/test_offline.py index 185008bcbf983b788c10a283cad2d2c60c1123de..fa708e28015004bf9f9720a4560baae9574fa39f 100644 --- a/tests/test_offline.py +++ b/tests/test_offline.py @@ -1,4 +1,5 @@ import unittest +import numpy as np from pathlib import Path from km3io.offline import Reader, OfflineEvents, OfflineHits, OfflineTracks @@ -115,6 +116,7 @@ class TestReader(unittest.TestCase): class TestOfflineReader(unittest.TestCase): def setUp(self): self.r = OfflineReader(OFFLINE_FILE) + self.nu = OfflineReader(OFFLINE_NUMUCC) self.Nevents = 10 self.selected_data = OfflineReader(OFFLINE_FILE, data=self.r._data[0])._data @@ -132,6 +134,56 @@ class TestOfflineReader(unittest.TestCase): # check that there are 10 events self.assertEqual(Nevents, self.Nevents) + def test_find_empty(self): + fitinf = self.nu.tracks.fitinf + rec_stages = self.nu.tracks.rec_stages + + empty_fitinf = np.array([match for match in + self.nu._find_empty(fitinf)]) + empty_stages = np.array([match for match in + self.nu._find_empty(rec_stages)]) + + self.assertListEqual(empty_fitinf[:5, 1].tolist(), [23, 14, + 14, 4, None]) + self.assertListEqual(empty_stages[:5, 1].tolist(), [False, False, + False, False, + None]) + + def test_find_rec_stages(self): + stages = np.array([match for match in + self.nu._find_rec_stages([1, 2, 3, 4, 5])]) + + self.assertListEqual(stages[:5, 1].tolist(), [0, 0, 0, 0, None]) + + def test_get_reco_fit(self): + JGANDALF_BETA0_RAD = [0.0020367251782607574, + 0.003306725805622178, + 0.0057877124222254885, + 0.015581698352185896] + reco_fit = self.nu.get_reco_fit([1, 2, 3, 4, 5])['JGANDALF_BETA0_RAD'] + + self.assertListEqual(JGANDALF_BETA0_RAD, reco_fit[:4].tolist()) + with self.assertRaises(ValueError): + self.nu.get_reco_fit([1000, 4512, 5625]) + + def test_get_max_reco_stages(self): + rec_stages = self.nu.tracks.rec_stages + max_reco = self.nu._get_max_reco_stages(rec_stages) + + self.assertEqual(len(max_reco.tolist()), 9) + self.assertListEqual(max_reco[0].tolist(), [[1, 2, 3, 4, 5], + 5, 0]) + + def test_best_reco(self): + JGANDALF_BETA1_RAD = [0.0014177681261476852, + 0.002094094517471032, + 0.003923368624980349, + 0.009491461076780453] + best = self.nu.best_reco + + self.assertEqual(best.size, 9) + self.assertEqual(best['JGANDALF_BETA1_RAD'][:4].tolist(), JGANDALF_BETA1_RAD) + def test_reading_header(self): # head is the supported format head = OfflineReader(OFFLINE_NUMUCC).header