diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02a4137f70f518fb3dbe870c0096d3c58a61158b..bd50318ea4557faa33a524b093bdab14c7eea2a9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,34 +1,76 @@ +variables: + PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" + + stages: - - install + - test + - coverage - doc - release -before_script: - - apt-get update -qq && apt-get install -y -qq libhdf5-dev - - pip install -U pip setuptools wheel numpy - - pip install . -install-os: - image: docker.km3net.de/base/python:3 - stage: install +cache: + paths: + - .cache/pip + - venv/ + key: "$CI_COMMIT_REF_SLUG" + + +.virtualenv_template: &virtualenv_definition | + python -V + pip install virtualenv + virtualenv venv + source venv/bin/activate + make install + + +test: + image: docker.km3net.de/base/python:3.6 + stage: test script: - - pip install . + - *virtualenv_definition + - make test + + +coverage: + image: docker.km3net.de/base/python:3.6 + stage: coverage + script: + - *virtualenv_definition + - "make test-cov|grep TOTAL| awk '{printf \"COVERAGE: %.2f%%\", (1-$3/$2)*100 }'" + coverage: '/COVERAGE:\s*([0-9]*\.[0-9]*%)/' + artifacts: + paths: + - reports/coverage + + +code-style: + image: docker.km3net.de/base/python:3.7 + stage: test + script: + - *virtualenv_definition + - yapf -r -d -e "venv" . + allow_failure: true + pages: - image: docker.km3net.de/base/python:3 + image: docker.km3net.de/base/python:3.6 stage: doc script: + - *virtualenv_definition - cd docs && make html - mv _build/html/ ../public/ + - cd .. 
&& mv reports/coverage public/coverage artifacts: paths: - public + cache: {} only: - tags - master pypi: - image: docker.km3net.de/base/python:3 + image: docker.km3net.de/base/python:3.6 stage: release cache: {} script: @@ -37,4 +79,3 @@ pypi: - twine upload dist/* only: - tags - diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7fa617367dc4c586634e165f55fdc8404c19e3f6 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +PKGNAME=orcasong +ALLNAMES = $(PKGNAME) +ALLNAMES += orcasong_contrib + +default: build + +all: install + +build: + @echo "No need to build anymore :)" + +install: + pip install . + +install-dev: + pip install -e . + +clean: + python setup.py clean --all + rm -f -r build/ + +test: + py.test --junitxml=./reports/junit.xml -o junit_suite_name=$(PKGNAME) tests + +test-cov: + py.test tests --cov $(ALLNAMES) --cov-report term-missing --cov-report xml:reports/coverage.xml --cov-report html:reports/coverage tests + +flake8: + py.test --flake8 + +pep8: flake8 + +docstyle: + py.test --docstyle + +lint: + py.test --pylint + +dependencies: + pip install -Ur requirements.txt + +.PHONY: yapf +yapf: + yapf -i -r $(PKGNAME) + yapf -i setup.py + +.PHONY: all clean build install install-dev test test-nocov flake8 pep8 dependencies docstyle diff --git a/Readme.md b/Readme.rst similarity index 61% rename from Readme.md rename to Readme.rst index 8ca83fb7d627ded1cfd180be0ed5eb1732f3a711..c3651fadc5595e96a79d67155552339d7ad3b40b 100644 --- a/Readme.md +++ b/Readme.rst @@ -1,13 +1,12 @@ -## OrcaSong: Generating DL images based on KM3NeT data +OrcaSong: Generating DL images from KM3NeT data +=============================================== -[![alt text][image_1]][hyperlink_1] [![alt text][image_2]][hyperlink_2] +.. 
image:: https://git.km3net.de/ml/OrcaSong/badges/master/build.svg + :target: https://git.km3net.de/ml/OrcaSong/pipelines - [hyperlink_1]: https://git.km3net.de/ml/OrcaSong/pipelines - [image_1]: https://git.km3net.de/ml/OrcaSong/badges/master/build.svg +.. image:: https://examples.pages.km3net.de/km3badges/docs-latest-brightgreen.svg + :target: https://ml.pages.km3net.de/OrcaSong - [hyperlink_2]: https://ml.pages.km3net.de/OrcaSong - [image_2]: https://examples.pages.km3net.de/km3badges/docs-latest-brightgreen.svg - The documentation for OrcaSong can be found at https://ml.pages.km3net.de/OrcaSong! @@ -19,3 +18,8 @@ This means that OrcaSong takes a datafile with (neutrino-) events and based on t Currently, only simulations with a hdf5 data format are supported as an input. These event 'images' are required for some Deep Learning machine learning algorithms, e.g. Convolutional Neural Networks. + +OrcaSong can be installed via pip by running:: + + pip install orcasong + diff --git a/docs/CONTRIBUTING.rst b/docs/CONTRIBUTING.rst index 229cbca232c76a659c5a116a0acb12f18954ada4..e11fd4b43062cdb600aa097e7511564ec7d6ca2a 100644 --- a/docs/CONTRIBUTING.rst +++ b/docs/CONTRIBUTING.rst @@ -24,7 +24,7 @@ necessary information which will help other people to understand the situation. Make a Fork of OrcaSong ----------------------- +----------------------- You create a fork (your full own copy of the repository), change the code and when you are happy with the changes, you create diff --git a/docs/conf.py b/docs/conf.py index 6ce6f491a0d3f50f2c5911d910028222e7e90907..45be6ae2e83a274a439f533ea8c48f4ca7ce5104 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,20 +12,18 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -import os -import sys from datetime import date from pkg_resources import get_distribution import orcasong -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = "OrcaSong {}".format(orcasong.__version__) -copyright = u'{0}, Michael Moser'.format(date.today().year) -author = 'Michael Moser' +copyright = u'{0}, Stefan Reck, Michael Moser'.format(date.today().year) +author = 'Stefan Reck, Michael Moser' # The full version, including alpha/beta/rc tags release = get_distribution('orcasong').version @@ -33,7 +31,6 @@ release = get_distribution('orcasong').version version = '.'.join(release.split('.')[:2]) - # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. @@ -57,7 +54,7 @@ autosummary_generate = True # Document Python Code autoapi_type = 'python' -autoapi_dirs = ['../orcasong', '../orcasong_contrib', '../orcasong_2'] +autoapi_dirs = ['../orcasong', '../orcasong_contrib'] autoapi_options = [ 'members', 'undoc-members' # , 'private-members', 'special-members' @@ -73,9 +70,9 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -#source_parsers = { +# source_parsers = { # '.md': 'recommonmark.parser.CommonMarkParser',} -#source_suffix = ['.rst', '.md'] +# source_suffix = ['.rst', '.md'] source_suffix = ['.rst'] # The master toctree document. @@ -169,7 +166,7 @@ latex_elements = { # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'OrcaSong.tex', 'OrcaSong Documentation', - 'Michael Moser', 'manual'), + 'Stefan Reck, Michael Moser', 'manual'), ] @@ -219,5 +216,7 @@ epub_exclude_files = ['search.html'] # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = True + + def setup(app): - app.add_stylesheet('_static/style.css') \ No newline at end of file + app.add_stylesheet('_static/style.css') diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 5b3cb1fdaae9b2b6350187ecb1e7fea524b883f4..0aaea684d421df804734dded041dec1f6192ef80 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,13 +1,15 @@ -Getting started with OrcaSong -============================= +Getting started +=============== .. contents:: :local: Introduction ------------ -On this page, you can find a step by step introduction into the usage of OrcaSong. -The guide starts with some exemplary root simulation files made with jpp and ends with hdf5 event 'images' that can be used for deep neural networks. +On this page, you can find a step by step introduction of how to prepare +root files for OrcaSong. +The guide starts with some exemplary root simulation files made with jpp and +ends with hdf5 files ready for the use with OrcaSong. Preprocessing ------------- @@ -120,117 +122,22 @@ channel_id of a hit. Calibrating the .h5 file ~~~~~~~~~~~~~~~~~~~~~~~~ -In order to fix this, we can run another tool, :code:`calibrate`, that will add the pos_xyz information to the hdf5 datafile:: +In order to fix this, the data needs to be calibrated. +This can be done in two ways: You can either: + +- calibrate the files on the fly by providing the detx file to orcasong (recommended), +- or use a seperate tool from km3pipe called :code:`calibrate`, that will add the pos_xyz information to the hdf5 datafile. + +While the first method is the recommended one in principal, the second one can be useful for determining the proper bin edges by looking +at single files. It can be used like this:: calibrate /sps/km3net/users/mmoser/det_files/orca_115strings_av23min20mhorizontal_18OMs_alt9mvertical_v1.detx testfile.h5 As you can see, you need a .detx geometry file for this "calibration". 
Typically, you can find the path of this detx -file on the wiki page of the simulation production that you are using. This calibration step is optional, since OrcaSong -can also do it on the fly, using a .detx file. +file on the wiki page of the simulation production that you are using. At this point, we are now ready to start using OrcaSong for the generation of event images. - - -Usage of OrcaSong ------------------ - -In order to use OrcaSong, you can just install it with :code:`pip`:: - - ~/$: pip install orcasong - -Before you can start to use OrcaSong, you need a .detx detector geometry file that corresponds to your input files. -OrcaSong is currently producing event "images" based on a 1 DOM / XYZ-bin assumption. This image generation is done -automatically, based on the number of bins (n_bins) for each dimension XYZ that you supply as an input and based on the -.detx file which contains the DOM positions. - -If your .detx file is not contained in the OrcaSong/detx_files folder, please add it to the repository! -Currently, only the 115l ORCA 2016 detx file is available. - -At this point, you're finally ready to use OrcaSong. -OrcaSong can be called from every directory by using the :code:`make_nn_images` command:: - - ~/$: make_nn_images testfile.h5 geofile.detx configfile.toml - -OrcaSong will then generate a hdf5 file with images that will be put in a "Results" folder at the path that -you've specified in the configfile current path. -Please checkout the default_config.toml file in the orcasong folder of the OrcaSong repo in order to get an idea about -the structure of the config files. - -All available configuration options of OrcaSong can be found in /orcasong/default_config:: - - --- Documentation for every config parameter that is available --- - - None arguments should be written as string: 'None' - - Parameters - ---------- - output_dirpath : str - Full path to the directory, where the orcasong output should be stored. 
- chunksize : int - Chunksize (along axis_0) that is used for saving the OrcaSong output to a .h5 file. - complib : str - Compression library that is used for saving the OrcaSong output to a .h5 file. - All PyTables compression filters are available, e.g. 'zlib', 'lzf', 'blosc', ... . - complevel : int - Compression level for the compression filter that is used for saving the OrcaSong output to a .h5 file. - n_bins : tuple of int - Declares the number of bins that should be used for each dimension, e.g. (x,y,z,t). - The option should be written as string, e.g. '11,13,18,60'. - det_geo : str - Declares what detector geometry should be used for the binning. E.g. 'Orca_115l_23m_h_9m_v'. - do2d : bool - Declares if 2D histograms, 'images', should be created. - do2d_plots : bool - Declares if pdf visualizations of the 2D histograms should be created, cannot be called if do2d=False. - do2d_plots_n: int - After how many events the event loop will be stopped (making the 2d plots in do2d_plots takes long time). - do3d : bool - Declares if 3D histograms should be created. - do4d : bool - Declares if 4D histograms should be created. - do4d_mode : str - If do4d is True, what should be used as the 4th dim after xyz. - Currently, only 'time' and 'channel_id' are available. - prod_ident : int - Optional int identifier for the used mc production. - This is e.g. useful, if you use events from two different mc productions, e.g. the 1-5GeV & 3-100GeV Orca 2016 MC. - In this case, the events are not fully distinguishable with only the run_id and the event_id! - In order to keep a separation, an integer can be set in the event_track for all events, such that they stay distinguishable. - timecut_mode : str - Defines what timecut should be used in hits_to_histograms.py. - Currently available: - 'timeslice_relative': Cuts out the central 30% of the snapshot. The value of timecut_timespan doesn't matter in this case. - 'trigger_cluster': Cuts based on the mean of the triggered hits. 
- 'None': No timecut. The value of timecut_timespan doesn't matter in this case. - timecut_timespan : str/None - Defines what timespan should be used if a timecut is applied. Only relevant for timecut_mode = 'trigger_cluster'. - Currently available: - 'all': [-350ns, 850ns] -> 20ns / bin (if e.g. 60 timebins) - 'tight-0': [-450ns, 500ns] -> 15.8ns / bin (if e.g. 60 timebins) - 'tight-1': [-250ns, 500ns] -> 12.5ns / bin (if e.g. 60 timebins) - 'tight-2': [-150ns, 200ns] -> 5.8ns / bin (if e.g. 60 timebins) - do_mc_hits : bool - Declares if hits (False, mc_hits + BG) or mc_hits (True) should be processed. - data_cut_triggered : bool - Cuts away hits that haven't been triggered. - data_cut_e_low : float - Cuts away events that have an energy lower than data_cut_e_low. - data_cut_e_high : float - Cuts away events that have an energy higher than data_cut_e_high. - data_cut_throw_away : float - Cuts away random events with a certain probability (1: 100%, 0: 0%). - flush_freq : int - After how many events the accumulated output should be flushed to the harddisk. - A larger value leads to a faster orcasong execution, but it increases the RAM usage as well. - - --- Documentation for every config parameter that is available --- - - - - - -If anything is still unclear after this introduction just tell me in the deep_learning channel on chat.km3net.de or -write me an email at michael.m.moser@fau.de, such that I can improve this guide! +See the page :ref:`orcasong_page` for instructions on how to use it. 
diff --git a/docs/imgs/orcasong_function.PNG b/docs/imgs/orcasong_function.PNG new file mode 100644 index 0000000000000000000000000000000000000000..a4ff67ee3277a80bfe4080bb8c1c28e713804460 Binary files /dev/null and b/docs/imgs/orcasong_function.PNG differ diff --git a/docs/index.rst b/docs/index.rst index d184789c31ea444155a642c2ba35547bd6afb966..3483d0d32294aa579c11316e44ff8a73a8bb1b8b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,33 +11,20 @@ |vspace| -Welcome to OrcaSong's documentation! -==================================== +.. include:: ../Readme.rst -.. image:: https://git.km3net.de/ml/OrcaSong/badges/master/build.svg - :target: https://git.km3net.de/ml/OrcaSong/pipelines - -| OrcaSong is a part of the Deep Learning efforts for the neutrino telescope KM3NeT. -| Find more information about KM3NeT on http://www.km3net.org. - -In this regard, OrcaSong is a project that produces KM3NeT event images based on the raw detector data. -This means that OrcaSong takes a datafile with (neutrino-) events and based on this data, it produces 2D/3D/4D 'images' (histograms). -Currently, only simulations with a hdf5 data format are supported as an input. -These event 'images' are required for some Deep Learning machine learning algorithms, e.g. Convolutional Neural Networks. - -As of now, only ORCA detector simulations are supported, but ARCA geometries can be easily implemented as well. - -The main code for generating the images is located in orcanet/make_nn_images.py. +.. toctree:: + :hidden: + :titlesonly: -As of now, the documentation contains a small introduction to get started and and a complete API documentation. -Please feel free to contact me or just open an issue on Gitlab / Github if you have any suggestions. + self .. 
toctree:: :maxdepth: 2 :caption: Contents: getting_started - orcasong_2 + orcasong CONTRIBUTING Source (Git) <https://git.km3net.de/ml/OrcaSong.git> diff --git a/docs/orcasong.rst b/docs/orcasong.rst new file mode 100644 index 0000000000000000000000000000000000000000..c2dca73f7476a369b03dc6b67137adc81593b231 --- /dev/null +++ b/docs/orcasong.rst @@ -0,0 +1,100 @@ +.. _orcasong_page: + +Producing images +================ + +The main functionality of OrcaSong is to generate multidimensional images +out of ORCA data. + +.. image:: imgs/orcasong_function.PNG + :height: 400px + +Basic Use +--------- + +Import the main class, the FileBinner (see +:py:class:`orcasong.core.FileBinner`), +like this: + +.. code-block:: python + + from orcasong.core import FileBinner + +The FileBinner allows to make nd histograms ("images") from h5-converted root files. +To do this, you can pass a list defining the binning. E.g., the following would +set up the file binner to generate zt data: + +.. code-block:: python + + bin_edges_list = [ + ["pos_z", np.linspace(0, 200, 11)], + ["time", np.linspace(-50, 550, 101)], + ] + + fb = FileBinner(bin_edges_list) + +Calling the object like this will show you the binning: + +.. code-block:: python + + >>> fb + <FileBinner: ('pos_z', 'time') (10, 100)> + +As you can see, the FileBinner will produce zt data, with 10 and 100 bins, +respectively. +Convert a file like this: + +.. code-block:: python + + fb.run(infile, outfile) + +Or convert multiple files, which will all be saved in the given folder: + +.. code-block:: python + + fb.run_multi(infiles, outfolder) + +Calibration +----------- + +You can supply a detx file to the file binner, in order to +calibrate the data on the fly: + +.. 
code-block:: python + + fb = FileBinner(bin_edges_list, det_file="path/to/det_file.detx") + + +Adding mc_info +-------------- + +To add info from the mc_tracks (or from anywhere in the blob), you can define some +function ``my_mcinfo_extractor`` which takes as an input a km3pipe blob, +and outputs a dict mapping str to float. + +This will be saved as a numpy structured array "y" in the output file, with +the str being the dtype names. Set up like follows: + +.. code-block:: python + + fb = FileBinner(bin_edges_list, mc_info_extr=my_mcinfo_extractor) + + +Plotting binning statistics +--------------------------- + +After the binning has succeeded, you can generate a plot which shows the +distribution of hits among the bins you defined. For this, call the following +console command:: + + plot_binstats my_plotname.pdf file_1_binned.h5 file_2_binned.h5 ... + +This will plot the statistics for the files file_1_binned.h5, file_2_binned.h5, ... +into the file my_plotname.pdf. + +Using existing binnings +----------------------- + +You can use existing bin edges and mc info extractors from ``orcasong.bin_edges`` +and ``orcasong.mc_info_extr``. These were designed for specific detector layouts +and productions, though, and might not work properly when used on other data. diff --git a/docs/orcasong_2.rst b/docs/orcasong_2.rst deleted file mode 100644 index 09a9d5226b4b48a856d82a4a9c22d36a32f5eefc..0000000000000000000000000000000000000000 --- a/docs/orcasong_2.rst +++ /dev/null @@ -1,78 +0,0 @@ -OrcaSong 2 -========== - -OrcaSong 2 is an alternative to orcasong, with (hopefully) more -accessible features. -It has a slightly reduced functionality (no plots), but apart from that -does the same job as orcasong. - -Basic Use ---------- - -Import the main class, the FileBinner (see -:py:class:`orcasong_2.core.FileBinner`), -like this: - -.. 
code-block:: python - - from orcasong_2.core import FileBinner - -The FileBinner allows to make nd histograms ("images") from calibrated and -h5-converted root files. -To do this, you can pass a list defining the binning. E.g., the following would -set up the file binner to generate zt data: - -.. code-block:: python - - bin_edges_list = [ - ["pos_z", np.linspace(0, 10, 11)], - ["time", np.linspace(-50, 550, 101)], - ] - - fb = FileBinner(bin_edges_list) - -Calling the object like this will show you the binning: - -.. code-block:: python - - >>> fb - <FileBinner: ('pos_z', 'time') (10, 100)> - -As you can see, the FileBinner will produce zt data, with 10 and 100 bins, -respectively. -Convert a file like this: - -.. code-block:: python - - fb.run(infile, outfile) - -Or event this for multiple files, which will all be saved in the given folder: - -.. code-block:: python - - fb.run_multi(infiles, outfolder) - -Calibration ------------ -You can supply a detx file to the file binner, in order to -calibrate the data on the fly: - -.. code-block:: python - - fb = FileBinner(bin_edges_list, det_file="path/to/det_file.detx") - - -Adding mc_info --------------- - -To add info from the mc_tracks (or from wherever), you can define some -function `my_mcinfo_extractor` which takes as an input a km3pipe blob, -and outputs a dict mapping str to float. - -This will be saved as a numpy structured array "y" in the output file, with -the str being the dtype names. Set up like follows: - -.. 
code-block:: python - - fb = FileBinner(bin_edges_list, mc_info_extr=my_mcinfo_extractor) - diff --git a/examples/example_do2d_plots_output.pdf b/examples/example_do2d_plots_output.pdf deleted file mode 100644 index 255c56e27176b2bc00e1502c67b44d0d204fab0b..0000000000000000000000000000000000000000 Binary files a/examples/example_do2d_plots_output.pdf and /dev/null differ diff --git a/orcasong/tests/__init__.py b/legacy/__init__.py similarity index 100% rename from orcasong/tests/__init__.py rename to legacy/__init__.py diff --git a/orcasong/default_config.toml b/legacy/default_config.toml similarity index 100% rename from orcasong/default_config.toml rename to legacy/default_config.toml diff --git a/orcasong/file_to_hits.py b/legacy/file_to_hits.py similarity index 100% rename from orcasong/file_to_hits.py rename to legacy/file_to_hits.py diff --git a/orcasong/geo_binning.py b/legacy/geo_binning.py similarity index 100% rename from orcasong/geo_binning.py rename to legacy/geo_binning.py diff --git a/orcasong/hits_to_histograms.py b/legacy/hits_to_histograms.py similarity index 100% rename from orcasong/hits_to_histograms.py rename to legacy/hits_to_histograms.py diff --git a/orcasong/io.py b/legacy/io.py similarity index 100% rename from orcasong/io.py rename to legacy/io.py diff --git a/orcasong/make_nn_images.py b/legacy/make_nn_images.py similarity index 94% rename from orcasong/make_nn_images.py rename to legacy/make_nn_images.py index cca183b6a4cafb0e3f889756013d5f8ef3cfeab4..39cab955ebf70ca918ae930351f38b56df2a85e0 100644 --- a/orcasong/make_nn_images.py +++ b/legacy/make_nn_images.py @@ -34,6 +34,7 @@ __version__ = '1.0' __email__ = 'michael.m.moser@fau.de' __status__ = 'Prototype' +import warnings import os import sys #from memory_profiler import profile # for memory profiling, call with @profile; myfunc() @@ -45,11 +46,16 @@ from docopt import docopt mpl.use('Agg') from matplotlib.backends.backend_pdf import PdfPages -from orcasong.file_to_hits import 
EventDataExtractor -from orcasong.hits_to_histograms import HistogramMaker -from orcasong.io import load_config, check_user_input, make_output_dirs -from orcasong.geo_binning import calculate_bin_edges -from orcasong.utils import get_file_particle_type, EventSkipper +from legacy.file_to_hits import EventDataExtractor +from legacy.hits_to_histograms import HistogramMaker +from legacy.io import load_config, check_user_input, make_output_dirs +from legacy.geo_binning import calculate_bin_edges +from legacy.utils import get_file_particle_type, EventSkipper + + +# TODO deprecated +warnings.warn("The original Orcasong is deprecated, and is no longer supported. " + "Consider switching to the new orcasong.") def parse_input(): diff --git a/orcasong/utils.py b/legacy/utils.py similarity index 100% rename from orcasong/utils.py rename to legacy/utils.py diff --git a/orcasong/__init__.py b/orcasong/__init__.py index c1bb8a7dde64793732725c05f5c7fbea903e21ab..58814dc0620379f8c6e7cf309ec3ec560e9c2c43 100644 --- a/orcasong/__init__.py +++ b/orcasong/__init__.py @@ -1 +1,3 @@ -from .__version__ import version \ No newline at end of file +from .__version__ import version + +__version__ = version diff --git a/orcasong/__version__.py b/orcasong/__version__.py index 6879d8da2ed22c3bb5bdaa208c53f071e660bf1d..ceb23da3a155cc2929fb00e82f7f3b00a2df33eb 100644 --- a/orcasong/__version__.py +++ b/orcasong/__version__.py @@ -19,7 +19,8 @@ try: version = get_version(root='..', relative_to=__file__) except LookupError: try: - with open(join(realpath(dirname(__file__)), "version.txt"), 'r') as fobj: + with open(join(realpath(dirname(__file__)), "version.txt"), + 'r') as fobj: version = fobj.read() except IOError: pass diff --git a/orcasong/bin_edges.py b/orcasong/bin_edges.py new file mode 100644 index 0000000000000000000000000000000000000000..41b828a65e5317c0af42d6529085c3944a871bdc --- /dev/null +++ b/orcasong/bin_edges.py @@ -0,0 +1,29 @@ +""" +Binnings used for some existing detector 
configurations. + +These are made for the specific given runs. They might not be +applicable to other data, and could cause errors or produce unexpected +results when used on data other then the specified. +""" + +import numpy as np + + +def get_edges_2017_ztc(): + """ + Designed for the 2017 runs with the one line detector. + + Will produce (18, 100, 31) 3d data, with dimensions ztc. + + Z binning: 9.45 meters each + Time binning: 6 ns each + Channel id binning: 1 DOM per bin + + """ + bin_edges_list = [ + ["pos_z", np.linspace(26, 198, 18 + 1)], + ["time", np.linspace(-50, 550, 100 + 1)], + ["channel_id", np.linspace(-0.5, 30.5, 31 + 1)], + ] + return bin_edges_list + diff --git a/orcasong/core.py b/orcasong/core.py new file mode 100644 index 0000000000000000000000000000000000000000..b60beb688117bc5b795439e976dff229b59dc496 --- /dev/null +++ b/orcasong/core.py @@ -0,0 +1,276 @@ +import os +import h5py +import km3pipe as kp +import km3modules as km + +import orcasong +import orcasong.modules as modules +import orcasong.plotting.plot_binstats as plot_binstats +from orcasong.mc_info_extr import get_mc_info_extr + + +__author__ = 'Stefan Reck' + + +class FileBinner: + """ + For making binned images and mc_infos, which can be used for conv. nets. + + Can also add statistics of the binning to the h5 files, which can + be plotted to show the distribution of hits among the bins and how + many hits were cut off. + + Attributes + ---------- + n_statusbar : int, optional + Print a statusbar every n blobs. + n_memory_observer : int, optional + Print memory usage every n blobs. + complib : str + Compression library used for saving the output to a .h5 file. + All PyTables compression filters are available, e.g. 'zlib', + 'lzf', 'blosc', ... . + complevel : int + Compression level for the compression filter that is used for + saving the output to a .h5 file. + flush_frequency : int + After how many events the accumulated output should be flushed to + the harddisk. 
+ A larger value leads to a faster orcasong execution, + but it increases the RAM usage as well. + bin_plot_freq : int or None + If int is given, defines after how many blobs data for an overview + histogram is extracted. + It shows the distribution of hits, the bin edges, and how many hits + were cut off for each field name in bin_edges_list. + It will be saved to the same path as the outfile in run. + + """ + def __init__(self, + bin_edges_list, + mc_info_extr=None, + det_file=None, + center_time=True, + event_skipper=None, + add_bin_stats=True, + chunksize=32, + keep_event_info=True, + keep_mc_tracks=False, + add_t0=False,): + """ + Parameters + ---------- + bin_edges_list : List + List with the names of the fields to bin, and the respective bin + edges, including the left- and right-most bin edge. + Example: For 10 bins in the z direction, and 100 bins in time: + bin_edges_list = [ + ["pos_z", np.linspace(0, 10, 11)], + ["time", np.linspace(-50, 550, 101)], + ] + Some examples can be found in orcasong.bin_edges. + mc_info_extr : function, optional + Function that extracts desired mc_info from a blob, which is then + stored as the "y" datafield in the .h5 file. + The function takes the km3pipe blob as an input, and returns + a dict mapping str to floats. + Some examples can be found in orcasong.mc_info_extr. + det_file : str, optional + Path to a .detx detector geometry file, which can be used to + calibrate the hits. + center_time : bool + Subtract time of first triggered hit from all hit times. Will + also be done for McHits if they are in the blob [default: True]. + event_skipper : func, optional + Function that takes the blob as an input, and returns a bool. + If the bool is true, the blob will be skipped. + add_bin_stats : bool + Add statistics of the binning to the output file. They can be + plotted with util/bin_stats_plot.py [default: True]. + chunksize : int + Chunksize (along axis_0) used for saving the output + to a .h5 file [default: 32]. 
+ keep_event_info : bool + If True, will keep the "event_info" table [default: True]. + keep_mc_tracks : bool + If True, will keep the "McTracks" table [default: False]. + add_t0 : bool + If true, add t0 to the time of hits. If using a det_file, + this will already have been done automatically [default: False]. + + """ + self.bin_edges_list = bin_edges_list + self.mc_info_extr = mc_info_extr + self.det_file = det_file + self.add_t0 = add_t0 + self.center_time = center_time + self.event_skipper = event_skipper + + self.keep_event_info = keep_event_info + self.keep_mc_tracks = keep_mc_tracks + self.chunksize = chunksize + + if add_bin_stats: + self.bin_plot_freq = 1 + else: + self.bin_plot_freq = None + + self.n_statusbar = 1000 + self.n_memory_observer = 1000 + self.complib = 'zlib' + self.complevel = 1 + self.flush_frequency = 1000 + + def run(self, infile, outfile=None, save_plot=False): + """ + Generate images from the infile, and save them as the outfile. + + Parameters + ---------- + infile : str + Path to the input file. + outfile : str, optional + Path to the output file (will be created). If none is given, + will auto generate the name and save it in the cwd. + save_plot : bool + Save the binning hists as a pdf. Only possible if add_bin_stats + is True. 
+ + """ + if save_plot and self.bin_plot_freq is None: + raise ValueError("Can not make plot when add_bin_stats is False") + + name, shape = self.get_names_and_shape() + print("Generating {} images with shape {}".format(name, shape)) + + if outfile is None: + infile_basename = os.path.basename(infile) + outfile_name = os.path.splitext(infile_basename)[0] + "_binned.h5" + outfile = os.path.join(os.getcwd(), outfile_name) + + pipe = self.build_pipe(infile, outfile) + smry = pipe.drain() + + if self.bin_plot_freq is not None: + hists = smry["BinningStatsMaker"] + plot_binstats.add_hists_to_h5file(hists, outfile) + + if save_plot: + save_as = os.path.splitext(outfile)[0] + "_hists.pdf" + plot_binstats.plot_hists(hists, save_as) + + add_version_info(outfile) + + def run_multi(self, infiles, outfolder, save_plot=False): + """ + Bin multiple files into their own output files each. + The output file names will be generated automatically. + + Parameters + ---------- + infiles : List + The path to infiles as str. + outfolder : str + The output folder to place them in. + save_plot : bool + Save the binning hists as a pdf. Only possible if add_bin_stats + is True. + + """ + if save_plot and self.bin_plot_freq is None: + raise ValueError("Can not make plot when add_bin_stats is False") + + outfiles = [] + for infile in infiles: + outfile_name = os.path.splitext(os.path.basename(infile))[0] \ + + "_hist.h5" + outfile = os.path.join(outfolder, outfile_name) + outfiles.append(outfile) + + self.run(infile, outfile, save_plot=False) + + if save_plot: + plot_binstats.plot_hist_of_files( + files=outfiles, save_as=outfolder+"binning_hist.pdf") + + def build_pipe(self, infile, outfile): + """ + Build the pipeline to generate images and mc_info for a file. 
+ """ + pipe = kp.Pipeline() + + if self.n_statusbar is not None: + pipe.attach(km.common.StatusBar, every=self.n_statusbar) + if self.n_memory_observer is not None: + pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer) + + pipe.attach(kp.io.hdf5.HDF5Pump, filename=infile) + pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader', + 'McTracks', 'Hits', 'McHits']) + + if self.det_file: + pipe.attach(modules.DetApplier, det_file=self.det_file) + + if self.center_time or self.add_t0: + pipe.attach(modules.TimePreproc, + add_t0=self.add_t0, + center_time=self.center_time) + + if self.event_skipper is not None: + pipe.attach(modules.EventSkipper, event_skipper=self.event_skipper) + + if self.bin_plot_freq is not None: + pipe.attach(modules.BinningStatsMaker, + bin_plot_freq=self.bin_plot_freq, + bin_edges_list=self.bin_edges_list) + + pipe.attach(modules.ImageMaker, + bin_edges_list=self.bin_edges_list, + store_as="histogram") + + if self.mc_info_extr is not None: + if isinstance(self.mc_info_extr, str): + mc_info_extr = get_mc_info_extr(self.mc_info_extr) + else: + mc_info_extr = self.mc_info_extr + + pipe.attach(modules.McInfoMaker, + mc_info_extr=mc_info_extr, + store_as="mc_info") + + keys_keep = ['histogram', 'mc_info'] + if self.keep_event_info: + keys_keep.append('EventInfo') + if self.keep_mc_tracks: + keys_keep.append('McTracks') + pipe.attach(km.common.Keep, keys=keys_keep) + + pipe.attach(kp.io.HDF5Sink, + filename=outfile, + complib=self.complib, + complevel=self.complevel, + chunksize=self.chunksize, + flush_frequency=self.flush_frequency) + return pipe + + def get_names_and_shape(self): + """ + Get names and shape of the resulting x data, + e.g. (pos_z, time), (18, 50). 
+        """
+        names, shape = [], []
+        for bin_name, bin_edges in self.bin_edges_list:
+            names.append(bin_name)
+            shape.append(len(bin_edges) - 1)
+
+        return tuple(names), tuple(shape)
+
+    def __repr__(self):
+        name, shape = self.get_names_and_shape()
+        return "<FileBinner: {} {}>".format(name, shape)
+
+
+def add_version_info(file):
+    """ Add current orcasong version to h5 file. """
+    with h5py.File(file, "a") as f:
+        f.attrs.create("orcasong", orcasong.__version__, dtype="S6")
diff --git a/orcasong_2/mc_info_types.py b/orcasong/mc_info_extr.py
similarity index 82%
rename from orcasong_2/mc_info_types.py
rename to orcasong/mc_info_extr.py
index 5132f94a76fee188441987bb552cf8050f0fce77..b9737864faade8c899e9500276e8fc2680990438 100644
--- a/orcasong_2/mc_info_types.py
+++ b/orcasong/mc_info_extr.py
@@ -2,8 +2,9 @@
 Functions that extract info from a blob for the mc_info / y datafield
 in the h5 files.
 
-These are examples made for the specific given runs. They might not be
-applicable to other data.
+These are made for the specific given runs. They might not be
+applicable to other data, and could cause errors or produce unexpected
+results when used on data other than the specified.
 
 """
 
@@ -25,47 +26,58 @@ def get_mc_info_extr(mc_info_extr):
 
     """
     if mc_info_extr == "mupage":
+        funct = "get_mupage_mc"
         mc_info_extr = get_mupage_mc
 
     elif mc_info_extr == "real_data":
-        mc_info_extr = get_data_info
+        funct = "get_real_data"
+        mc_info_extr = get_real_data
 
     elif mc_info_extr == "random_noise":
-        mc_info_extr = get_rn_mc
+        funct = "get_pure_noise"
+        mc_info_extr = get_pure_noise
 
     else:
         raise NameError("Unknown mc_info_type " + mc_info_extr)
 
+    # TODO deprecated
+    wrng = "The use of a str for mc_info_extr is deprecated. Import the " \
+           "function {} from orcasong.mc_info_extr instead, and use this " \
+           "as mc_info_extr".format(funct)
+    warnings.warn(wrng)
+
     return mc_info_extr
 
 
-def get_data_info(blob):
+def get_real_data(blob):
     """
-    Get info present for real data, e.g.
- for the 2017 one line real data. + Get info present in real data. + Designed for the 2017 one line runs. """ event_info = blob['EventInfo'] track = { - 'event_id': event_info.event_id, # was .event_id[0] up to km3pipe 8.16.0 + 'event_id': event_info.event_id, + # was .event_id[0] up to km3pipe 8.16.0 'run_id': event_info.run_id, 'trigger_mask': event_info.trigger_mask, } return track -def get_rn_mc(blob): +def get_pure_noise(blob): """ - For random noise, which has particle_type 0. + For simulated pure noise events, which have particle_type 0. + """ - event_id = blob['EventInfo'].event_id[0] - run_id = blob["EventInfo"].run_id - particle_type = 0 + event_info = blob['EventInfo'] - track = {'event_id': event_id, - 'run_id': run_id, - 'particle_type': particle_type} + track = { + 'event_id': event_info.event_id[0], + 'run_id': event_info.run_id, + 'particle_type': 0 + } return track @@ -76,6 +88,7 @@ def get_mupage_mc(blob): Will only take into account muons with at least 1 McHit in the active line of the detector. + Designed for the 2017 run by run mupage simulations. e.g. mcv5.1_r3.mupage_10G.km3_AAv1.jterbr00002800.5103.root.h5 Parameters diff --git a/orcasong_2/modules.py b/orcasong/modules.py similarity index 77% rename from orcasong_2/modules.py rename to orcasong/modules.py index 9a6b64feba3b549760c2b9daccee1b233ef47842..2829f44fb51b4fa2f75cc5740077d41472c414e3 100644 --- a/orcasong_2/modules.py +++ b/orcasong/modules.py @@ -22,6 +22,7 @@ class McInfoMaker(kp.Module): Store the mcinfo with this name in the blob. 
""" + def configure(self): self.mc_info_extr = self.require('mc_info_extr') self.store_as = self.require('store_as') @@ -29,10 +30,8 @@ class McInfoMaker(kp.Module): def process(self, blob): track = self.mc_info_extr(blob) dtypes = [(key, np.float64) for key in track.keys()] - kp_hist = kp.dataclasses.Table(track, - dtype=dtypes, - h5loc='y', - name='event_info') + kp_hist = kp.dataclasses.Table( + track, dtype=dtypes, h5loc='y', name='event_info') blob[self.store_as] = kp_hist return blob @@ -54,6 +53,7 @@ class TimePreproc(kp.Module): If true, center hit and mchit times. """ + def configure(self): self.add_t0 = self.require('add_t0') self.center_time = self.get('center_time', default=True) @@ -77,9 +77,11 @@ class TimePreproc(kp.Module): if not self._t0_flag: self._t0_flag = True print("Adding t0 to hit times") - hits_time = blob["Hits"].time - hits_t0 = blob["Hits"].t0 - blob["Hits"].time = np.add(hits_time, hits_t0) + blob["Hits"].time = np.add(blob["Hits"].time, blob["Hits"].t0) + + if self.has_mchits: + blob["McHits"].time = np.add(blob["McHits"].time, + blob["McHits"].t0) return blob @@ -106,7 +108,7 @@ class TimePreproc(kp.Module): class ImageMaker(kp.Module): """ - Make a n-d histogram from the blob. + Make a n-d histogram from "Hits" in blob, and store it. Attributes ---------- @@ -117,6 +119,7 @@ class ImageMaker(kp.Module): Store the images with this name in the blob. """ + def configure(self): self.bin_edges_list = self.require('bin_edges_list') self.store_as = self.require('store_as') @@ -133,7 +136,8 @@ class ImageMaker(kp.Module): title = name + "event_images" hist_one_event = histogram[np.newaxis, ...].astype(np.uint8) - kp_hist = kp.dataclasses.NDArray(hist_one_event, h5loc='x', title=title) + kp_hist = kp.dataclasses.NDArray( + hist_one_event, h5loc='x', title=title) blob[self.store_as] = kp_hist return blob @@ -166,16 +170,17 @@ class BinningStatsMaker(kp.Module): for the time binning (field name "time"). 
""" + def configure(self): self.bin_edges_list = self.require('bin_edges_list') self.pdf_path = self.get('pdf_path', default=None) self.bin_plot_freq = self.get("bin_plot_freq", default=1) self.res_increase = self.get('res_increase', default=5) - self.plot_bin_edges = self.get('plot_bin_edges', default=True) self.hists = {} for bin_name, org_bin_edges in self.bin_edges_list: + # dont space bin edges for time if bin_name == "time": bin_edges = org_bin_edges else: @@ -196,8 +201,8 @@ class BinningStatsMaker(kp.Module): Increase resolution of given binning. """ increased_n_bins = (len(bin_edges) - 1) * self.res_increase + 1 - bin_edges = np.linspace(bin_edges[0], bin_edges[-1], - increased_n_bins) + bin_edges = np.linspace( + bin_edges[0], bin_edges[-1], increased_n_bins) return bin_edges @@ -209,12 +214,17 @@ class BinningStatsMaker(kp.Module): for bin_name, hists_data in self.hists.items(): hist_bin_edges = hists_data["hist_bin_edges"] - data = blob["Hits"][bin_name] - hist = np.histogram(data, bins=hist_bin_edges)[0] - + hits = blob["Hits"] + data = hits[bin_name] + # get how much is cut off due to these limits out_pos = data[data > np.max(hist_bin_edges)].size out_neg = data[data < np.min(hist_bin_edges)].size + # get all hits which are not cut off by other bin edges + data = hits[bin_name][self._is_in_limits( + hits, excluded=bin_name)] + hist = np.histogram(data, bins=hist_bin_edges)[0] + self.hists[bin_name]["hist"] += hist self.hists[bin_name]["cut_off"] += np.array([out_neg, out_pos]) @@ -238,6 +248,21 @@ class BinningStatsMaker(kp.Module): """ return self.hists + def _is_in_limits(self, hits, excluded=None): + """ Get which hits are in the limits defined by ALL bin edges + (except for given one). 
""" + inside = None + for dfield, edges in self.bin_edges_list: + if dfield == excluded: + continue + is_in = np.logical_and(hits[dfield] >= min(edges), + hits[dfield] <= max(edges)) + if inside is None: + inside = is_in + else: + inside = np.logical_and(inside, is_in) + return inside + class EventSkipper(kp.Module): """ @@ -250,6 +275,7 @@ class EventSkipper(kp.Module): If the bool is true, the blob will be skipped. """ + def configure(self): self.event_skipper = self.require('event_skipper') @@ -271,29 +297,39 @@ class DetApplier(kp.Module): Path to a .detx detector geometry file. """ + def configure(self): self.det_file = self.require("det_file") - self.assert_t0_is_added = self.get("check_t0", default=False) self.calib = kp.calib.Calibration(filename=self.det_file) + self._calib_checked = False + + # for debugging + self._assert_t0_is_added = False def process(self, blob): - if self.assert_t0_is_added: - original_time = blob["Hits"].time + if self._calib_checked is False: + if "pos_x" in blob["Hits"]: + warnings.warn("Warning: Using a det file, but pos_x in Hits " + " detected. Is the file already " + "calibrated? 
This might lead to errors with t0.") + self._calib_checked = True + + # original_time = blob["Hits"].time blob = self.calib.process(blob, key="Hits", outkey="Hits") if "McHits" in blob: blob = self.calib.process(blob, key="McHits", outkey="McHits") - if self.assert_t0_is_added: - actual_time = blob["Hits"].time - t0 = blob["Hits"].t0 - target_time = np.add(original_time, t0) - if not np.array_equal(actual_time, target_time): - print(actual_time) - print(target_time) - raise AssertionError("t0 not added!") - else: - print("t0 was added ok") - + """ + actual_time = blob["Hits"].time + t0 = blob["Hits"].t0 + target_time = np.add(original_time, t0) + if not np.array_equal(actual_time, target_time): + print(actual_time) + print(target_time) + raise AssertionError("t0 not added!") + else: + print("t0 was added ok") + """ return blob diff --git a/orcasong_2/util/__init__.py b/orcasong/plotting/__init__.py similarity index 100% rename from orcasong_2/util/__init__.py rename to orcasong/plotting/__init__.py diff --git a/orcasong_2/util/binning_1d_visualizer.py b/orcasong/plotting/binning_1d_visualizer.py similarity index 99% rename from orcasong_2/util/binning_1d_visualizer.py rename to orcasong/plotting/binning_1d_visualizer.py index 1cc697d4764803ff9f3ceb607834686e10b174f1..07ece8c030458e0d9bfa37c3347dbe781fbc3d90 100644 --- a/orcasong_2/util/binning_1d_visualizer.py +++ b/orcasong/plotting/binning_1d_visualizer.py @@ -14,7 +14,7 @@ import numpy as np import km3pipe as kp import matplotlib.pyplot as plt -from orcasong_2.modules import TimePreproc, DetApplier +from orcasong.modules import TimePreproc, DetApplier __author__ = 'Stefan Reck' diff --git a/orcasong_2/util/bin_stats_plot.py b/orcasong/plotting/plot_binstats.py similarity index 86% rename from orcasong_2/util/bin_stats_plot.py rename to orcasong/plotting/plot_binstats.py index 4c4edbcb9c934fd34db4d54faca8dd8bf825ffd8..425fa7840d99180c3f9a3a6a8b85b6478078dff4 100644 --- a/orcasong_2/util/bin_stats_plot.py +++ 
b/orcasong/plotting/plot_binstats.py @@ -3,13 +3,14 @@ Run with a parser to plot the binning statistics. Functions for plotting the bin stats made by the BinningStatsMaker module. """ +import os +import warnings +import argparse import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages import h5py import numpy as np -import argparse -import os -import warnings + __author__ = 'Stefan Reck' @@ -168,29 +169,22 @@ def plot_hist_of_files(save_as, files=None): Path of files to use instead. """ + if files is None: + files = get_all_h5_files() + hists_list = [] opened_files = [] - - if not files: - all_files = os.listdir(os.getcwd()) - files = [] - for file in all_files: - if file.endswith(".h5"): - files.append(file) + print("Plotting stats of {} file(s)".format(len(files))) try: - print("Plotting stats of {} files".format(len(files))) - for i, file in enumerate(files): - if i % 100 == 0: - print("File {}..." .format(i)) - + print("Opening files...") + for file in files: f = h5py.File(file, "r") if "bin_stats/" not in f: warnings.warn("ERROR: File {} does not have bin_stats dataset. " "Skipping ...".format(file)) f.close() continue - hists_list.append(f["bin_stats/"]) opened_files.append(f) @@ -203,21 +197,32 @@ def plot_hist_of_files(save_as, files=None): file.close() +def get_all_h5_files(): + """ Get a list of all h5 files in the cwd. """ + files = [] + for file in os.listdir(os.getcwd()): + if file.endswith(".h5"): + files.append(file) + return files + + def main(): parser = argparse.ArgumentParser( - description='Plot the bin stats in h5 files. Navigate to the folder ' - 'where the h5 files are, and then run this script.') + description='Generate a plot with statistics of the binning. ' + 'Can only be used on files generated with the FileBinner when ' + 'add_bin_stats was set to true (default). 
') parser.add_argument('save_as', type=str, nargs="?", default="bin_stats_plot.pdf", - help='Overwrite the default path or filename where' - 'this gets saved to.') + help='Filename of the plot. Default: ' + 'bin_stats_plot.pdf.') - parser.add_argument('files', type=str, nargs='*', default=None, - help='List of files to plot. Default: ls.') + parser.add_argument('file', type=str, nargs='*', default=None, + help='File(s) to plot. Default: Plot for all h5 ' + 'files in current dir.') args = parser.parse_args() - plot_hist_of_files(args.save_as, args.files) + plot_hist_of_files(args.save_as, args.file) if __name__ == "__main__": diff --git a/orcasong_2/Readme.rst b/orcasong_2/Readme.rst deleted file mode 100644 index b824164f3b1bcddddb810408a0aa982a94caa47c..0000000000000000000000000000000000000000 --- a/orcasong_2/Readme.rst +++ /dev/null @@ -1,6 +0,0 @@ -OrcaSong 2 -========== - -Several changes to the original OrcaSong. Allows to set desired binning via -a list. -Does not contain all features of OrcaSong, like getting mchits, plotting, etc. \ No newline at end of file diff --git a/orcasong_2/core.py b/orcasong_2/core.py index 82154833f99d316bf4a915156dde8723f89ca999..7a1ed8a7959a9a4cfcb950c8863123a8b920fe05 100644 --- a/orcasong_2/core.py +++ b/orcasong_2/core.py @@ -1,261 +1,8 @@ -import os -import km3pipe as kp -import km3modules as km - -import orcasong_2.modules as modules -import orcasong_2.util.bin_stats_plot as bs_plot -from orcasong_2.mc_info_types import get_mc_info_extr - - -__author__ = 'Stefan Reck' - - -class FileBinner: - """ - For making binned images and mc_infos, which can be used for conv. nets. - - Can also add statistics of the binning to the h5 files, which can - be plotted to show the distribution of hits among the bins and how - many hits were cut off. - - Attributes - ---------- - bin_plot_freq : int or None - If int is given, defines after how many blobs data for an overview - histogram is extracted. 
- It shows the distribution of hits, the bin edges, and how many hits - were cut off for each field name in bin_edges_list. - It will be saved to the same path as the outfile in run. - keep_event_info : bool - If True, will keep the "event_info" table. - keep_mc_tracks : bool - If True, will keep the "McTracks" table. - n_statusbar : int, optional - Print a statusbar every n blobs. - n_memory_observer : int, optional - Print memory usage every n blobs. - chunksize : int - Chunksize (along axis_0) used for saving the output to a .h5 file. - complib : str - Compression library used for saving the output to a .h5 file. - All PyTables compression filters are available, e.g. 'zlib', - 'lzf', 'blosc', ... . - complevel : int - Compression level for the compression filter that is used for - saving the output to a .h5 file. - flush_frequency : int - After how many events the accumulated output should be flushed to - the harddisk. - A larger value leads to a faster orcasong execution, - but it increases the RAM usage as well. - - """ - def __init__(self, - bin_edges_list, - mc_info_extr=None, - det_file=None, - add_t0=False, - center_time=True, - event_skipper=None, - add_bin_stats=True): - """ - Parameters - ---------- - bin_edges_list : List - List with the names of the fields to bin, and the respective bin - edges, including the left- and right-most bin edge. - Example: For 10 bins in the z direction, and 100 bins in time: - bin_edges_list = [ - ["pos_z", np.linspace(0, 10, 11)], - ["time", np.linspace(-50, 550, 101)], - ] - mc_info_extr : function or string, optional - Function that extracts desired mc_info from a blob, which is then - stored as the "y" datafield in the .h5 file. - Can also give a str identifier for an existing extractor. - det_file : str, optional - Path to a .detx detector geometry file, which can be used to - calibrate the hits. - add_t0 : bool - If true, add t0 to the time of hits. 
If using a det_file, - this will already have been done automatically. - center_time : bool - Subtract time of first triggered hit from all hit times. - Will also be done for McHits if they are in the blob. - event_skipper : func, optional - Function that takes the blob as an input, and returns a bool. - If the bool is true, the blob will be skipped. - add_bin_stats : bool - Add statistics of the binning to the output file. They can be - plotted with util/bin_stats_plot.py. - - """ - self.bin_edges_list = bin_edges_list - self.mc_info_extr = mc_info_extr - self.det_file = det_file - self.add_t0 = add_t0 - self.center_time = center_time - self.event_skipper = event_skipper - - if add_bin_stats: - self.bin_plot_freq = 1 - else: - self.bin_plot_freq = None - - self.keep_event_info = True - self.keep_mc_tracks = False - - self.n_statusbar = 1000 - self.n_memory_observer = 1000 - self.chunksize = 32 - self.complib = 'zlib' - self.complevel = 1 - self.flush_frequency = 1000 - - def run(self, infile, outfile=None, save_plot=False): - """ - Make images for a file. - - Parameters - ---------- - infile : str - Path to the input file. - outfile : str, optional - Path to the output file (will be created). If none is given, - will auto generate the name and save it in the cwd. - save_plot : bool - Save the binning hists as a pdf. Only possible if add_bin_stats - is True. 
- - """ - if save_plot and self.bin_plot_freq is None: - raise ValueError("Can not make plot when add_bin_stats is False") - - name, shape = self.get_names_and_shape() - print("Generating {} images with shape {}".format(name, shape)) - - if outfile is None: - infile_basename = os.path.basename(infile) - outfile_name = os.path.splitext(infile_basename)[0] + "_binned.h5" - outfile = os.path.join(os.getcwd(), outfile_name) - - pipe = self.build_pipe(infile, outfile) - smry = pipe.drain() - - if self.bin_plot_freq is not None: - hists = smry["BinningStatsMaker"] - bs_plot.add_hists_to_h5file(hists, outfile) - - if save_plot: - save_as = os.path.splitext(outfile)[0] + "_hists.pdf" - bs_plot.plot_hists(hists, save_as) - - def run_multi(self, infiles, outfolder, save_plot=False): - """ - Bin multiple files into their own output files each. - - Parameters - ---------- - infiles : List - The path to infiles as str. - outfolder : str - The output folder to place them in. The output file name will - be generated automatically. - save_plot : bool - Save the binning hists as a pdf. Only possible if add_bin_stats - is True. - - """ - if save_plot and self.bin_plot_freq is None: - raise ValueError("Can not make plot when add_bin_stats is False") - - outfiles = [] - for infile in infiles: - outfile_name = os.path.splitext(os.path.basename(infile))[0] \ - + "_hist.h5" - outfile = os.path.join(outfolder, outfile_name) - outfiles.append(outfile) - - self.run(infile, outfile, save_plot=False) - - if save_plot: - bs_plot.plot_hist_of_files(files=outfiles, - save_as=outfolder+"binning_hist.pdf") - - def build_pipe(self, infile, outfile): - """ - Build the pipeline to generate images and mc_info for a file. 
- """ - - pipe = kp.Pipeline() - - if self.n_statusbar is not None: - pipe.attach(km.common.StatusBar, every=self.n_statusbar) - if self.n_memory_observer is not None: - pipe.attach(km.common.MemoryObserver, every=self.n_memory_observer) - - pipe.attach(kp.io.hdf5.HDF5Pump, filename=infile) - - pipe.attach(km.common.Keep, keys=['EventInfo', 'Header', 'RawHeader', - 'McTracks', 'Hits', 'McHits']) - - if self.det_file: - pipe.attach(modules.DetApplier, det_file=self.det_file) - - if self.center_time or self.add_t0: - pipe.attach(modules.TimePreproc, - add_t0=self.add_t0, - center_time=self.center_time) - - if self.event_skipper is not None: - pipe.attach(modules.EventSkipper, event_skipper=self.event_skipper) - - if self.bin_plot_freq is not None: - pipe.attach(modules.BinningStatsMaker, - bin_plot_freq=self.bin_plot_freq, - bin_edges_list=self.bin_edges_list) - - pipe.attach(modules.ImageMaker, - bin_edges_list=self.bin_edges_list, - store_as="histogram") - - if self.mc_info_extr is not None: - if isinstance(self.mc_info_extr, str): - mc_info_extr = get_mc_info_extr(self.mc_info_extr) - else: - mc_info_extr = self.mc_info_extr - - pipe.attach(modules.McInfoMaker, - mc_info_extr=mc_info_extr, - store_as="mc_info") - - keys_keep = ['histogram', 'mc_info'] - if self.keep_event_info: - keys_keep.append('EventInfo') - if self.keep_mc_tracks: - keys_keep.append('McTracks') - pipe.attach(km.common.Keep, keys=keys_keep) - - pipe.attach(kp.io.HDF5Sink, - filename=outfile, - complib=self.complib, - complevel=self.complevel, - chunksize=self.chunksize, - flush_frequency=self.flush_frequency) - return pipe - - def get_names_and_shape(self): - """ - Get names and shape of the resulting x data, - e.g. (pos_z, time), (18, 50). 
- """ - names, shape = [], [] - for bin_name, bin_edges in self.bin_edges_list: - names.append(bin_name) - shape.append(len(bin_edges) - 1) - - return tuple(names), tuple(shape) - - def __repr__(self): - name, shape = self.get_names_and_shape() - return "<FileBinner: {} {}>".format(name, shape) +""" +For backwards compatibility. +""" +import warnings +from orcasong.core import FileBinner + +# TODO deprecated +warnings.warn("orcasong_2 has been renamed to orcasong, please update your code.") diff --git a/orcasong_contrib/data_tools/make_data_split/make_data_split.py b/orcasong_contrib/data_tools/make_data_split/make_data_split.py index cce5f2fef4a523dfbf9e09e8503bb803d8e9808a..f278fa3b125e282c36158e43e3bac7544c323092 100644 --- a/orcasong_contrib/data_tools/make_data_split/make_data_split.py +++ b/orcasong_contrib/data_tools/make_data_split/make_data_split.py @@ -15,6 +15,8 @@ Options: """ +__author__ = 'Michael Moser' + import os import toml import docopt diff --git a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py index 6488408712e3443a60611ff92ef60eb41c42d9cc..9103dd8e5ff8b450cd43b3bdce685021c4785f26 100644 --- a/orcasong_contrib/data_tools/shuffle/shuffle_h5.py +++ b/orcasong_contrib/data_tools/shuffle/shuffle_h5.py @@ -22,7 +22,7 @@ import h5py import km3pipe as kp import km3modules as km from orcasong_contrib.data_tools.concatenate.concatenate_h5 import get_f_compression_and_chunking -from orcasong_2.modules import EventSkipper +from orcasong.modules import EventSkipper # from memory_profiler import profile # for memory profiling, call with @profile; myfunc() diff --git a/orcasong_2/util/split_conc.py b/orcasong_contrib/data_tools/split_conc.py similarity index 100% rename from orcasong_2/util/split_conc.py rename to orcasong_contrib/data_tools/split_conc.py diff --git a/setup.py b/setup.py index 1bc018980037c0ce14af5a0b1fa05fe06100ef95..15239ff826b799b7e35fcf65587a37d7daa3a536 100644 --- a/setup.py +++ 
b/setup.py @@ -1,16 +1,17 @@ #!/usr/bin/env python from setuptools import setup, find_packages -from pkg_resources import get_distribution, DistributionNotFound +# from pkg_resources import get_distribution, DistributionNotFound with open('requirements.txt') as fobj: requirements = [l.strip() for l in fobj.readlines()] setup( name='orcasong', - description='Makes images for a NN based on the hit information of neutrino events in the neutrino telescope KM3NeT', + description='Makes images for a NN based on the hit information of neutrino ' + 'events in the neutrino telescope KM3NeT', url='https://git.km3net.de/ml/OrcaSong', - author='Michael Moser, Stefan Reck', - author_email='mmoser@km3net.de, michael.m.moser@fau.de, stefan.reck@fau.de', + author='Stefan Reck, Michael Moser', + author_email='stefan.reck@fau.de, mmoser@km3net.de, michael.m.moser@fau.de', license='AGPL', install_requires=requirements, packages=find_packages(), @@ -25,12 +26,13 @@ setup( use_scm_version={'write_to': 'orcasong/version.txt', 'tag_regex': r'^(?P<prefix>v)?(?P<version>[^\+]+)(?P<suffix>.*)?$', }, - entry_points={'console_scripts': ['make_nn_images=orcasong.make_nn_images:main', - 'shuffle=orcasong_contrib.data_tools.shuffle.shuffle_h5:main', - 'concatenate=orcasong_contrib.data_tools.concatenate.concatenate_h5:main', - 'make_dsplit=orcasong_contrib.data_tools.make_data_split.make_data_split:main', - 'plot_binstats=orcasong_2.util.bin_stats_plot:main']} + entry_points={'console_scripts': [ + 'make_nn_images=legacy.make_nn_images:main', + 'shuffle=orcasong_contrib.data_tools.shuffle.shuffle_h5:main', + 'concatenate=orcasong_contrib.data_tools.concatenate.concatenate_h5:main', + 'make_dsplit=orcasong_contrib.data_tools.make_data_split.make_data_split:main', + 'plot_binstats=orcasong.plotting.plot_binstats:main']} ) -__author__ = 'Michael Moser' \ No newline at end of file +__author__ = 'Stefan Reck, Michael Moser' diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/test_bin_edges.py b/tests/test_bin_edges.py new file mode 100644 index 0000000000000000000000000000000000000000..8d3cd2853d6033648eba7367668613ce981c00e5 --- /dev/null +++ b/tests/test_bin_edges.py @@ -0,0 +1,21 @@ +import inspect +from unittest import TestCase +import orcasong.bin_edges +from orcasong.core import FileBinner + + +__author__ = 'Stefan Reck' + + +class TestEdges(TestCase): + """ + Just call all functions in the bin_edges module and see if they work + with the filebinner. + """ + def test_them(self): + funcs = [memb[1] for memb in inspect.getmembers(orcasong.bin_edges) + if inspect.isfunction(memb[1])] + + for func in funcs: + fb = FileBinner(func()) + fb.get_names_and_shape() diff --git a/tests/test_modules.py b/tests/test_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..49605cb06e8cdbf209d25c74d71f7d61a53033ad --- /dev/null +++ b/tests/test_modules.py @@ -0,0 +1,341 @@ +from unittest import TestCase +import numpy as np +import orcasong.modules as modules +from km3pipe.dataclasses import Table + + +__author__ = 'Stefan Reck' + + +class TestModules(TestCase): + def test_mc_info_maker(self): + """ Test the mcinfo maker on some dummy data. 
""" + def mc_info_extr(blob): + hits = blob["Hits"] + return {"dom_id_0": hits.dom_id[0], + "time_2": hits.time[2]} + + in_blob = { + "Hits": Table({ + 'dom_id': [2, 3, 3], + 'channel_id': [0, 1, 2], + 'time': [10.1, 11.2, 12.3] + }) + } + module = modules.McInfoMaker( + mc_info_extr=mc_info_extr, store_as="test") + out_blob = module.process(in_blob) + + self.assertSequenceEqual(list(out_blob.keys()), ["Hits", "test"]) + self.assertSequenceEqual(list(out_blob["test"].dtype.names), + ('dom_id_0', 'time_2')) + np.testing.assert_array_equal(out_blob["test"]["dom_id_0"], + np.array([2, ])) + np.testing.assert_array_equal(out_blob["test"]["time_2"], + np.array([12.3, ])) + + def test_event_skipper(self): + def event_skipper(blob): + return blob == 42 + + module = modules.EventSkipper(event_skipper=event_skipper) + + self.assertEqual(module.process(42), None) + self.assertEqual(module.process(25), 25) + + +class TestTimePreproc(TestCase): + def setUp(self): + self.in_blob = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + self.in_blob_mc = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + "McHits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + def test_time_preproc_t0(self): + module = modules.TimePreproc( + add_t0=True, center_time=False) + + target = { + "Hits": Table({ + 'time': [1.1, 2.2, 3.3], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + out_blob = module.process(self.in_blob) + + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_equal(np.array(out_blob["Hits"]), + np.array(target["Hits"])) + + def test_time_preproc_center(self): + module = modules.TimePreproc( + add_t0=False, center_time=True) + + target = { + "Hits": Table({ + 'time': [-1., 0., 1.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + out_blob = 
module.process(self.in_blob) + + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_equal(np.array(out_blob["Hits"]), + np.array(target["Hits"])) + + def test_time_preproc_t0_and_center(self): + module = modules.TimePreproc( + add_t0=True, center_time=True) + + target = { + "Hits": Table({ + 'time': [-1.1, 0., 1.1], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + out_blob = module.process(self.in_blob) + + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_almost_equal( + np.array(out_blob["Hits"].view("<f8")), + np.array(target["Hits"].view("<f8"))) + + def test_time_preproc_mchits_t0_and_center(self): + module = modules.TimePreproc( + add_t0=True, center_time=True) + + target = { + "Hits": Table({ + 'time': [-1.1, 0., 1.1], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + "McHits": Table({ + 'time': [-1.1, 0., 1.1], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + } + out_blob = module.process(self.in_blob_mc) + + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_almost_equal( + np.array(out_blob["McHits"].view("<f8")), + np.array(target["McHits"].view("<f8"))) + + +class TestImageMaker(TestCase): + def test_2d_xt_binning(self): + # (3 x 2) x-t binning + bin_edges_list = [ + ["x", [3.5, 4.5, 5.5, 6.5]], + ["time", [0.5, 2, 3.5]] + ] + + module = modules.ImageMaker( + bin_edges_list=bin_edges_list, store_as="histogram") + in_blob = { + "Hits": Table({ + "x": [4, 5, 6], + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + target = { + "Hits": Table({ + "x": [4, 5, 6], + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + "histogram": np.array([[ + [1, 0], + [0, 1], + [0, 1], + ]]) + } + + out_blob = module.process(in_blob) + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_almost_equal( + 
np.array(out_blob["Hits"].view("<f8")), + np.array(target["Hits"].view("<f8"))) + np.testing.assert_array_almost_equal( + np.array(out_blob["histogram"]), + np.array(target["histogram"])) + + def test_unknown_field(self): + # (3 x 2) x-t binning + bin_edges_list = [ + ["aggg", [3.5, 4.5, 5.5, 6.5]], + ["time", [0.5, 2, 3.5]] + ] + + module = modules.ImageMaker( + bin_edges_list=bin_edges_list, store_as="histogram") + in_blob = { + "Hits": Table({ + "x": [4, 5, 6], + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + with self.assertRaises(ValueError): + module.process(in_blob) + + def test_1d_binning(self): + # (1, ) t binning + bin_edges_list = [ + ["time", [2.5, 3.5]] + ] + + module = modules.ImageMaker( + bin_edges_list=bin_edges_list, store_as="histogram") + in_blob = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + target = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + "histogram": np.array([ + [1, ], + ]) + } + + out_blob = module.process(in_blob) + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + np.testing.assert_array_almost_equal( + np.array(out_blob["Hits"].view("<f8")), + np.array(target["Hits"].view("<f8"))) + np.testing.assert_array_almost_equal( + np.array(out_blob["histogram"]), + np.array(target["histogram"])) + + def test_1d_binning_no_hits(self): + # (1, ) t binning + bin_edges_list = [ + ["time", [3.5, 4.5]] + ] + + module = modules.ImageMaker( + bin_edges_list=bin_edges_list, store_as="histogram") + in_blob = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }) + } + + target = { + "Hits": Table({ + 'time': [1., 2., 3.], + "t0": [0.1, 0.2, 0.3], + "triggered": [0, 1, 1], + }), + "histogram": np.array([ + [0, ], + ]) + } + + out_blob = module.process(in_blob) + self.assertSetEqual(set(out_blob.keys()), set(target.keys())) + 
np.testing.assert_array_almost_equal( + np.array(out_blob["Hits"].view("<f8")), + np.array(target["Hits"].view("<f8"))) + np.testing.assert_array_almost_equal( + np.array(out_blob["histogram"]), + np.array(target["histogram"])) + + +class TestBinningStatsMaker(TestCase): + def test_it(self): + # (3 x 2) x-t binning + bin_edges_list = [ + ["x", [3.5, 4.5, 5.5, 6.5]], + ["time", [0.5, 2, 3.5]], + ["z", [1, 4]] + ] + + in_blob = { + "Hits": Table({ + "x": [4, 5, 6, 6], + 'time': [1., 2., 3., 50], + "z": [0, 3, 4, 5], + + "t0": [0.1, 0.2, 0.3, 0.4], + "triggered": [0, 1, 1, 1], + }) + } + + target = { + 'x': { + 'hist': np.array([0., 0., 0., 1., 0., 1.]), + 'hist_bin_edges': np.array([3.5, 4., 4.5, 5., 5.5, 6., 6.5]), + 'bin_edges': [3.5, 4.5, 5.5, 6.5], + 'cut_off': np.array([0., 0.]) + }, + 'time': { + 'hist': np.array([0., 2.]), + 'hist_bin_edges': [0.5, 2, 3.5], + 'bin_edges': [0.5, 2, 3.5], + 'cut_off': np.array([0., 1.]) + }, + 'z': { + 'hist': np.array([0., 2.]), + 'hist_bin_edges': np.array([1., 2.5, 4.]), + 'bin_edges': [1, 4], + 'cut_off': np.array([1., 1.]) + } + } + + module = modules.BinningStatsMaker( + bin_edges_list=bin_edges_list, res_increase=2) + module.process(in_blob) + output = module.finish() + check_dicts_n_ray(output, target) + + +def check_dicts_n_ray(a, b): + """ Check if dicts with dicts with ndarrays are equal. """ + if set(a.keys()) != set(b.keys()): + raise KeyError("{} != {}".format(a.keys(), b.keys())) + for key in a.keys(): + if set(a[key].keys()) != set(b[key].keys()): + raise KeyError("{} != {}".format(a[key].keys(), b[key].keys())) + for skey in a[key].keys(): + np.testing.assert_array_almost_equal(a[key][skey], b[key][skey])