Skip to content
Snippets Groups Projects
Commit 2e431b69 authored by Stefan Reck's avatar Stefan Reck
Browse files

Merge branch 'parser' into 'master'

- moved all commands to central parser 'orcasong' (old ones are kept for now but will give warning and will be removed)
- add new command 'orcasong run' which uses a toml config to set up
- add new command 'orcasong run' which uses a toml config to setup

See merge request !20
parents c79af52d f12e862c
No related branches found
No related tags found
1 merge request!20Parser
......@@ -36,6 +36,11 @@ OrcaSong can be installed via pip by running::
pip install orcasong
You can get a list of all the bash commands in orcasong by typing::
orcasong --help
A Singularity image of the latest stable version of OrcaSong is also provided.
You can download it from the km3net sftp server ``pi1139.physik.uni-erlangen.de``
in ``singularity/orcasong.sif``.
......@@ -12,7 +12,7 @@ Step 1: From root aanet files to h5 aanet files
Convert offline files (aka aanet files) from root format to h5 format using
the 'h5extract' command of km3pipe like so::
h5extract filename.root
h5extract aanet_file.root
.. note::
This has to be done only once for each file. Check if somebody did this
......@@ -23,8 +23,14 @@ the 'h5extract' command of km3pipe like so::
Step 2: From h5 aanet files to h5 DL files
------------------------------------------
Produce DL h5 files from the aanet h5 files using OrcaSong.
You can either produce images or graphs. See :ref:`orcasong_page` for
instructions on how to do this.
You can either produce images or graphs.
If you have an orcasong config file, you can use it via the command line like this::
orcasong run aanet_file.h5 orcasong_config.toml --detx_file detector.detx
Alternatively, you can use the python frontend of orcasong.
See :ref:`orcasong_page` for instructions on how to do this.
The resulting DL h5 files can already be used as input for networks!
......
......@@ -3,7 +3,7 @@
Producing DL h5 files from aanet h5 files
=========================================
Describes how to use OrcaSong to produce h5 files for Deep Learning
Describes how to use OrcaSong in python to produce h5 files for Deep Learning
from aanet h5 files. These files can contain either images (for convolutional
networks), or graphs (for Graph networks).
......
......@@ -37,7 +37,7 @@ km3pipe. The input can also be a txt file like from make_data_split.
Can be used via the commandline like so::
concatenate --help
orcasong concatenate --help
or import as:
......@@ -58,7 +58,7 @@ Shuffle an h5 file using km3pipe.
Can be used via the commandline like so::
h5shuffle --help
orcasong h5shuffle --help
or import function for general postprocessing:
......@@ -69,4 +69,7 @@ or import function for general postprocessing:
postproc_file(output_filepath_concat)
Theres also a faster (beta) version available called h5shuffle2.
There's also a faster (beta) version available called h5shuffle2::
orcasong h5shuffle2 --help
# This is an example config for running orcasong. It's not intended
# to be used for actual large-scale productions.
# the mode to run orcasong in; either 'graph' or 'image'
mode="graph"
# arguments for FileGraph or FileBinner (see orcasong.core)
max_n_hits = 2000
time_window = [-100, 5000]
# can also give the arguments of orcasong.core.BaseProcessor,
# which are shared between modes
chunksize=16
# built-in extractor function to use (see orcasong.from_toml.EXTRACTORS)
extractor = "neutrino_mc"
[extractor_config]
# arguments for setting up the extractor function can go here. None in this case.
import toml
import orcasong.core
import orcasong.extractors as extractors
# built-in extractors. First argument has to be the input filename,
# other parameters can be set via 'extractor_config' dict in the toml
EXTRACTORS = {
    "neutrino_mc": extractors.get_neutrino_mc_info_extr,
    "neutrino_data": extractors.get_real_data_info_extr,
}

# Maps the toml 'mode' field to the processor class that produces the output
# (graphs for GNNs, or binned images for CNNs).
MODES = {
    "graph": orcasong.core.FileGraph,
    "image": orcasong.core.FileBinner,
}
def add_parser_run(subparsers):
    """Attach the 'run' subcommand (produce a dl file from a toml config)."""
    parser = subparsers.add_parser(
        "run",
        description='Produce a dl file from an aanet file.',
    )
    parser.add_argument('infile', type=str, help="Aanet file in h5 format.")
    parser.add_argument(
        'toml_file', type=str, help="Orcasong configuration in toml format.")
    detx_help = (
        "Optional detx file to calibrate on the fly. Can not be used if a "
        "detx_file has also been given in the toml file.")
    parser.add_argument('--detx_file', type=str, default=None, help=detx_help)
    outfile_help = (
        "Path to output file. Default: Save with auto generated name in cwd.")
    parser.add_argument('--outfile', type=str, default=None, help=outfile_help)
    # Dispatch to run_orcasong when this subcommand is chosen.
    parser.set_defaults(func=run_orcasong)
def run_orcasong(infile, toml_file, detx_file=None, outfile=None):
    """Build a processor from the toml config and run it on the input file."""
    processor = setup_processor(infile, toml_file, detx_file)
    processor.run(infile=infile, outfile=outfile)
def setup_processor(infile, toml_file, detx_file=None):
    """Construct a processor instance from a toml config file.

    Parameters
    ----------
    infile : str
        Aanet file in h5 format (passed to the extractor, if one is set up).
    toml_file : str
        Orcasong configuration in toml format.
    detx_file : str, optional
        Detx file to calibrate on the fly. Must not be given both here
        and in the toml file.

    Returns
    -------
    The processor (instance of one of the classes in MODES), configured
    with all remaining toml options as keyword arguments.

    """
    cfg = toml.load(toml_file)
    mode_cls = _get_verbose(cfg.pop("mode"), MODES)

    if "detx_file" in cfg:
        # A detx file may come from the toml OR the argument, never both.
        if detx_file is not None:
            raise ValueError("detx_file passed to run AND defined in toml")
        detx_file = cfg.pop("detx_file")

    extractor = None
    if "extractor" in cfg:
        extractor_kwargs = cfg.pop("extractor_config", {})
        extractor_factory = _get_verbose(cfg.pop("extractor"), EXTRACTORS)
        extractor = extractor_factory(infile, **extractor_kwargs)

    # All remaining toml entries are forwarded to the processor class.
    return mode_cls(det_file=detx_file, extractor=extractor, **cfg)
def _get_verbose(key, d):
if key not in d:
raise KeyError(f"Unknown key '{key}' (available: {list(d.keys())})")
return d[key]
"""
Run OrcaSong functionalities from command line.
"""
import argparse
from orcasong.tools.concatenate import concatenate
from orcasong.tools.postproc import postproc_file
from orcasong.tools.shuffle2 import h5shuffle2
import orcasong.from_toml as from_toml
import orcasong.plotting.plot_binstats as plot_binstats
import orcasong.tools.make_data_split as make_data_split
def _add_parser_concatenate(subparsers):
    """Attach the 'concatenate' subcommand to the orcasong parser.

    Fixes typos in the user-facing help text ("datsets", "pathes",
    "absoulte" -> "datasets", "paths", "absolute").
    """
    parser = subparsers.add_parser(
        "concatenate",
        description='Concatenate many small h5 files to a single large one '
                    'in a km3pipe compatible format. This is intended for '
                    'files that get generated by orcasong, i.e. all datasets '
                    'should have the same length, with one row per '
                    'blob. '
                    'Compression options and the datasets to be created in '
                    'the new file will be read from the first input file.')
    parser.add_argument(
        'file', type=str, nargs="*",
        help="Define the files to concatenate. If it's one argument: A txt list "
             "with paths of h5 files to concatenate (one path per line). "
             "If it's multiple arguments: "
             "The paths of h5 files to concatenate.")
    parser.add_argument(
        '--outfile', type=str, default="concatenated.h5",
        help='The absolute filepath of the output .h5 file that will be created. ')
    parser.add_argument(
        '--no_used_files', action='store_true',
        help="Per default, the paths of the input files are added "
             "as their own datagroup in the output file. Use this flag to "
             "disable. ")
    parser.add_argument(
        '--skip_errors', action='store_true',
        help="If true, ignore files that can't be concatenated. ")
    # Dispatch to the concatenate function imported at module level.
    parser.set_defaults(func=concatenate)
def _add_parser_h5shuffle(subparsers):
    """Attach the 'h5shuffle' subcommand to the orcasong parser."""
    parser = subparsers.add_parser(
        "h5shuffle",
        description='Shuffle an h5 file using km3pipe.',
    )
    parser.add_argument(
        'input_file', type=str, help='File to shuffle.')
    parser.add_argument(
        '--output_file', type=str,
        help='Name of output file. Default: Auto generate name.')
    parser.add_argument(
        '--delete', action="store_true",
        help='Delete original file afterwards.')
    # Shuffling is implemented by postproc_file from orcasong.tools.postproc.
    parser.set_defaults(func=postproc_file)
def _add_parser_h5shuffle2(subparsers):
    """Attach the 'h5shuffle2' (faster, beta) subcommand to the orcasong parser."""
    parser = subparsers.add_parser(
        "h5shuffle2",
        description="Shuffle datasets in a h5file that have the same length. "
                    "Uses chunkwise readout for speed-up."
    )
    parser.add_argument(
        "input_file", type=str,
        help="Path of the file that will be shuffled.",
    )
    parser.add_argument(
        "--output_file", type=str, default=None,
        help="If given, this will be the name of the output file. "
             "Default: input_file + suffix.",
    )
    parser.add_argument(
        "--datasets", type=str, nargs="*", default=("x", "y"),
        help="Which datasets to include in output. Default: x, y",
    )
    parser.add_argument(
        "--max_ram_fraction", type=float, default=0.25,
        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
             "Note: this should "
             "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
             "Default: 0.25",
    )
    parser.add_argument(
        "--iterations", type=int, default=None,
        help="Shuffle the file this many times. Default: Auto choose best number.",
    )
    parser.add_argument(
        "--max_ram", type=int, default=None,
        help="Available ram in bytes. Default: Use fraction of maximum "
             "available instead (see max_ram_fraction).",
    )
    # Implemented by h5shuffle2 from orcasong.tools.shuffle2.
    parser.set_defaults(func=h5shuffle2)
def _add_parser_version(subparsers):
def show_version():
from orcasong import version
print(version)
parser = subparsers.add_parser(
"version",
description="Show installed orcanet version.",
)
parser.set_defaults(func=show_version)
def main():
    """Entry point of the 'orcasong' command line interface.

    Builds the top-level parser, attaches all subcommands, then dispatches
    to the function registered via set_defaults(func=...).
    """
    parser = argparse.ArgumentParser(
        prog="orcasong",
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    subparsers = parser.add_subparsers()
    from_toml.add_parser_run(subparsers)
    _add_parser_concatenate(subparsers)
    _add_parser_h5shuffle(subparsers)
    _add_parser_h5shuffle2(subparsers)
    plot_binstats.add_parser(subparsers)
    make_data_split.add_parser(subparsers)
    _add_parser_version(subparsers)
    kwargs = vars(parser.parse_args())
    # Fix: calling 'orcasong' without a subcommand used to crash with a
    # KeyError on 'func'; show the help text instead.
    func = kwargs.pop("func", None)
    if func is None:
        parser.print_help()
        parser.exit(1)
    func(**kwargs)
......@@ -218,6 +218,9 @@ def get_all_h5_files():
def main():
# TODO deprecated
warnings.warn(
"plot_binstats is deprecated and has been renamed to orcasong plot_binstats")
parser = argparse.ArgumentParser(
description='Generate a plot with statistics of the binning. '
'Can only be used on files generated with the FileBinner when '
......@@ -232,5 +235,20 @@ def main():
plot_hist_of_files(**vars(parser.parse_args()))
def add_parser(subparsers):
    """Attach the 'plot_binstats' subcommand to the orcasong parser."""
    description = (
        'Generate a plot with statistics of the binning. '
        'Can only be used on files generated with the FileBinner when '
        'add_bin_stats was set to true (default). ')
    parser = subparsers.add_parser("plot_binstats", description=description)
    parser.add_argument(
        '--save_as', type=str, default="bin_stats_plot.pdf",
        help='Filename of the plot. Default: bin_stats_plot.pdf.')
    parser.add_argument(
        'files', type=str, nargs='*', default=None,
        help='File(s) to plot. Default: Plot for all h5 files in current dir.')
    # Plotting is done by plot_hist_of_files, defined in this module.
    parser.set_defaults(func=plot_hist_of_files)
if __name__ == "__main__":
main()
......@@ -305,7 +305,27 @@ def _copy_attrs(src_datset, target_dataset):
warnings.warn(f"Error: Can not copy attribute {k}: {e}")
def get_parser():
def concatenate(file, outfile="concatenated.h5", no_used_files=False, skip_errors=False):
    """ Concatenate wrapped in a function. """
    if len(file) == 1:
        # A single argument is a txt list with one h5 path per line.
        fc = FileConcatenator.from_list(file[0], skip_errors=skip_errors)
    else:
        # Multiple arguments are the h5 paths themselves.
        fc = FileConcatenator(input_files=file, skip_errors=skip_errors)
    fc.concatenate(outfile, append_used_files=not no_used_files)
def main():
# TODO deprecated
warnings.warn("concatenate is deprecated and has been renamed to orcasong concatenate")
parser = argparse.ArgumentParser(
description='Concatenate many small h5 files to a single large one '
'in a km3pipe compatible format. This is intended for '
......@@ -331,27 +351,7 @@ def get_parser():
parser.add_argument(
'--skip_errors', action='store_true',
help="If true, ignore files that can't be concatenated. ")
return parser
def main():
parser = get_parser()
parsed_args = parser.parse_args()
if len(parsed_args.file) == 1:
fc = FileConcatenator.from_list(
parsed_args.file[0],
skip_errors=parsed_args.skip_errors
)
else:
fc = FileConcatenator(
input_files=parsed_args.file,
skip_errors=parsed_args.skip_errors
)
fc.concatenate(
parsed_args.outfile,
append_used_files=not parsed_args.no_used_files,
)
concatenate(**vars(parser.parse_args()))
if __name__ == '__main__':
......
......@@ -5,6 +5,7 @@
__author__ = "Michael Moser, Daniel Guderian"
import os
import warnings
import toml
import argparse
import h5py
......@@ -13,6 +14,8 @@ import numpy as np
def get_parser():
# TODO deprecated
warnings.warn("make_data_split is deprecated and has been renamed to orcasong make_data_split")
parser = argparse.ArgumentParser(
description="Create datasets based on the run_id's."
"Use the config to add input folder and set the ranges."
......@@ -26,6 +29,20 @@ def get_parser():
return parser
def add_parser(subparsers):
    """Attach the 'make_data_split' subcommand to the orcasong parser.

    Fixes the typo 'specfied' -> 'specified' in the help text.
    """
    parser = subparsers.add_parser(
        "make_data_split",
        description="Create datasets based on the run_id's."
                    "Use the config to add input folder and set the ranges."
                    "Outputs a list in an txt file that can be used to "
                    "concatenate the files specified"
    )
    parser.add_argument(
        "config", type=str, help="See example config for detailed information"
    )
    parser.set_defaults(func=make_split)
def get_all_ip_group_keys(cfg):
"""
Gets the keys of all input groups in the config dict.
......@@ -418,9 +435,10 @@ def main():
# load the config
parser = get_parser()
parsed_args = parser.parse_args()
make_split(parsed_args.config)
config_file = parsed_args.config
def make_split(config_file):
# decode config
cfg = toml.load(config_file)
cfg["toml_filename"] = config_file
......
......@@ -3,6 +3,8 @@ Scripts for postprocessing h5 files, e.g. shuffling.
"""
import os
import argparse
import warnings
import h5py
import km3pipe as kp
import km3modules as km
......@@ -115,6 +117,8 @@ def get_filepath_output(input_file, shuffle=True, event_skipper=None):
def h5shuffle():
# TODO deprecated
warnings.warn("h5shuffle is deprecated and has been renamed to orcasong h5shuffle")
parser = argparse.ArgumentParser(description='Shuffle an h5 file using km3pipe.')
parser.add_argument('input_file', type=str, help='File to shuffle.')
parser.add_argument('--output_file', type=str,
......
......@@ -2,6 +2,8 @@ import os
import time
import datetime
import argparse
import warnings
import numpy as np
import psutil
import h5py
......@@ -335,6 +337,8 @@ def slicify(fancy_indices):
def run_parser():
# TODO deprecated
warnings.warn("h5shuffle2 is deprecated and has been renamed to orcasong h5shuffle2")
parser = argparse.ArgumentParser(
description="Shuffle datasets in a h5file that have the same length. "
"Uses chunkwise readout for speed-up."
......
......@@ -27,11 +27,13 @@ setup(
'tag_regex': r'^(?P<prefix>v)?(?P<version>[^\+]+)(?P<suffix>.*)?$', },
entry_points={'console_scripts': [
'orcasong=orcasong.parser:main',
# TODO all deprecated:
'make_dsplit=orcasong.tools.make_data_split:main',
'plot_binstats=orcasong.plotting.plot_binstats:main',
'concatenate=orcasong.tools.concatenate:main',
'h5shuffle=orcasong.tools.postproc:h5shuffle',
'h5shuffle2=orcasong.tools.shuffle2:run_parser',
'make_dsplit=orcasong.tools.make_data_split:main',
'plot_binstats=orcasong.plotting.plot_binstats:main',
]}
)
......
from unittest import TestCase
import os
import orcasong
import orcasong.from_toml as from_toml
# Path to the repo's examples/ directory, resolved as the parent of the
# installed orcasong package directory — TODO confirm this holds for
# non-editable installs.
EXAMPLES = os.path.join(
    os.path.dirname(os.path.dirname(orcasong.__file__)), "examples"
)
def _test_extr(infile):
return infile + "_extr"
# Replace the real 'neutrino_mc' extractor with the dummy, so that
# setup_processor can be exercised without a real input file.
orcasong.from_toml.EXTRACTORS["neutrino_mc"] = _test_extr
class TestSetupProcessorExampleConfig(TestCase):
    """Check that setup_processor applies the example toml config correctly."""

    def setUp(self):
        config_path = os.path.join(EXAMPLES, "orcasong_example.toml")
        self.processor = from_toml.setup_processor(
            infile="test_in",
            toml_file=config_path,
            detx_file="test_det",
        )

    def test_time_window(self):
        # Value from the 'time_window' entry of the example config.
        self.assertEqual([-100, 5000], self.processor.time_window)

    def test_max_n_hits(self):
        # Value from the 'max_n_hits' entry of the example config.
        self.assertEqual(2000, self.processor.max_n_hits)

    def test_chunksize(self):
        # Value from the 'chunksize' entry of the example config.
        self.assertEqual(16, self.processor.chunksize)

    def test_extractor_is_dummy_extractor(self):
        # The patched-in dummy extractor tags the infile name.
        self.assertEqual("test_in_extr", self.processor.extractor)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment