Skip to content
Snippets Groups Projects
Commit 2e431b69 authored by Stefan Reck's avatar Stefan Reck
Browse files

Merge branch 'parser' into 'master'

- moved all commands to central parser 'orcasong' (old ones are kept for now but will give warning and will be removed)
- add new command 'orcasong run' which uses a toml config to set up
- add new command 'orcasong run' which uses a toml config to setup

See merge request !20
parents c79af52d f12e862c
No related branches found
No related tags found
1 merge request!20Parser
......@@ -36,6 +36,11 @@ OrcaSong can be installed via pip by running::
pip install orcasong
You can get a list of all the bash commands in orcasong by typing::
orcasong --help
A Singularity image of the latest stable version of OrcaSong is also provided.
You can download it from the km3net sftp server ``pi1139.physik.uni-erlangen.de``
in ``singularity/orcasong.sif``.
......@@ -12,7 +12,7 @@ Step 1: From root aanet files to h5 aanet files
Convert offline files (aka aanet files) from root format to h5 format using
the 'h5extract' command of km3pipe like so::
h5extract filename.root
h5extract aanet_file.root
.. note::
This has to be done only once for each file. Check if somebody did this
......@@ -23,8 +23,14 @@ the 'h5extract' command of km3pipe like so::
Step 2: From h5 aanet files to h5 DL files
------------------------------------------
Produce DL h5 files from the aanet h5 files using OrcaSong.
You can either produce images or graphs. See :ref:`orcasong_page` for
instructions on how to do this.
You can either produce images or graphs.
If you have an orcasong config file, you can use it via the command line like this::
orcasong run aanet_file.h5 orcasong_config.toml --detx_file detector.detx
Alternatively, you can use the python frontend of orcasong.
See :ref:`orcasong_page` for instructions on how to do this.
The resulting DL h5 files can already be used as input for networks!
......
......@@ -3,7 +3,7 @@
Producing DL h5 files from aanet h5 files
=========================================
Describes how to use OrcaSong to produce h5 files for Deep Learning
Describes how to use OrcaSong in python to produce h5 files for Deep Learning
from aanet h5 files. These files can contain either images (for convolutional
networks), or graphs (for Graph networks).
......
......@@ -37,7 +37,7 @@ km3pipe. The input can also be a txt file like from make_data_split.
Can be used via the commandline like so::
concatenate --help
orcasong concatenate --help
or import as:
......@@ -58,7 +58,7 @@ Shuffle an h5 file using km3pipe.
Can be used via the commandline like so::
h5shuffle --help
orcasong h5shuffle --help
or import function for general postprocessing:
......@@ -69,4 +69,7 @@ or import function for general postprocessing:
postproc_file(output_filepath_concat)
Theres also a faster (beta) version available called h5shuffle2.
There's also a faster (beta) version available called h5shuffle2::
orcasong h5shuffle2 --help
# This is an example config for running orcasong. It's not intended
# to be used for actual large-scale productions.
# the mode to run orcasong in; either 'graph' or 'image'
mode="graph"
# arguments for FileGraph or FileBinner (see orcasong.core)
max_n_hits = 2000
time_window = [-100, 5000]
# can also give the arguments of orcasong.core.BaseProcessor,
# which are shared between modes
chunksize=16
# built-in extractor function to use (see orcasong.from_toml.EXTRACTORS)
extractor = "neutrino_mc"
[extractor_config]
# arguments for setting up the extractor function can go here. None in this case.
import toml
import orcasong.core
import orcasong.extractors as extractors
# built-in extractors. First argument has to be the input filename,
# other parameters can be set via 'extractor_config' dict in the toml
EXTRACTORS = {
    "neutrino_mc": extractors.get_neutrino_mc_info_extr,
    "neutrino_data": extractors.get_real_data_info_extr,
}

# Maps the toml 'mode' field to the processor class that produces the output
# (graphs for GNNs, or binned images for CNNs).
MODES = {
    "graph": orcasong.core.FileGraph,
    "image": orcasong.core.FileBinner,
}
def add_parser_run(subparsers):
    """Attach the 'run' subcommand (produce a dl file from a toml config)."""
    parser = subparsers.add_parser(
        "run",
        description='Produce a dl file from an aanet file.',
    )
    parser.add_argument('infile', type=str, help="Aanet file in h5 format.")
    parser.add_argument(
        'toml_file', type=str, help="Orcasong configuration in toml format.")
    detx_help = (
        "Optional detx file to calibrate on the fly. Can not be used if a "
        "detx_file has also been given in the toml file.")
    parser.add_argument('--detx_file', type=str, default=None, help=detx_help)
    outfile_help = (
        "Path to output file. Default: Save with auto generated name in cwd.")
    parser.add_argument('--outfile', type=str, default=None, help=outfile_help)
    # Dispatch to run_orcasong when this subcommand is chosen.
    parser.set_defaults(func=run_orcasong)
def run_orcasong(infile, toml_file, detx_file=None, outfile=None):
    """Build a processor from the toml config and run it on the input file."""
    processor = setup_processor(infile, toml_file, detx_file)
    processor.run(infile=infile, outfile=outfile)
def setup_processor(infile, toml_file, detx_file=None):
    """Construct a processor instance from a toml config file.

    Parameters
    ----------
    infile : str
        Aanet file in h5 format (passed to the extractor, if one is set up).
    toml_file : str
        Orcasong configuration in toml format.
    detx_file : str, optional
        Detx file to calibrate on the fly. Must not be given both here
        and in the toml file.

    Returns
    -------
    The processor (instance of one of the classes in MODES), configured
    with all remaining toml options as keyword arguments.

    """
    cfg = toml.load(toml_file)
    mode_cls = _get_verbose(cfg.pop("mode"), MODES)

    if "detx_file" in cfg:
        # A detx file may come from the toml OR the argument, never both.
        if detx_file is not None:
            raise ValueError("detx_file passed to run AND defined in toml")
        detx_file = cfg.pop("detx_file")

    extractor = None
    if "extractor" in cfg:
        extractor_kwargs = cfg.pop("extractor_config", {})
        extractor_factory = _get_verbose(cfg.pop("extractor"), EXTRACTORS)
        extractor = extractor_factory(infile, **extractor_kwargs)

    # All remaining toml entries are forwarded to the processor class.
    return mode_cls(det_file=detx_file, extractor=extractor, **cfg)
def _get_verbose(key, d):
if key not in d:
raise KeyError(f"Unknown key '{key}' (available: {list(d.keys())})")
return d[key]
"""
Run OrcaSong functionalities from command line.
"""
import argparse
from orcasong.tools.concatenate import concatenate
from orcasong.tools.postproc import postproc_file
from orcasong.tools.shuffle2 import h5shuffle2
import orcasong.from_toml as from_toml
import orcasong.plotting.plot_binstats as plot_binstats
import orcasong.tools.make_data_split as make_data_split
def _add_parser_concatenate(subparsers):
    """Attach the 'concatenate' subcommand to the orcasong parser.

    Fixes typos in the user-facing help text ("datsets", "pathes",
    "absoulte" -> "datasets", "paths", "absolute").
    """
    parser = subparsers.add_parser(
        "concatenate",
        description='Concatenate many small h5 files to a single large one '
                    'in a km3pipe compatible format. This is intended for '
                    'files that get generated by orcasong, i.e. all datasets '
                    'should have the same length, with one row per '
                    'blob. '
                    'Compression options and the datasets to be created in '
                    'the new file will be read from the first input file.')
    parser.add_argument(
        'file', type=str, nargs="*",
        help="Define the files to concatenate. If it's one argument: A txt list "
             "with paths of h5 files to concatenate (one path per line). "
             "If it's multiple arguments: "
             "The paths of h5 files to concatenate.")
    parser.add_argument(
        '--outfile', type=str, default="concatenated.h5",
        help='The absolute filepath of the output .h5 file that will be created. ')
    parser.add_argument(
        '--no_used_files', action='store_true',
        help="Per default, the paths of the input files are added "
             "as their own datagroup in the output file. Use this flag to "
             "disable. ")
    parser.add_argument(
        '--skip_errors', action='store_true',
        help="If true, ignore files that can't be concatenated. ")
    # Dispatch to the concatenate function imported at module level.
    parser.set_defaults(func=concatenate)
def _add_parser_h5shuffle(subparsers):
    """Attach the 'h5shuffle' subcommand to the orcasong parser."""
    parser = subparsers.add_parser(
        "h5shuffle",
        description='Shuffle an h5 file using km3pipe.',
    )
    parser.add_argument(
        'input_file', type=str, help='File to shuffle.')
    parser.add_argument(
        '--output_file', type=str,
        help='Name of output file. Default: Auto generate name.')
    parser.add_argument(
        '--delete', action="store_true",
        help='Delete original file afterwards.')
    # Shuffling is implemented by postproc_file from orcasong.tools.postproc.
    parser.set_defaults(func=postproc_file)
def _add_parser_h5shuffle2(subparsers):
    """Attach the 'h5shuffle2' (faster, beta) subcommand to the orcasong parser."""
    parser = subparsers.add_parser(
        "h5shuffle2",
        description="Shuffle datasets in a h5file that have the same length. "
                    "Uses chunkwise readout for speed-up."
    )
    parser.add_argument(
        "input_file", type=str,
        help="Path of the file that will be shuffled.",
    )
    parser.add_argument(
        "--output_file", type=str, default=None,
        help="If given, this will be the name of the output file. "
             "Default: input_file + suffix.",
    )
    parser.add_argument(
        "--datasets", type=str, nargs="*", default=("x", "y"),
        help="Which datasets to include in output. Default: x, y",
    )
    parser.add_argument(
        "--max_ram_fraction", type=float, default=0.25,
        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
             "Note: this should "
             "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
             "Default: 0.25",
    )
    parser.add_argument(
        "--iterations", type=int, default=None,
        help="Shuffle the file this many times. Default: Auto choose best number.",
    )
    parser.add_argument(
        "--max_ram", type=int, default=None,
        help="Available ram in bytes. Default: Use fraction of maximum "
             "available instead (see max_ram_fraction).",
    )
    # Implemented by h5shuffle2 from orcasong.tools.shuffle2.
    parser.set_defaults(func=h5shuffle2)
def _add_parser_version(subparsers):
def show_version():
from orcasong import version
print(version)
parser = subparsers.add_parser(
"version",
description="Show installed orcanet version.",
)
parser.set_defaults(func=show_version)
def main():
    """Entry point of the 'orcasong' command line interface.

    Builds the top-level parser, attaches all subcommands, then dispatches
    to the function registered via set_defaults(func=...).
    """
    parser = argparse.ArgumentParser(
        prog="orcasong",
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    subparsers = parser.add_subparsers()
    from_toml.add_parser_run(subparsers)
    _add_parser_concatenate(subparsers)
    _add_parser_h5shuffle(subparsers)
    _add_parser_h5shuffle2(subparsers)
    plot_binstats.add_parser(subparsers)
    make_data_split.add_parser(subparsers)
    _add_parser_version(subparsers)
    kwargs = vars(parser.parse_args())
    # Fix: calling 'orcasong' without a subcommand used to crash with a
    # KeyError on 'func'; show the help text instead.
    func = kwargs.pop("func", None)
    if func is None:
        parser.print_help()
        parser.exit(1)
    func(**kwargs)
......@@ -218,6 +218,9 @@ def get_all_h5_files():
def main():
# TODO deprecated
warnings.warn(
"plot_binstats is deprecated and has been renamed to orcasong plot_binstats")
parser = argparse.ArgumentParser(
description='Generate a plot with statistics of the binning. '
'Can only be used on files generated with the FileBinner when '
......@@ -232,5 +235,20 @@ def main():
plot_hist_of_files(**vars(parser.parse_args()))
def add_parser(subparsers):
    """Attach the 'plot_binstats' subcommand to the orcasong parser."""
    description = (
        'Generate a plot with statistics of the binning. '
        'Can only be used on files generated with the FileBinner when '
        'add_bin_stats was set to true (default). ')
    parser = subparsers.add_parser("plot_binstats", description=description)
    parser.add_argument(
        '--save_as', type=str, default="bin_stats_plot.pdf",
        help='Filename of the plot. Default: bin_stats_plot.pdf.')
    parser.add_argument(
        'files', type=str, nargs='*', default=None,
        help='File(s) to plot. Default: Plot for all h5 files in current dir.')
    # Plotting is done by plot_hist_of_files, defined in this module.
    parser.set_defaults(func=plot_hist_of_files)
if __name__ == "__main__":
main()
......@@ -305,7 +305,27 @@ def _copy_attrs(src_datset, target_dataset):
warnings.warn(f"Error: Can not copy attribute {k}: {e}")
def get_parser():
def concatenate(file, outfile="concatenated.h5", no_used_files=False, skip_errors=False):
    """ Concatenate wrapped in a function. """
    if len(file) == 1:
        # A single argument is a txt list with one h5 path per line.
        fc = FileConcatenator.from_list(file[0], skip_errors=skip_errors)
    else:
        # Multiple arguments are the h5 paths themselves.
        fc = FileConcatenator(input_files=file, skip_errors=skip_errors)
    fc.concatenate(outfile, append_used_files=not no_used_files)
def main():
# TODO deprecated
warnings.warn("concatenate is deprecated and has been renamed to orcasong concatenate")
parser = argparse.ArgumentParser(
description='Concatenate many small h5 files to a single large one '
'in a km3pipe compatible format. This is intended for '
......@@ -331,27 +351,7 @@ def get_parser():
parser.add_argument(
'--skip_errors', action='store_true',
help="If true, ignore files that can't be concatenated. ")
return parser
def main():
parser = get_parser()
parsed_args = parser.parse_args()
if len(parsed_args.file) == 1:
fc = FileConcatenator.from_list(
parsed_args.file[0],
skip_errors=parsed_args.skip_errors
)
else:
fc = FileConcatenator(
input_files=parsed_args.file,
skip_errors=parsed_args.skip_errors
)
fc.concatenate(
parsed_args.outfile,
append_used_files=not parsed_args.no_used_files,
)
concatenate(**vars(parser.parse_args()))
if __name__ == '__main__':
......
......@@ -5,6 +5,7 @@
__author__ = "Michael Moser, Daniel Guderian"
import os
import warnings
import toml
import argparse
import h5py
......@@ -13,6 +14,8 @@ import numpy as np
def get_parser():
# TODO deprecated
warnings.warn("make_data_split is deprecated and has been renamed to orcasong make_data_split")
parser = argparse.ArgumentParser(
description="Create datasets based on the run_id's."
"Use the config to add input folder and set the ranges."
......@@ -26,6 +29,20 @@ def get_parser():
return parser
def add_parser(subparsers):
    """Attach the 'make_data_split' subcommand to the orcasong parser.

    Fixes the typo 'specfied' -> 'specified' in the help text.
    """
    parser = subparsers.add_parser(
        "make_data_split",
        description="Create datasets based on the run_id's."
                    "Use the config to add input folder and set the ranges."
                    "Outputs a list in an txt file that can be used to "
                    "concatenate the files specified"
    )
    parser.add_argument(
        "config", type=str, help="See example config for detailed information"
    )
    parser.set_defaults(func=make_split)
def get_all_ip_group_keys(cfg):
"""
Gets the keys of all input groups in the config dict.
......@@ -418,9 +435,10 @@ def main():
# load the config
parser = get_parser()
parsed_args = parser.parse_args()
make_split(parsed_args.config)
config_file = parsed_args.config
def make_split(config_file):
# decode config
cfg = toml.load(config_file)
cfg["toml_filename"] = config_file
......
......@@ -3,6 +3,8 @@ Scripts for postprocessing h5 files, e.g. shuffling.
"""
import os
import argparse
import warnings
import h5py
import km3pipe as kp
import km3modules as km
......@@ -115,6 +117,8 @@ def get_filepath_output(input_file, shuffle=True, event_skipper=None):
def h5shuffle():
# TODO deprecated
warnings.warn("h5shuffle is deprecated and has been renamed to orcasong h5shuffle")
parser = argparse.ArgumentParser(description='Shuffle an h5 file using km3pipe.')
parser.add_argument('input_file', type=str, help='File to shuffle.')
parser.add_argument('--output_file', type=str,
......
......@@ -2,6 +2,8 @@ import os
import time
import datetime
import argparse
import warnings
import numpy as np
import psutil
import h5py
......@@ -335,6 +337,8 @@ def slicify(fancy_indices):
def run_parser():
# TODO deprecated
warnings.warn("h5shuffle2 is deprecated and has been renamed to orcasong h5shuffle2")
parser = argparse.ArgumentParser(
description="Shuffle datasets in a h5file that have the same length. "
"Uses chunkwise readout for speed-up."
......
......@@ -27,11 +27,13 @@ setup(
'tag_regex': r'^(?P<prefix>v)?(?P<version>[^\+]+)(?P<suffix>.*)?$', },
entry_points={'console_scripts': [
'orcasong=orcasong.parser:main',
# TODO all deprecated:
'make_dsplit=orcasong.tools.make_data_split:main',
'plot_binstats=orcasong.plotting.plot_binstats:main',
'concatenate=orcasong.tools.concatenate:main',
'h5shuffle=orcasong.tools.postproc:h5shuffle',
'h5shuffle2=orcasong.tools.shuffle2:run_parser',
'make_dsplit=orcasong.tools.make_data_split:main',
'plot_binstats=orcasong.plotting.plot_binstats:main',
]}
)
......
from unittest import TestCase
import os
import orcasong
import orcasong.from_toml as from_toml
# Path to the repo's examples/ directory, resolved as the parent of the
# installed orcasong package directory — TODO confirm this holds for
# non-editable installs.
EXAMPLES = os.path.join(
    os.path.dirname(os.path.dirname(orcasong.__file__)), "examples"
)
def _test_extr(infile):
return infile + "_extr"
# Replace the real 'neutrino_mc' extractor with the dummy, so that
# setup_processor can be exercised without a real input file.
orcasong.from_toml.EXTRACTORS["neutrino_mc"] = _test_extr
class TestSetupProcessorExampleConfig(TestCase):
    """Check that setup_processor applies the example toml config correctly."""

    def setUp(self):
        config_path = os.path.join(EXAMPLES, "orcasong_example.toml")
        self.processor = from_toml.setup_processor(
            infile="test_in",
            toml_file=config_path,
            detx_file="test_det",
        )

    def test_time_window(self):
        # Value from the 'time_window' entry of the example config.
        self.assertEqual([-100, 5000], self.processor.time_window)

    def test_max_n_hits(self):
        # Value from the 'max_n_hits' entry of the example config.
        self.assertEqual(2000, self.processor.max_n_hits)

    def test_chunksize(self):
        # Value from the 'chunksize' entry of the example config.
        self.assertEqual(16, self.processor.chunksize)

    def test_extractor_is_dummy_extractor(self):
        # The patched-in dummy extractor tags the infile name.
        self.assertEqual("test_in_extr", self.processor.extractor)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment