From 7f6a045e61cc6913a9d9ff2dc7d97d44e9f7c87f Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Tue, 13 Apr 2021 19:08:27 +0200
Subject: [PATCH] move some commands to central parser

---
Reviewer notes (text in this section is not part of the commit message):
 * Line structure of this patch was restored from a copy in which all
   newlines and indentation had been collapsed. Hunk line counts match the
   headers; indentation of context and removed lines is reconstructed, so
   if a hunk is rejected retry with `git apply --ignore-whitespace`.
 * Two added lines in orcasong/parser.py differ from commit 7f6a045e, so
   the post-image blob id in its index line no longer matches:
   - main(): running `orcasong` without a subcommand used to die with
     KeyError: 'func'; it now prints the help text instead.
   - the "version" subcommand description said "orcanet"; it now says
     "orcasong".

 orcasong/parser.py            | 129 ++++++++++++++++++++++++++++++++++
 orcasong/tools/concatenate.py |  43 ++++++------
 orcasong/tools/postproc.py    |   3 +
 orcasong/tools/shuffle2.py    |   3 +
 setup.py                      |   6 +-
 5 files changed, 160 insertions(+), 24 deletions(-)
 create mode 100644 orcasong/parser.py

diff --git a/orcasong/parser.py b/orcasong/parser.py
new file mode 100644
index 0000000..a95bc7b
--- /dev/null
+++ b/orcasong/parser.py
@@ -0,0 +1,129 @@
+"""
+Run OrcaSong functionalities from command line.
+
+"""
+import argparse
+from orcasong.tools.concatenate import concatenate
+from orcasong.tools.postproc import postproc_file
+from orcasong.tools.shuffle2 import h5shuffle2
+
+
+def _add_parser_concatenate(subparsers):
+    parser = subparsers.add_parser(
+        "concatenate",
+        description='Concatenate many small h5 files to a single large one '
+                    'in a km3pipe compatible format. This is intended for '
+                    'files that get generated by orcasong, i.e. all datsets '
+                    'should have the same length, with one row per '
+                    'blob. '
+                    'Compression options and the datasets to be created in '
+                    'the new file will be read from the first input file.')
+    parser.add_argument(
+        'file', type=str, nargs="*",
+        help="Define the files to concatenate. If it's one argument: A txt list "
+             "with pathes of h5 files to concatenate (one path per line). "
+             "If it's multiple arguments: "
+             "The pathes of h5 files to concatenate.")
+    parser.add_argument(
+        '--outfile', type=str, default="concatenated.h5",
+        help='The absoulte filepath of the output .h5 file that will be created. ')
+    parser.add_argument(
+        '--no_used_files', action='store_true',
+        help="Per default, the paths of the input files are added "
+             "as their own datagroup in the output file. Use this flag to "
+             "disable. ")
+    parser.add_argument(
+        '--skip_errors', action='store_true',
+        help="If true, ignore files that can't be concatenated. ")
+    parser.set_defaults(func=concatenate)
+
+
+def _add_parser_h5shuffle(subparsers):
+    parser = subparsers.add_parser(
+        "h5shuffle",
+        description='Shuffle an h5 file using km3pipe.',
+    )
+    parser.add_argument('input_file', type=str, help='File to shuffle.')
+    parser.add_argument('--output_file', type=str,
+                        help='Name of output file. Default: Auto generate name.')
+    parser.add_argument('--delete', action="store_true",
+                        help='Delete original file afterwards.')
+    parser.set_defaults(func=postproc_file)
+
+
+def _add_parser_h5shuffle2(subparsers):
+    parser = subparsers.add_parser(
+        "h5shuffle2",
+        description="Shuffle datasets in a h5file that have the same length. "
+                    "Uses chunkwise readout for speed-up."
+    )
+    parser.add_argument(
+        "input_file", type=str, help="Path of the file that will be shuffled."
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        default=None,
+        help="If given, this will be the name of the output file. "
+             "Default: input_file + suffix.",
+    )
+    parser.add_argument(
+        "--datasets",
+        type=str,
+        nargs="*",
+        default=("x", "y"),
+        help="Which datasets to include in output. Default: x, y",
+    )
+    parser.add_argument(
+        "--max_ram_fraction",
+        type=float,
+        default=0.25,
+        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
+             "Note: this should "
+             "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
+             "Default: 0.25",
+    )
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=None,
+        help="Shuffle the file this many times. Default: Auto choose best number.",
+    )
+    parser.add_argument(
+        "--max_ram",
+        type=int,
+        default=None,
+        help="Available ram in bytes. Default: Use fraction of maximum "
+             "available instead (see max_ram_fraction).",
+    )
+    parser.set_defaults(func=h5shuffle2)
+
+
+def _add_parser_version(subparsers):
+    def show_version():
+        from orcasong import version
+        print(version)
+
+    parser = subparsers.add_parser(
+        "version",
+        description="Show installed orcasong version.",
+    )
+    parser.set_defaults(func=show_version)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="orcasong",
+        description=__doc__,
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    subparsers = parser.add_subparsers()
+
+    _add_parser_concatenate(subparsers)
+    _add_parser_h5shuffle(subparsers)
+    _add_parser_h5shuffle2(subparsers)
+    _add_parser_version(subparsers)
+
+    kwargs = vars(parser.parse_args())
+    func = kwargs.pop("func", parser.print_help)
+    func(**kwargs)
diff --git a/orcasong/tools/concatenate.py b/orcasong/tools/concatenate.py
index 4f3ed15..0a86218 100644
--- a/orcasong/tools/concatenate.py
+++ b/orcasong/tools/concatenate.py
@@ -305,7 +305,26 @@ def _copy_attrs(src_datset, target_dataset):
             warnings.warn(f"Error: Can not copy attribute {k}: {e}")
 
 
-def get_parser():
+def concatenate(file, outfile="concatenated.h5", no_used_files=False, skip_errors=False):
+    """ Concatenate wrapped in a function. """
+    if len(file) == 1:
+        fc = FileConcatenator.from_list(
+            file[0],
+            skip_errors=skip_errors
+        )
+    else:
+        fc = FileConcatenator(
+            input_files=file,
+            skip_errors=skip_errors
+        )
+    fc.concatenate(
+        outfile,
+        append_used_files=not no_used_files,
+    )
+
+
+def main():
+    warnings.warn("concatenate is deprecated and has been renamed to orcasong concatenate")
     parser = argparse.ArgumentParser(
         description='Concatenate many small h5 files to a single large one '
                     'in a km3pipe compatible format. This is intended for '
@@ -331,27 +350,7 @@ def get_parser():
     parser.add_argument(
         '--skip_errors', action='store_true',
         help="If true, ignore files that can't be concatenated. ")
-    return parser
-
-
-def main():
-    parser = get_parser()
-    parsed_args = parser.parse_args()
-
-    if len(parsed_args.file) == 1:
-        fc = FileConcatenator.from_list(
-            parsed_args.file[0],
-            skip_errors=parsed_args.skip_errors
-        )
-    else:
-        fc = FileConcatenator(
-            input_files=parsed_args.file,
-            skip_errors=parsed_args.skip_errors
-        )
-    fc.concatenate(
-        parsed_args.outfile,
-        append_used_files=not parsed_args.no_used_files,
-    )
+    concatenate(**vars(parser.parse_args()))
 
 
 if __name__ == '__main__':
diff --git a/orcasong/tools/postproc.py b/orcasong/tools/postproc.py
index b2191c8..acd3c38 100644
--- a/orcasong/tools/postproc.py
+++ b/orcasong/tools/postproc.py
@@ -3,6 +3,8 @@ Scripts for postprocessing h5 files, e.g. shuffling.
 """
 import os
 import argparse
+import warnings
+
 import h5py
 import km3pipe as kp
 import km3modules as km
@@ -115,6 +117,7 @@ def get_filepath_output(input_file, shuffle=True, event_skipper=None):
 
 
 def h5shuffle():
+    warnings.warn("h5shuffle is deprecated and has been renamed to orcasong h5shuffle")
     parser = argparse.ArgumentParser(description='Shuffle an h5 file using km3pipe.')
     parser.add_argument('input_file', type=str, help='File to shuffle.')
     parser.add_argument('--output_file', type=str,
diff --git a/orcasong/tools/shuffle2.py b/orcasong/tools/shuffle2.py
index 4185985..11083e1 100644
--- a/orcasong/tools/shuffle2.py
+++ b/orcasong/tools/shuffle2.py
@@ -2,6 +2,8 @@ import os
 import time
 import datetime
 import argparse
+import warnings
+
 import numpy as np
 import psutil
 import h5py
@@ -335,6 +337,7 @@ def slicify(fancy_indices):
 
 
 def run_parser():
+    warnings.warn("h5shuffle2 is deprecated and has been renamed to orcasong h5shuffle2")
     parser = argparse.ArgumentParser(
         description="Shuffle datasets in a h5file that have the same length. "
                     "Uses chunkwise readout for speed-up."
diff --git a/setup.py b/setup.py
index e7c3320..281186f 100644
--- a/setup.py
+++ b/setup.py
@@ -27,11 +27,13 @@ setup(
         'tag_regex': r'^(?P<prefix>v)?(?P<version>[^\+]+)(?P<suffix>.*)?$',
     },
     entry_points={'console_scripts': [
+        'orcasong=orcasong.parser:main',
+        'make_dsplit=orcasong.tools.make_data_split:main',
+        'plot_binstats=orcasong.plotting.plot_binstats:main',
+        # deprecated:
         'concatenate=orcasong.tools.concatenate:main',
         'h5shuffle=orcasong.tools.postproc:h5shuffle',
         'h5shuffle2=orcasong.tools.shuffle2:run_parser',
-        'make_dsplit=orcasong.tools.make_data_split:main',
-        'plot_binstats=orcasong.plotting.plot_binstats:main',
     ]}
 
 )
-- 
GitLab