""" Run OrcaSong functionalities from command line. """ import argparse from orcasong.tools.concatenate import concatenate from orcasong.tools.postproc import postproc_file from orcasong.tools.shuffle2 import h5shuffle2 def _add_parser_concatenate(subparsers): parser = subparsers.add_parser( "concatenate", description='Concatenate many small h5 files to a single large one ' 'in a km3pipe compatible format. This is intended for ' 'files that get generated by orcasong, i.e. all datsets ' 'should have the same length, with one row per ' 'blob. ' 'Compression options and the datasets to be created in ' 'the new file will be read from the first input file.') parser.add_argument( 'file', type=str, nargs="*", help="Define the files to concatenate. If it's one argument: A txt list " "with pathes of h5 files to concatenate (one path per line). " "If it's multiple arguments: " "The pathes of h5 files to concatenate.") parser.add_argument( '--outfile', type=str, default="concatenated.h5", help='The absoulte filepath of the output .h5 file that will be created. ') parser.add_argument( '--no_used_files', action='store_true', help="Per default, the paths of the input files are added " "as their own datagroup in the output file. Use this flag to " "disable. ") parser.add_argument( '--skip_errors', action='store_true', help="If true, ignore files that can't be concatenated. ") parser.set_defaults(func=concatenate) def _add_parser_h5shuffle(subparsers): parser = subparsers.add_parser( "h5shuffle", description='Shuffle an h5 file using km3pipe.', ) parser.add_argument('input_file', type=str, help='File to shuffle.') parser.add_argument('--output_file', type=str, help='Name of output file. Default: Auto generate name.') parser.add_argument('--delete', action="store_true", help='Delete original file afterwards.') parser.set_defaults(func=postproc_file) def _add_parser_h5shuffle2(subparsers): parser = subparsers.add_parser( "h5shuffle2", description="Shuffle datasets in a h5file that have the same length. " "Uses chunkwise readout for speed-up." ) parser.add_argument( "input_file", type=str, help="Path of the file that will be shuffled." ) parser.add_argument( "--output_file", type=str, default=None, help="If given, this will be the name of the output file. " "Default: input_file + suffix.", ) parser.add_argument( "--datasets", type=str, nargs="*", default=("x", "y"), help="Which datasets to include in output. Default: x, y", ) parser.add_argument( "--max_ram_fraction", type=float, default=0.25, help="in [0, 1]. Fraction of all available ram to use for reading one batch of data " "Note: this should " "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. " "Default: 0.25", ) parser.add_argument( "--iterations", type=int, default=None, help="Shuffle the file this many times. Default: Auto choose best number.", ) parser.add_argument( "--max_ram", type=int, default=None, help="Available ram in bytes. Default: Use fraction of maximum " "available instead (see max_ram_fraction).", ) parser.set_defaults(func=h5shuffle2) def _add_parser_version(subparsers): def show_version(): from orcasong import version print(version) parser = subparsers.add_parser( "version", description="Show installed orcanet version.", ) parser.set_defaults(func=show_version) def main(): parser = argparse.ArgumentParser( prog="orcasong", description=__doc__, formatter_class=argparse.RawTextHelpFormatter, ) subparsers = parser.add_subparsers() _add_parser_concatenate(subparsers) _add_parser_h5shuffle(subparsers) _add_parser_h5shuffle2(subparsers) _add_parser_version(subparsers) kwargs = vars(parser.parse_args()) func = kwargs.pop("func") func(**kwargs)