From d87c6422a79b85749a2175df625d8c0c87a97e4e Mon Sep 17 00:00:00 2001
From: Stefan Reck <stefan.reck@fau.de>
Date: Thu, 28 Jan 2021 11:46:59 +0100
Subject: [PATCH] black

---
 orcasong/tools/make_data_split.py | 369 +++++++++++++++++-------------
 1 file changed, 206 insertions(+), 163 deletions(-)

diff --git a/orcasong/tools/make_data_split.py b/orcasong/tools/make_data_split.py
index b7646e1..8379862 100644
--- a/orcasong/tools/make_data_split.py
+++ b/orcasong/tools/make_data_split.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
-__author__ = 'Michael Moser, Daniel Guderian'
+__author__ = "Michael Moser, Daniel Guderian"
 
 import os
 import toml
@@ -15,15 +15,17 @@ import numpy as np
 
 def get_parser():
     parser = argparse.ArgumentParser(
         description="Create datasets based on the run_id's."
-            "Use the config to add input folder and set the ranges."
-            "Outputs a list in an txt file that can be used to "
-            "concatenate the files specfied")
+        "Use the config to add input folder and set the ranges."
+        "Outputs a list in a txt file that can be used to "
+        "concatenate the files specified"
+    )
     parser.add_argument(
-        'config', type=str,
-        help="See example config for detailed information")
-
+        "config", type=str, help="See example config for detailed information"
+    )
+
     return parser
 
+
 def get_all_ip_group_keys(cfg):
     """
     Gets the keys of all input groups in the config dict.
@@ -66,19 +68,19 @@ def get_h5_filepaths(dirpath):
     """
     filepaths = []
     for f in os.listdir(dirpath):
-        if f.endswith('.h5'):
-            filepaths.append(dirpath + '/' + f)
+        if f.endswith(".h5"):
+            filepaths.append(dirpath + "/" + f)
 
-    #randomize order
+    # randomize order
     random.Random(42).shuffle(filepaths)
-
+
     return filepaths
 
 
-def get_number_of_evts(file,dataset_key="y"):
+def get_number_of_evts(file, dataset_key="y"):
     """
     Returns the number of events of a file looking at the given dataset.
-
+
     Parameters
     ----------
     file : h5 file
         The file to look up.
     dataset_key : str
         Name of the dataset.
 
     Returns
     -------
     n_evts : int
         The number of events in that file.
-
+
     """
-    f = h5py.File(file, 'r')
+    f = h5py.File(file, "r")
     dset = f[dataset_key]
     n_evts = dset.shape[0]
     f.close()
-
+
     return n_evts
-
-def get_number_of_evts_and_run_ids(list_of_files, dataset_key='y', run_id_col_name='run_id'):
+
+
+def get_number_of_evts_and_run_ids(
+    list_of_files, dataset_key="y", run_id_col_name="run_id"
+):
     """
     Gets the number of events and the run_ids for all hdf5 files in the list_of_files.
@@ -130,7 +135,7 @@ def get_number_of_evts_and_run_ids(list_of_files, dataset_key='y', run_id_col_na
     run_ids = []
 
     for i, fpath in enumerate(list_of_files):
-        f = h5py.File(fpath, 'r')
+        f = h5py.File(fpath, "r")
         dset = f[dataset_key]
         n_evts = dset.shape[0]
@@ -165,7 +170,9 @@ def split(a, n):
     """
     # from https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
     k, m = divmod(len(a), n)
-    a_split = list((a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)))
+    a_split = list(
+        (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
+    )
     return a_split
@@ -182,21 +189,27 @@ def print_input_statistics(cfg, ip_group_keys):
     """
 
-    print('----------------------------------------------------------------------')
-    print('Printing input statistics for your ' + cfg['toml_filename'] + ' input:')
-    print('----------------------------------------------------------------------')
+    print("----------------------------------------------------------------------")
+    print("Printing input statistics for your " + cfg["toml_filename"] + " input:")
+    print("----------------------------------------------------------------------")
 
-    print('Your input .toml file has the following data input groups: ' + str(ip_group_keys))
-    print('Total number of events: ' + str(cfg['n_evts_total']))
+    print(
+        "Your input .toml file has the following data input groups: "
+        + str(ip_group_keys)
+    )
+    print("Total number of events: " + str(cfg["n_evts_total"]))
 
     for key in ip_group_keys:
-        print('--------------------------------------------------------------------')
-        print('Info for group ' + key + ':')
-        print('Directory: ' + cfg[key]['dir'])
-        print('Total number of files: ' + str(cfg[key]['n_files']))
-        print('Total number of events: ' + str(cfg[key]['n_evts']))
-        print('Mean number of events per file: ' + str(round(cfg[key]['n_evts_per_file_mean'], 3)))
-        print('--------------------------------------------------------------------')
+        print("--------------------------------------------------------------------")
+        print("Info for group " + key + ":")
+        print("Directory: " + cfg[key]["dir"])
+        print("Total number of files: " + str(cfg[key]["n_files"]))
+        print("Total number of events: " + str(cfg[key]["n_evts"]))
+        print(
+            "Mean number of events per file: "
+            + str(round(cfg[key]["n_evts_per_file_mean"], 3))
+        )
+        print("--------------------------------------------------------------------")
 
 
 def add_fpaths_for_data_split_to_cfg(cfg, key):
     """
@@ -213,25 +226,29 @@ def add_fpaths_for_data_split_to_cfg(cfg, key):
     """
 
-    fpath_lists = {'train': [], 'validate': [], 'rest': []}
-    for i, fpath in enumerate(cfg[key]['fpaths']):
+    fpath_lists = {"train": [], "validate": [], "rest": []}
+    for i, fpath in enumerate(cfg[key]["fpaths"]):
 
-        run_id = cfg[key]['run_ids'][i]
+        run_id = cfg[key]["run_ids"][i]
 
-        for dsplit in ['train', 'validate', 'rest']:
-            if 'run_ids_' + dsplit in cfg[key]:
-                if cfg[key]['run_ids_' + dsplit][0] <= run_id <= cfg[key]['run_ids_' + dsplit][1]:
+        for dsplit in ["train", "validate", "rest"]:
+            if "run_ids_" + dsplit in cfg[key]:
+                if (
+                    cfg[key]["run_ids_" + dsplit][0]
+                    <= run_id
+                    <= cfg[key]["run_ids_" + dsplit][1]
+                ):
                     fpath_lists[dsplit].append(fpath)
 
-    for dsplit in ['train', 'validate', 'rest']:
+    for dsplit in ["train", "validate", "rest"]:
         if len(fpath_lists[dsplit]) == 0:
             continue
 
-        n_files_dsplit = cfg['n_files_' + dsplit]
+        n_files_dsplit = cfg["n_files_" + dsplit]
         fpath_lists[dsplit] = split(fpath_lists[dsplit], n_files_dsplit)
 
-        if 'output_' + dsplit not in cfg:
-            cfg['output_' + dsplit] = dict()
-        cfg['output_' + dsplit][key] = fpath_lists[dsplit]
+        if "output_" + dsplit not in cfg:
+            cfg["output_" + dsplit] = dict()
+        cfg["output_" + dsplit][key] = fpath_lists[dsplit]
 
 
 def make_dsplit_list_files(cfg):
     """
@@ -245,55 +262,63 @@ def make_dsplit_list_files(cfg):
     """
 
     # check if //conc_list_files folder exists, if not create it.
-    if not os.path.exists(cfg['output_file_folder'] + '/conc_list_files'):
-        os.makedirs(cfg['output_file_folder'] + '/conc_list_files')
-
+    if not os.path.exists(cfg["output_file_folder"] + "/conc_list_files"):
+        os.makedirs(cfg["output_file_folder"] + "/conc_list_files")
+
     print()
     print()
     print("In a run-by-run MC the run_id's might not be continuous.")
-    print("Here are the actual numbers in the split sets:")
+    print("Here are the actual numbers in the split sets:")
     print("----------------------------------------------")
-
-    #loop over the different specified sets
-    for dsplit in ['train', 'validate', 'rest']:
-        if 'output_' + dsplit not in cfg:
+    # loop over the different specified sets
+    for dsplit in ["train", "validate", "rest"]:
+
+        if "output_" + dsplit not in cfg:
             continue
-
-        print(dsplit,"set:")
-
-        first_key = list(cfg['output_' + dsplit].keys())[0]
-        n_output_files = len(cfg['output_' + dsplit][first_key])
-
-        #initialize counter of events for all input groups
-        imput_groups_dict = cfg['output_' + dsplit]
-        final_number_of_events = np.zeros(len(imput_groups_dict))
-
-        #loop over the number of outputfiles for each set
-        for i in range(n_output_files):
-            fpath_output = cfg['output_file_folder'] + '/conc_list_files/' + cfg['output_file_name'] + '_' + dsplit + '_' + str(i) + '.txt'
-            # save the txt list
-            if 'output_lists' not in cfg:
-                cfg['output_lists'] = list()
-            cfg['output_lists'].append(fpath_output)
+        print(dsplit, "set:")
+
+        first_key = list(cfg["output_" + dsplit].keys())[0]
+        n_output_files = len(cfg["output_" + dsplit][first_key])
+
+        # initialize counter of events for all input groups
+        imput_groups_dict = cfg["output_" + dsplit]
+        final_number_of_events = np.zeros(len(imput_groups_dict))
 
-            with open(fpath_output, 'w') as f_out:
+        # loop over the number of output files for each set
+        for i in range(n_output_files):
+            fpath_output = (
+                cfg["output_file_folder"]
+                + "/conc_list_files/"
+                + cfg["output_file_name"]
+                + "_"
+                + dsplit
+                + "_"
+                + str(i)
+                + ".txt"
+            )
+
+            # save the txt list
+            if "output_lists" not in cfg:
+                cfg["output_lists"] = list()
+            cfg["output_lists"].append(fpath_output)
+
+            with open(fpath_output, "w") as f_out:
                 for j in range(len(imput_groups_dict)):
                     keys = list(imput_groups_dict.keys())
-
+
                     for fpath in imput_groups_dict[keys[j]][i]:
-                        #also count here the actual sizes
+                        # also count here the actual sizes
                        final_number_of_events[j] += get_number_of_evts(fpath)
-                        f_out.write(fpath + '\n')
-
-        #and then print them
+                        f_out.write(fpath + "\n")
+
+        # and then print them
         for i in range(len(imput_groups_dict)):
-            print(keys[i],":",int(final_number_of_events[i]))
-
+            print(keys[i], ":", int(final_number_of_events[i]))
+
     print("----------------------------------------------")
-
-
+
 
 def make_concatenate_and_shuffle_scripts(cfg):
     """
@@ -306,123 +331,141 @@ def make_concatenate_and_shuffle_scripts(cfg):
     """
 
-    dirpath = cfg['output_file_folder']
-
-    if not os.path.exists(dirpath + '/logs'): # check if /logs folder exists, if not create it.
-        os.makedirs(dirpath + '/logs')
-    if not os.path.exists(dirpath + '/job_scripts'): # check if /job_scripts folder exists, if not create it.
-        os.makedirs(dirpath + '/job_scripts')
-    if not os.path.exists(dirpath + '/data_split'): # check if /data_split folder exists, if not create it.
-        os.makedirs(dirpath + '/data_split')
-
-    #not available atm...
-    #chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
-    #complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
-    #complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
+    dirpath = cfg["output_file_folder"]
+
+    if not os.path.exists(
+        dirpath + "/logs"
+    ):  # check if /logs folder exists, if not create it.
+        os.makedirs(dirpath + "/logs")
+    if not os.path.exists(
+        dirpath + "/job_scripts"
+    ):  # check if /job_scripts folder exists, if not create it.
+        os.makedirs(dirpath + "/job_scripts")
+    if not os.path.exists(
+        dirpath + "/data_split"
+    ):  # check if /data_split folder exists, if not create it.
+        os.makedirs(dirpath + "/data_split")
+
+    # not available atm...
+    # chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
+    # complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
+    # complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
 
     # make qsub .sh file for concatenating
-    for listfile_fpath in cfg['output_lists']:
+    for listfile_fpath in cfg["output_lists"]:
         listfile_fname = os.path.basename(listfile_fpath)
         listfile_fname_wout_ext = os.path.splitext(listfile_fname)[0]
 
-        conc_outputfile_fpath = cfg['output_file_folder'] + '/data_split/' + listfile_fname_wout_ext + '.h5'
-
-        fpath_bash_script = dirpath + '/job_scripts/concatenate_h5_' + listfile_fname_wout_ext + '.sh'
-
-        with open(fpath_bash_script, 'w') as f:
-            f.write('#!/usr/bin/env bash\n')
-            f.write('\n')
-            f.write('source ' + cfg['venv_path'] + 'activate' + '\n')
-            f.write('\n')
-            f.write('# Concatenate the files in the list\n')
-
-            f.write('concatenate ' + listfile_fpath + ' --outfile ' + conc_outputfile_fpath)
-            # at the moment it is not possible to set the comp opts like this+ chunksize + complib + complevel
-
-
+        conc_outputfile_fpath = (
+            cfg["output_file_folder"] + "/data_split/" + listfile_fname_wout_ext + ".h5"
+        )
+
+        fpath_bash_script = (
+            dirpath + "/job_scripts/concatenate_h5_" + listfile_fname_wout_ext + ".sh"
+        )
+
+        with open(fpath_bash_script, "w") as f:
+            f.write("#!/usr/bin/env bash\n")
+            f.write("\n")
+            f.write("source " + cfg["venv_path"] + "activate" + "\n")
+            f.write("\n")
+            f.write("# Concatenate the files in the list\n")
+
+            f.write(
+                "concatenate " + listfile_fpath + " --outfile " + conc_outputfile_fpath
+            )
+            # at the moment it is not possible to set the comp opts like this: + chunksize + complib + complevel
+
     # make qsub .sh file for shuffling
-    for listfile_fpath in cfg['output_lists']:
+    for listfile_fpath in cfg["output_lists"]:
         listfile_fname = os.path.basename(listfile_fpath)
         listfile_fname_wout_ext = os.path.splitext(listfile_fname)[0]
 
         # This is the input for the shuffle tool!
-        conc_outputfile_fpath = cfg['output_file_folder'] + '/data_split/' + listfile_fname_wout_ext + '.h5'
-
-        fpath_bash_script = dirpath + '/job_scripts/shuffle_h5_' + listfile_fname_wout_ext + '.sh'
-
-        with open(fpath_bash_script, 'w') as f:
-            f.write('#!/usr/bin/env bash\n')
-            f.write('\n')
-            f.write('source ' + cfg['venv_path'] + 'activate \n')
-            f.write('\n')
-            f.write('# Shuffle the h5 file \n')
-
-            f.write('h5shuffle2 ' + conc_outputfile_fpath + ' --max_ram 1000000000 \n') #fix to 1GB ram; in lyon using a fraction
-            #is difficult...
-            #time python shuffle/shuffle_h5.py'
-            #+ delete_flag_shuffle_tool
-            #+ chunksize + complib + complevel
-
-            if cfg['shuffle_delete']:
-                f.write('\n')
-                f.write('rm ' + conc_outputfile_fpath + '\n')
-
+        conc_outputfile_fpath = (
+            cfg["output_file_folder"] + "/data_split/" + listfile_fname_wout_ext + ".h5"
+        )
+
+        fpath_bash_script = (
+            dirpath + "/job_scripts/shuffle_h5_" + listfile_fname_wout_ext + ".sh"
+        )
+
+        with open(fpath_bash_script, "w") as f:
+            f.write("#!/usr/bin/env bash\n")
+            f.write("\n")
+            f.write("source " + cfg["venv_path"] + "activate \n")
+            f.write("\n")
+            f.write("# Shuffle the h5 file \n")
+
+            f.write(
+                "h5shuffle2 " + conc_outputfile_fpath + " --max_ram 1000000000 \n"
+            )  # fix to 1 GB RAM; in Lyon using a fraction
+            # is difficult...
+            # time python shuffle/shuffle_h5.py'
+            # + delete_flag_shuffle_tool
+            # + chunksize + complib + complevel
+
+            if cfg["shuffle_delete"]:
+                f.write("\n")
+                f.write("rm " + conc_outputfile_fpath + "\n")
+
 
 def main():
     """
     Main function to make the data split.
     """
-    #load the config
+    # load the config
     parser = get_parser()
     parsed_args = parser.parse_args()
    config_file = parsed_args.config
-
-    #decode config
+
+    # decode config
     cfg = toml.load(config_file)
-    cfg['toml_filename'] = config_file
-
-    #set some defaults/Nones - at the moment setting of the com opts is not available!
-    #if 'chunksize' not in cfg: cfg['chunksize'] = None
-    #if 'complib' not in cfg: cfg['complib'] = None
-    #if 'complevel' not in cfg: cfg['complevel'] = None
-
-    #read out all the input groups
+    cfg["toml_filename"] = config_file
+
+    # set some defaults/Nones - at the moment setting of the comp opts is not available!
+    # if 'chunksize' not in cfg: cfg['chunksize'] = None
+    # if 'complib' not in cfg: cfg['complib'] = None
+    # if 'complevel' not in cfg: cfg['complevel'] = None
+
+    # read out all the input groups
     ip_group_keys = get_all_ip_group_keys(cfg)
-
-    #and now loop over input groups extracting info
+
+    # and now loop over input groups extracting info
     n_evts_total = 0
     for key in ip_group_keys:
-        print('Collecting information from input group ' + key)
-        cfg[key]['fpaths'] = get_h5_filepaths(cfg[key]['dir'])
-        cfg[key]['n_files'] = len(cfg[key]['fpaths'])
-        cfg[key]['n_evts'], cfg[key]['n_evts_per_file_mean'], cfg[key]['run_ids'] = get_number_of_evts_and_run_ids(cfg[key]['fpaths'], dataset_key='y')
-
-        n_evts_total += cfg[key]['n_evts']
-
-    cfg['n_evts_total'] = n_evts_total
-    #print the extracted statistics
+        print("Collecting information from input group " + key)
+        cfg[key]["fpaths"] = get_h5_filepaths(cfg[key]["dir"])
+        cfg[key]["n_files"] = len(cfg[key]["fpaths"])
+        (
+            cfg[key]["n_evts"],
+            cfg[key]["n_evts_per_file_mean"],
+            cfg[key]["run_ids"],
+        ) = get_number_of_evts_and_run_ids(cfg[key]["fpaths"], dataset_key="y")
+
+        n_evts_total += cfg[key]["n_evts"]
+
+    cfg["n_evts_total"] = n_evts_total
+    # print the extracted statistics
     print_input_statistics(cfg, ip_group_keys)
 
-    if cfg['print_only'] is True:
+    if cfg["print_only"] is True:
         from sys import exit
+
         exit()
-
+
     for key in ip_group_keys:
         add_fpaths_for_data_split_to_cfg(cfg, key)
-
-    #create the list files
+
+    # create the list files
     make_dsplit_list_files(cfg)
 
-    #create bash scripts that can be submitted to do the concatenation and shuffle
-    if cfg['make_qsub_bash_files'] is True:
+    # create bash scripts that can be submitted to do the concatenation and shuffle
+    if cfg["make_qsub_bash_files"] is True:
         make_concatenate_and_shuffle_scripts(cfg)
 
 
-if __name__ == '__main__':
-    main()
-
-
-
-
+if __name__ == "__main__":
+    main()
-- 
GitLab
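
Note: make_data_split.py is driven entirely by the toml config passed on the
command line, but no example config appears in this patch. The sketch below is
hypothetical: the key names are taken from the cfg[...] lookups visible in the
diff above, while all values, paths and group names ("neutrino", "muon") are
invented for illustration. How get_all_ip_group_keys() separates the input-group
tables from top-level options is not shown in this diff, and the example config
shipped with OrcaSong remains the authoritative reference.

    # Hypothetical data split config, written as the dict that
    # toml.load(config_file) would return for it.
    cfg = {
        "n_files_train": 3,       # read as cfg["n_files_" + dsplit]
        "n_files_validate": 1,
        "output_file_folder": "/project/data_splits",  # invented path
        "output_file_name": "my_split",                # invented name
        "print_only": False,           # True: print input statistics, then exit
        "make_qsub_bash_files": True,  # write concatenate/shuffle job scripts
        "venv_path": "/project/venv/bin/",  # used as "source " + venv_path + "activate"
        "shuffle_delete": False,  # True: the shuffle script removes its input file
        # one table per input group; run_id ranges are inclusive on both ends
        "neutrino": {
            "dir": "/project/h5/neutrino",  # scanned for .h5 files
            "run_ids_train": [1, 1000],
            "run_ids_validate": [1001, 1200],
        },
        "muon": {
            "dir": "/project/h5/muon",
            "run_ids_train": [1, 500],
            "run_ids_validate": [501, 600],
        },
    }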
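
The partitioning of each group's file list into the n_files_* output lists is
done by the split() helper, which the diff shows in full. A minimal,
self-contained sanity check of its behaviour (function body copied from the
patch, with only the intermediate variable inlined):

    def split(a, n):
        # cut list a into n parts of approximately equal length
        k, m = divmod(len(a), n)
        return list(a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

    # e.g. 10 files distributed over n = 3 concatenation lists:
    # divmod(10, 3) gives k = 3, m = 1, so the first part gets k + 1 = 4 entries
    # and the other two get k = 3 each.
    print(split(list(range(10)), 3))
    # -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]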