Merge branch 'ifx_amke_data_split_parser' into 'master'

Fix it and remove old stuff. See merge request !33

Merge branch 'ifx_amke_data_split_parser' into 'master'
3b8ce43e · Stefan Reck · 0c84e925 · 7dcc8814 · 3b8ce43e · 3b8ce43e
Commit 3b8ce43e authored 2 years ago by Stefan Reck
--- a/orcasong/extractors/neutrino_chain.py
+++ b/orcasong/extractors/neutrino_chain.py
@@ -262,7 +262,7 @@ def get_random_noise_mc_info_extr(input_file):
    return mc_info_extr


-def get_neutrino_mc_info_extr(input_file):
+def get_neutrino_mc_info_extr(input_file,prod_identifier=999):

    """
    Wrapper function that includes the actual mc_info_extr
@@ -273,6 +273,9 @@ def get_neutrino_mc_info_extr(input_file):
    ----------
    input_file : km3net data file
                    Can be online or offline format.
+    prod_identifier : int, optional
+        An internal, unofficial identifier that marks the neutrino production.
+        It has to be defined in a dict beforehand. Default is 999.

    Returns
    -------
@@ -399,6 +402,7 @@ def get_neutrino_mc_info_extr(input_file):
            "n_gen": n_gen,
            "part_number": part_number,
            "tau_topology": tau_topology,
+            "prod_identifier": prod_identifier,
        }

        # get all the std reco info

--- a/orcasong/tools/make_data_split.py
+++ b/orcasong/tools/make_data_split.py
@@ -13,13 +13,6 @@ import random
 import numpy as np


-def get_parser():
-    # TODO deprecated
-    raise NotImplementedError(
-        "make_data_split has been renamed to orcasong make_data_split"
-    )
-
-
 def add_parser(subparsers):
    parser = subparsers.add_parser(
        "make_data_split",
@@ -29,7 +22,7 @@ def add_parser(subparsers):
        "concatenate the files specfied",
    )
    parser.add_argument(
-        "config", type=str, help="See example config for detailed information"
+        "config_file", type=str, help="See example config for detailed information"
    )
    parser.set_defaults(func=make_split)

@@ -354,11 +347,6 @@ def make_concatenate_and_shuffle_scripts(cfg):
    ):  # check if /data_split folder exists, if not create it.
        os.makedirs(dirpath + "/data_split")

-    # not available atm...
-    # chunksize = '' if cfg['chunksize'] is None else ' --chunksize ' + str(cfg['chunksize'])
-    # complib = '' if cfg['complib'] is None else ' --complib ' + str(cfg['complib'])
-    # complevel = '' if cfg['complevel'] is None else ' --complevel ' + str(cfg['complevel'])
-
    # make qsub .sh file for concatenating
    for listfile_fpath in cfg["output_lists"]:
        listfile_fname = os.path.basename(listfile_fpath)
@@ -379,10 +367,9 @@ def make_concatenate_and_shuffle_scripts(cfg):
            f.write("# Concatenate the files in the list\n")

            f.write(
-                "concatenate " + listfile_fpath + " --outfile " + conc_outputfile_fpath
+                "orcasong concatenate " + listfile_fpath + " --outfile " + conc_outputfile_fpath
            )
-            # at the moment it is not possible to set the comp opts like this+ chunksize + complib + complevel
-
+          
    # make qsub .sh file for shuffling

    for listfile_fpath in cfg["output_lists"]:
@@ -406,39 +393,18 @@ def make_concatenate_and_shuffle_scripts(cfg):
            f.write("# Shuffle the h5 file \n")

            f.write(
-                "h5shuffle2 " + conc_outputfile_fpath + " --max_ram 1000000000 \n"
-            )  # fix to 1GB ram; in lyon using a fraction
-            # is difficult...
-            # time python shuffle/shuffle_h5.py'
-            # + delete_flag_shuffle_tool
-            # + chunksize + complib + complevel
+                "orcasong h5shuffle2 " + conc_outputfile_fpath)

            if cfg["shuffle_delete"]:
                f.write("\n")
                f.write("rm " + conc_outputfile_fpath + "\n")


-def main():
-    """
-    Main function to make the data split.
-    """
-
-    # load the config
-    parser = get_parser()
-    parsed_args = parser.parse_args()
-    make_split(parsed_args.config)
-
-
 def make_split(config_file):
    # decode config
    cfg = toml.load(config_file)
    cfg["toml_filename"] = config_file

-    # set some defaults/Nones - at the moment setting of the com opts is not available!
-    # if 'chunksize' not in cfg: cfg['chunksize'] = None
-    # if 'complib' not in cfg: cfg['complib'] = None
-    # if 'complevel' not in cfg: cfg['complevel'] = None
-
    # read out all the input groups
    ip_group_keys = get_all_ip_group_keys(cfg)

@@ -474,7 +440,3 @@ def make_split(config_file):
    # create bash scripts that can be submitted to do the concatenation and shuffle
    if cfg["make_qsub_bash_files"] is True:
        make_concatenate_and_shuffle_scripts(cfg)
-
-
-if __name__ == "__main__":
-    main()
--- a/tests/data/processed_data_muon/processed_graph_muon.h5
+++ b/tests/data/processed_data_muon/processed_graph_muon.h5
--- a/tests/data/processed_data_neutrino/processed_graph_neutrino.h5
+++ b/tests/data/processed_data_neutrino/processed_graph_neutrino.h5
--- a/tests/data/test_make_data_split_config.toml
+++ b/tests/data/test_make_data_split_config.toml
@@ -62,14 +62,14 @@ shuffle_delete = false

 [neutrino] 
 dir = "processed_data_neutrino"
-run_ids_train = [1, 6767]
-run_ids_validate = [1, 6769]
+run_ids_train = [1, 12000]
+run_ids_validate = [1, 12000]


 [muon] 
 dir = "processed_data_muon"
-run_ids_train = [1, 6767]
-run_ids_validate = [9999, 6769]
+run_ids_train = [1, 12000]
+run_ids_validate = [1, 12000]


-# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
\ No newline at end of file
+# --- Input groups : these are the datafiles, that should be concatenated somehow --- #
--- a/tests/test_extractor.py
+++ b/tests/test_extractor.py
@@ -35,7 +35,7 @@ class TestStdRecoExtractor(TestCase):
            max_n_hits=3,
            time_window=[0, 50],
            hit_infos=["pos_z", "time", "channel_id"],
-            extractor=extractors.get_neutrino_mc_info_extr(NEUTRINO_FILE),
+            extractor=extractors.get_neutrino_mc_info_extr(NEUTRINO_FILE,1),
            det_file=DET_FILE_NEUTRINO,
            add_t0=True,
            keep_event_info=True,
@@ -56,7 +56,7 @@ class TestStdRecoExtractor(TestCase):
            time_window=[0, 50],
            hit_infos=["pos_z", "time", "channel_id"],
            extractor=extractors.get_neutrino_mc_info_extr(
-                NOT_FULLY_RECONSTRUCTED_FILE
+                NOT_FULLY_RECONSTRUCTED_FILE,1
            ),
            det_file=DET_FILE_NEUTRINO,
            add_t0=True,

--- a/tests/test_make_data_split.py
+++ b/tests/test_make_data_split.py
@@ -65,12 +65,12 @@ class TestMakeDataSplit(TestCase):
            "processed_data_neutrino/processed_graph_neutrino.h5",
            "processed_data_neutrino/processed_graph_neutrino.h5\n",
        ]
-        cls.n_events_list = [18, 3]
+        cls.n_events_list = [50, 33]
        cls.contents_concatenate_script = [
-            "concatenate " + list_output_train + " --outfile " + concatenate_file
+            "orcasong concatenate " + list_output_train + " --outfile " + concatenate_file
        ]
        cls.contents_shuffle_script = [
-            "h5shuffle2 " + concatenate_file + " --max_ram 1000000000 \n"
+            "orcasong h5shuffle2 " + concatenate_file
        ]

        # create list_file_dir
@@ -127,7 +127,7 @@ class TestMakeDataSplit(TestCase):
        assert os.path.exists(list_output_val) == 1
        with open(list_output_val) as f:
            for line in f:
-                self.assertIn(line, self.file_path_list_val)
+                self.assertIn(line, self.file_path_list)
        f.close

        assert os.path.exists(list_output_train) == 1