Commit 0449ad6c authored by Stefan Reck

concatenate: allow different chunksizes per dataset

parent a375a0ea
Merge request !27: Adjust chunks
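
For context, the per-dataset chunksize can be supplied through the comptopts_update argument that the first hunk touches. A minimal usage sketch; the import path, the concatenate method name, and all file and dataset names are assumptions, not shown in this diff:

    from orcasong.tools.concatenate import FileConcatenator  # path assumed

    # "chunksize" may now be a dict keyed by dataset name instead of one int
    fc = FileConcatenator(
        ["in_1.h5", "in_2.h5"],
        comptopts_update={"chunksize": {"x": 64, "x_indices": 8}},
    )
    fc.concatenate("out.h5")  # method name assumed

A plain int still works and is applied to every dataset, as the isinstance check in the second hunk below shows.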
@@ -40,7 +40,6 @@
         self.input_files, self.cumu_rows = self._get_cumu_rows(input_files)
         # Get compression options from first file in the list
-        # TODO different chunksizes for different datasets!
         self.comptopts = get_compopts(self.input_files[0])
         if comptopts_update:
             self.comptopts.update(comptopts_update)
@@ -142,11 +141,14 @@ class FileConcatenator:
                 # first file; create the dataset
                 dset_shape = (self.cumu_rows[dset_name][-1],) + folder_data.shape[1:]
                 print(f"\tCreating dataset '{dset_name}' with shape {dset_shape}")
+                chunks = self.comptopts["chunksize"]
+                if isinstance(chunks, dict):
+                    chunks = chunks[dset_name]
                 output_dataset = f_out.create_dataset(
                     dset_name,
                     data=folder_data,
                     maxshape=dset_shape,
-                    chunks=(self.comptopts["chunksize"],) + folder_data.shape[1:],
+                    chunks=(chunks,) + folder_data.shape[1:],
                     compression=self.comptopts["complib"],
                     compression_opts=self.comptopts["complevel"],
                     shuffle=self.comptopts["shuffle"],
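
The added branch reduces to a small resolver: an int applies to all datasets alike, while a dict is looked up per dataset name (and deliberately raises KeyError for a name it does not contain). A standalone sketch with a hypothetical helper name, not part of the codebase:

    def resolve_chunksize(chunksize, dset_name):
        # dict: per-dataset chunk length along axis 0; int: same for all
        if isinstance(chunksize, dict):
            return chunksize[dset_name]
        return chunksize

    assert resolve_chunksize(5, "x") == 5
    assert resolve_chunksize({"x": 20, "x_indices": 3}, "x_indices") == 3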
@@ -264,26 +266,27 @@ def get_compopts(file):
         Specifies the compression level that should be used for saving
         the concatenated output files.
         A compression level is only available for gzip compression, not lzf!
-    chunksize : None/int
-        Specifies the chunksize for axis_0 in the concatenated output files.
+    chunksize : None/dict
+        Specifies the chunksize of each dataset for axis_0 in the
+        concatenated output files.
     shuffle : bool
         Enable shuffle filter for chunks.
     """
     with h5py.File(file, "r") as f:
-        # for reading the comptopts, take the first dataset that's not indexed
         dset_names = strip_keys(list(f.keys()))
+        comptopts = {}
+        comptopts["chunksize"] = {d: f[d].chunks[0] for d in dset_names}
+        # for reading the other comptopts, take the first dataset that's not indexed
         for dset_name in dset_names:
+            dset = f[dset_name]
             if f"{dset_name}_indices" not in dset_names:
                 break
-        dset = f[dset_name]
-        comptopts = {}
         comptopts["complib"] = dset.compression
         if comptopts["complib"] == "lzf":
             comptopts["complevel"] = None
         else:
             comptopts["complevel"] = dset.compression_opts
-        comptopts["chunksize"] = dset.chunks[0]
         comptopts["shuffle"] = dset.shuffle
     return comptopts
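
After this change, comptopts["chunksize"] maps every dataset name to the first axis of its HDF5 chunk shape. Stripped down to plain h5py, the new dict comprehension behaves roughly like the sketch below; the file name is illustrative, the strip_keys filtering is omitted, and all datasets are assumed to be chunked (h5py reports chunks as None for contiguous datasets, which would break the [0] access):

    import h5py

    with h5py.File("input.h5", "r") as f:
        chunksizes = {name: f[name].chunks[0] for name in f.keys()}
    # e.g. {"x": 20, "x_indices": 3} for files laid out like the test data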
@@ -31,7 +31,7 @@ class TestFileConcatenator(unittest.TestCase):
         cls.compt_opts = {
             "complib": "gzip",
             "complevel": 1,
-            "chunksize": 5,
+            "chunksize": {"numpy_array": 5, "rec_array": 5},
             "shuffle": False,
         }
@@ -159,6 +159,11 @@ class TestConcatenateIndexed(BaseTestClass.BaseIndexedFile):
             target_index = np.concatenate([[0], target_n_items.cumsum()[:-1]])
             np.testing.assert_array_equal(f_out["x_indices"]["index"], target_index)
 
+    def test_chunk_sizes_of_conc_file_are_the_same_as_in_input(self):
+        with h5py.File(self.outfile) as f_out:
+            self.assertTupleEqual(f_out["x"].chunks, (20,))
+            self.assertTupleEqual(f_out["x_indices"].chunks, (3,))
+
 
 def _create_dummy_file(filepath, columns=10, val_array=1, val_recarray=(1, 3)):
     """ Create a dummy h5 file with an array and a recarray in it. """