Commit 0449ad6c authored by Stefan Reck

concatenate: allow different chunksizes per dataset

parent a375a0ea
Merge request !27: Adjust chunks
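
For context, the per-dataset chunksize can be supplied through the comptopts_update argument that the first hunk touches. A minimal usage sketch; the import path, the concatenate method name, and all file and dataset names are assumptions, not shown in this diff:

    from orcasong.tools.concatenate import FileConcatenator  # path assumed

    # "chunksize" may now be a dict keyed by dataset name instead of one int
    fc = FileConcatenator(
        ["in_1.h5", "in_2.h5"],
        comptopts_update={"chunksize": {"x": 64, "x_indices": 8}},
    )
    fc.concatenate("out.h5")  # method name assumed

A plain int still works and is applied to every dataset, as the isinstance check in the second hunk below shows.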
@@ -40,7 +40,6 @@
         self.input_files, self.cumu_rows = self._get_cumu_rows(input_files)
         # Get compression options from first file in the list
-        # TODO different chunksizes for different datasets!
         self.comptopts = get_compopts(self.input_files[0])
         if comptopts_update:
             self.comptopts.update(comptopts_update)
@@ -142,11 +141,14 @@ class FileConcatenator:
                 # first file; create the dataset
                 dset_shape = (self.cumu_rows[dset_name][-1],) + folder_data.shape[1:]
                 print(f"\tCreating dataset '{dset_name}' with shape {dset_shape}")
+                chunks = self.comptopts["chunksize"]
+                if isinstance(chunks, dict):
+                    chunks = chunks[dset_name]
                 output_dataset = f_out.create_dataset(
                     dset_name,
                     data=folder_data,
                     maxshape=dset_shape,
-                    chunks=(self.comptopts["chunksize"],) + folder_data.shape[1:],
+                    chunks=(chunks,) + folder_data.shape[1:],
                     compression=self.comptopts["complib"],
                     compression_opts=self.comptopts["complevel"],
                     shuffle=self.comptopts["shuffle"],
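
The added branch reduces to a small resolver: an int applies to all datasets alike, while a dict is looked up per dataset name (and deliberately raises KeyError for a name it does not contain). A standalone sketch with a hypothetical helper name, not part of the codebase:

    def resolve_chunksize(chunksize, dset_name):
        # dict: per-dataset chunk length along axis 0; int: same for all
        if isinstance(chunksize, dict):
            return chunksize[dset_name]
        return chunksize

    assert resolve_chunksize(5, "x") == 5
    assert resolve_chunksize({"x": 20, "x_indices": 3}, "x_indices") == 3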
@@ -264,26 +266,27 @@ def get_compopts(file):
         Specifies the compression level that should be used for saving
         the concatenated output files.
         A compression level is only available for gzip compression, not lzf!
-    chunksize : None/int
-        Specifies the chunksize for axis_0 in the concatenated output files.
+    chunksize : None/dict
+        Specifies the chunksize of each dataset for axis_0 in the
+        concatenated output files.
     shuffle : bool
         Enable shuffle filter for chunks.
     """
     with h5py.File(file, "r") as f:
-        # for reading the comptopts, take the first dataset that's not indexed
         dset_names = strip_keys(list(f.keys()))
+        comptopts = {}
+        comptopts["chunksize"] = {d: f[d].chunks[0] for d in dset_names}
+        # for reading the other comptopts, take the first dataset that's not indexed
         for dset_name in dset_names:
+            dset = f[dset_name]
             if f"{dset_name}_indices" not in dset_names:
                 break
-        dset = f[dset_name]
-        comptopts = {}
         comptopts["complib"] = dset.compression
         if comptopts["complib"] == "lzf":
             comptopts["complevel"] = None
         else:
             comptopts["complevel"] = dset.compression_opts
-        comptopts["chunksize"] = dset.chunks[0]
         comptopts["shuffle"] = dset.shuffle
     return comptopts
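
After this change, comptopts["chunksize"] maps every dataset name to the first axis of its HDF5 chunk shape. Stripped down to plain h5py, the new dict comprehension behaves roughly like the sketch below; the file name is illustrative, the strip_keys filtering is omitted, and all datasets are assumed to be chunked (h5py reports chunks as None for contiguous datasets, which would break the [0] access):

    import h5py

    with h5py.File("input.h5", "r") as f:
        chunksizes = {name: f[name].chunks[0] for name in f.keys()}
    # e.g. {"x": 20, "x_indices": 3} for files laid out like the test data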
@@ -31,7 +31,7 @@ class TestFileConcatenator(unittest.TestCase):
         cls.compt_opts = {
             "complib": "gzip",
             "complevel": 1,
-            "chunksize": 5,
+            "chunksize": {"numpy_array": 5, "rec_array": 5},
             "shuffle": False,
         }
@@ -159,6 +159,11 @@ class TestConcatenateIndexed(BaseTestClass.BaseIndexedFile):
             target_index = np.concatenate([[0], target_n_items.cumsum()[:-1]])
             np.testing.assert_array_equal(f_out["x_indices"]["index"], target_index)
 
+    def test_chunk_sizes_of_conc_file_are_the_same_as_in_input(self):
+        with h5py.File(self.outfile) as f_out:
+            self.assertTupleEqual(f_out["x"].chunks, (20,))
+            self.assertTupleEqual(f_out["x_indices"].chunks, (3,))
+
 
 def _create_dummy_file(filepath, columns=10, val_array=1, val_recarray=(1, 3)):
     """ Create a dummy h5 file with an array and a recarray in it. """