Commit 1b86edc5 authored by Stefan Reck

Merge branch 'fix_h5shufflefor_index' into 'master'

fix_h5shufflefor_index

See merge request !31
parents 6fd807c0 d23ae258
Tags: v4.8.4
@@ -74,7 +74,8 @@ def _add_parser_h5shuffle2(subparsers):
     parser = subparsers.add_parser(
         "h5shuffle2",
         description="Shuffle datasets in a h5file that have the same length. "
-        "Uses chunkwise readout for speed-up.",
+        "Uses chunkwise readout for speed-up. If you run into memory errors, "
+        "try manually setting --max_ram to a smaller value.",
     )
     parser.add_argument(
         "input_file", type=str, help="Path of the file that will be shuffled."
@@ -97,10 +98,11 @@ def _add_parser_h5shuffle2(subparsers):
         "--max_ram_fraction",
         type=float,
         default=0.25,
-        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
-        "Note: this should "
-        "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
-        "Default: 0.25",
+        help="in [0, 1]. Only used when max_ram is not given. Fraction of all "
+        "available ram to use for reading one batch of data. "
+        "Note: this should "
+        "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
+        "Default: 0.25",
     )
     parser.add_argument(
         "--iterations",
...
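Since --max_ram_fraction only applies when --max_ram is not given, the budget for one read batch is presumably resolved along these lines. This is a minimal sketch, not OrcaSong's actual code: the helper name _ram_budget and the use of psutil are assumptions, and the unit of max_ram is whatever the tool expects.

import psutil

def _ram_budget(max_ram=None, max_ram_fraction=0.25):
    # Illustrative only: an explicit max_ram wins, otherwise take a
    # fraction of the memory currently available on the machine.
    if max_ram is not None:
        return max_ram
    return int(psutil.virtual_memory().available * max_ram_fraction)

With 16 GB available, the default fraction of 0.25 reads batches of at most ~4 GB, leaving headroom for the in-memory shuffle of that batch.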
@@ -269,6 +269,7 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
     """
     dset_in = f_in[dset_name]
     start_idx = 0
+    running_index = 0
     for batch_number, indices in enumerate(indices_per_batch):
         print(f"Processing batch {batch_number+1}/{len(indices_per_batch)}")
         # remove indices outside of dset
@@ -288,12 +289,16 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
         if dset_is_indexed(f_in, dset_name):
             # special treatment for indexed: slice based on indices dataset
-            slices_indices = [f_in[f"{dset_name}_indices"][slc] for slc in slices]
+            dset_name_indexed = f"{dset_name}_indices"
+            slices_indices = [f_in[dset_name_indexed][slc] for slc in slices]
+            data_indices = np.concatenate(slices_indices)
+            if any(np.diff(data_indices["index"]) <= 0):
+                raise ValueError(f"'index' in {dset_name_indexed} is not increasing for every event!")
             data = np.concatenate(
                 [dset_in[slice(*_resolve_indexed(slc))] for slc in slices_indices]
             )
             # convert to 3d awkward array, then shuffle, then back to numpy
-            data_indices = np.concatenate(slices_indices)
             data_ak = ak.unflatten(data, data_indices["n_items"])
             data = ak.flatten(data_ak[unsort_ix], axis=1).to_numpy()
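The unflatten/reorder/flatten round-trip above is the heart of the indexed shuffle: events of varying length are moved as whole blocks. A self-contained toy version (the values and the unsort_ix permutation are invented here, not taken from the code):

import awkward as ak
import numpy as np

data = np.arange(6)                    # flat items of 3 events
n_items = np.array([2, 3, 1])          # event lengths from "x_indices"
unsort_ix = np.array([2, 0, 1])        # shuffled event order

data_ak = ak.unflatten(data, n_items)  # [[0, 1], [2, 3, 4], [5]]
# reorder whole events, then flatten back into a flat numpy array
data = ak.flatten(data_ak[unsort_ix], axis=1).to_numpy()
print(data)                            # [5 0 1 2 3 4]

The per-event lengths travel with the shuffle: n_items[unsort_ix] gives the lengths matching the new order.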
@@ -303,9 +308,10 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
         if dset_name.endswith("_indices"):
             # recalculate index
-            data["index"] = start_idx + np.concatenate(
+            data["index"] = running_index + np.concatenate(
                 [[0], np.cumsum(data["n_items"][:-1])]
             )
+            running_index = sum(data[-1])  # = last "index" + last "n_items"
         if batch_number == 0:
             out_dset = f_out.create_dataset(
...
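This is the bug the merge fixes: start_idx counts rows already written to the output dataset, but the "index" column of an _indices dataset must continue from the number of items written so far, which running_index now tracks. A toy reproduction of the recomputation, with invented event lengths:

import numpy as np

batches = [np.array([3, 1]), np.array([2, 4])]  # n_items per batch

running_index = 0
for n_items in batches:
    # first event of the batch starts at the running item count,
    # every later event starts right after its predecessor
    index = running_index + np.concatenate([[0], np.cumsum(n_items[:-1])])
    print(index)  # batch 1: [0 3], batch 2: [4 6]
    running_index = index[-1] + n_items[-1]

After both batches running_index == 10, the total item count, so the indices stay contiguous across batch boundaries; offsetting by the row count instead would have started the second batch at 2 and produced overlapping indices.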
 from unittest import TestCase
+import tempfile
 import os
 import h5py
 import numpy as np
@@ -142,6 +143,38 @@ class TestShuffleIndexed(BaseTestClass.BaseIndexedFile):
             np.testing.assert_array_equal(f_out["x_indices"]["index"], target_index)


+class TestShuffle2LargeFile(TestCase):
+    def setUp(self):
+        # indexed file: 2000 items in "x", split into 100 events
+        # of 20 items each via "x_indices"
+        self.infile = tempfile.NamedTemporaryFile()
+        with h5py.File(self.infile, "w") as f:
+            dset_x = f.create_dataset("x", data=np.arange(2000), chunks=(11,))
+            dset_x.attrs.create("indexed", 1)
+            n_items = np.ones(100) * 20
+            self.index = np.concatenate([[0.], np.cumsum(n_items)[:-1]])
+            indices = np.array(
+                list(zip(self.index, n_items)),
+                dtype=[("index", "<i8"), ("n_items", "<i8")],
+            )
+            f.create_dataset("x_indices", data=indices, chunks=(14,))
+        self.outfile = "temp_out.h5"
+        shuffle2.h5shuffle2(
+            self.infile.name,
+            output_file=self.outfile,
+            datasets=("x",),
+            seed=2,
+            max_ram=10000,
+            iterations=2,
+        )
+
+    def test_indices_is_correct(self):
+        with h5py.File(self.outfile) as f_out:
+            np.testing.assert_array_equal(
+                f_out["x_indices"]["index"],
+                self.index,
+            )
+
+
 def _make_shuffle_dummy_file(filepath):
     x = np.random.rand(22, 2)
     x[:, 0] = np.arange(22)
...
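The test pins the "index" column to its expected values; the same invariants can be checked ad hoc on any shuffled file. A sketch (the file name is the test's temp_out.h5; the contiguity checks are additions here, not part of the test suite):

import h5py
import numpy as np

with h5py.File("temp_out.h5") as f:
    idx = f["x_indices"]["index"]
    n_items = f["x_indices"]["n_items"]
    # "index" must be strictly increasing (the check the fix adds) ...
    assert np.all(np.diff(idx) > 0)
    # ... and events must tile "x" without gaps or overlaps
    assert np.array_equal(idx[1:], (idx + n_items)[:-1])
    assert idx[-1] + n_items[-1] == len(f["x"])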