Commit 1b86edc5 authored by Stefan Reck

Merge branch 'fix_h5shufflefor_index' into 'master'

fix_h5shufflefor_index

See merge request !31
parents 6fd807c0 d23ae258
Tags: v4.8.4
@@ -74,7 +74,8 @@ def _add_parser_h5shuffle2(subparsers):
     parser = subparsers.add_parser(
         "h5shuffle2",
         description="Shuffle datasets in a h5file that have the same length. "
-        "Uses chunkwise readout for speed-up.",
+        "Uses chunkwise readout for speed-up. If you run into memory errors, "
+        "try manually setting --max_ram to a smaller value.",
     )
     parser.add_argument(
         "input_file", type=str, help="Path of the file that will be shuffled."
@@ -97,10 +98,11 @@ def _add_parser_h5shuffle2(subparsers):
         "--max_ram_fraction",
         type=float,
         default=0.25,
-        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
-        "Note: this should "
-        "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
-        "Default: 0.25",
+        help="in [0, 1]. Only used when max_ram is not given. Fraction of all "
+        "available ram to use for reading one batch of data. "
+        "Note: this should "
+        "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
+        "Default: 0.25",
     )
     parser.add_argument(
         "--iterations",
...
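Since --max_ram_fraction only applies when --max_ram is not given, the budget for one read batch is presumably resolved along these lines. This is a minimal sketch, not OrcaSong's actual code: the helper name _ram_budget and the use of psutil are assumptions, and the unit of max_ram is whatever the tool expects.

import psutil

def _ram_budget(max_ram=None, max_ram_fraction=0.25):
    # Illustrative only: an explicit max_ram wins, otherwise take a
    # fraction of the memory currently available on the machine.
    if max_ram is not None:
        return max_ram
    return int(psutil.virtual_memory().available * max_ram_fraction)

With 16 GB available, the default fraction of 0.25 reads batches of at most ~4 GB, leaving headroom for the in-memory shuffle of that batch.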
@@ -269,6 +269,7 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
     """
     dset_in = f_in[dset_name]
     start_idx = 0
+    running_index = 0
     for batch_number, indices in enumerate(indices_per_batch):
         print(f"Processing batch {batch_number+1}/{len(indices_per_batch)}")
         # remove indices outside of dset
@@ -288,12 +289,16 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
         if dset_is_indexed(f_in, dset_name):
             # special treatment for indexed: slice based on indices dataset
-            slices_indices = [f_in[f"{dset_name}_indices"][slc] for slc in slices]
+            dset_name_indexed = f"{dset_name}_indices"
+            slices_indices = [f_in[dset_name_indexed][slc] for slc in slices]
+            data_indices = np.concatenate(slices_indices)
+            if any(np.diff(data_indices["index"]) <= 0):
+                raise ValueError(f"'index' in {dset_name_indexed} is not increasing for every event!")
             data = np.concatenate(
                 [dset_in[slice(*_resolve_indexed(slc))] for slc in slices_indices]
             )
             # convert to 3d awkward array, then shuffle, then back to numpy
-            data_indices = np.concatenate(slices_indices)
             data_ak = ak.unflatten(data, data_indices["n_items"])
             data = ak.flatten(data_ak[unsort_ix], axis=1).to_numpy()
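The unflatten/reorder/flatten round-trip above is the heart of the indexed shuffle: events of varying length are moved as whole blocks. A self-contained toy version (the values and the unsort_ix permutation are invented here, not taken from the code):

import awkward as ak
import numpy as np

data = np.arange(6)                    # flat items of 3 events
n_items = np.array([2, 3, 1])          # event lengths from "x_indices"
unsort_ix = np.array([2, 0, 1])        # shuffled event order

data_ak = ak.unflatten(data, n_items)  # [[0, 1], [2, 3, 4], [5]]
# reorder whole events, then flatten back into a flat numpy array
data = ak.flatten(data_ak[unsort_ix], axis=1).to_numpy()
print(data)                            # [5 0 1 2 3 4]

The per-event lengths travel with the shuffle: n_items[unsort_ix] gives the lengths matching the new order.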
@@ -303,9 +308,10 @@ def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
         if dset_name.endswith("_indices"):
             # recalculate index
-            data["index"] = start_idx + np.concatenate(
+            data["index"] = running_index + np.concatenate(
                 [[0], np.cumsum(data["n_items"][:-1])]
             )
+            running_index = sum(data[-1])  # = last "index" + last "n_items"
         if batch_number == 0:
             out_dset = f_out.create_dataset(
...
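This is the bug the merge fixes: start_idx counts rows already written to the output dataset, but the "index" column of an _indices dataset must continue from the number of items written so far, which running_index now tracks. A toy reproduction of the recomputation, with invented event lengths:

import numpy as np

batches = [np.array([3, 1]), np.array([2, 4])]  # n_items per batch

running_index = 0
for n_items in batches:
    # first event of the batch starts at the running item count,
    # every later event starts right after its predecessor
    index = running_index + np.concatenate([[0], np.cumsum(n_items[:-1])])
    print(index)  # batch 1: [0 3], batch 2: [4 6]
    running_index = index[-1] + n_items[-1]

After both batches running_index == 10, the total item count, so the indices stay contiguous across batch boundaries; offsetting by the row count instead would have started the second batch at 2 and produced overlapping indices.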
 from unittest import TestCase
+import tempfile
 import os
 import h5py
 import numpy as np
@@ -142,6 +143,38 @@ class TestShuffleIndexed(BaseTestClass.BaseIndexedFile):
             np.testing.assert_array_equal(f_out["x_indices"]["index"], target_index)


+class TestShuffle2LargeFile(TestCase):
+    def setUp(self):
+        # indexed file: 2000 items in "x", split into 100 events
+        # of 20 items each via "x_indices"
+        self.infile = tempfile.NamedTemporaryFile()
+        with h5py.File(self.infile, "w") as f:
+            dset_x = f.create_dataset("x", data=np.arange(2000), chunks=(11,))
+            dset_x.attrs.create("indexed", 1)
+            n_items = np.ones(100) * 20
+            self.index = np.concatenate([[0.], np.cumsum(n_items)[:-1]])
+            indices = np.array(
+                list(zip(self.index, n_items)),
+                dtype=[("index", "<i8"), ("n_items", "<i8")],
+            )
+            f.create_dataset("x_indices", data=indices, chunks=(14,))
+        self.outfile = "temp_out.h5"
+        shuffle2.h5shuffle2(
+            self.infile.name,
+            output_file=self.outfile,
+            datasets=("x",),
+            seed=2,
+            max_ram=10000,
+            iterations=2,
+        )
+
+    def test_indices_is_correct(self):
+        with h5py.File(self.outfile) as f_out:
+            np.testing.assert_array_equal(
+                f_out["x_indices"]["index"],
+                self.index,
+            )
+
+
 def _make_shuffle_dummy_file(filepath):
     x = np.random.rand(22, 2)
     x[:, 0] = np.arange(22)
...
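The test pins the "index" column to its expected values; the same invariants can be checked ad hoc on any shuffled file. A sketch (the file name is the test's temp_out.h5; the contiguity checks are additions here, not part of the test suite):

import h5py
import numpy as np

with h5py.File("temp_out.h5") as f:
    idx = f["x_indices"]["index"]
    n_items = f["x_indices"]["n_items"]
    # "index" must be strictly increasing (the check the fix adds) ...
    assert np.all(np.diff(idx) > 0)
    # ... and events must tile "x" without gaps or overlaps
    assert np.array_equal(idx[1:], (idx + n_items)[:-1])
    assert idx[-1] + n_items[-1] == len(f["x"])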