Commit b6d0ec52, authored 3 years ago by Stefan Reck

    cleanup

Parent: e02df80b
Merge request: !22 "Cleanup"
Pipelines: 1

Showing 2 changed files with 135 additions and 245 deletions:
    orcasong/from_toml.py       +2    -2
    orcasong/tools/shuffle2.py  +133  -243
orcasong/from_toml.py  (+2, -2)

...
@@ -11,8 +11,8 @@ EXTRACTORS = {
     "nu_chain_muon": extractors.get_muon_mc_info_extr,
     "nu_chain_noise": extractors.get_random_noise_mc_info_extr,
     "nu_chain_data": extractors.get_real_data_info_extr,
-    "bundle_mc": extractors.bundles.BundleMCExtractor,
-    "bundle_data": extractors.bundles.BundleDataExtractor,
+    "bundle_mc": extractors.BundleMCExtractor,
+    "bundle_data": extractors.BundleDataExtractor,
 }

 MODES = {
...
orcasong/tools/shuffle2.py  (+133, -243)

 import os
 import time
 import datetime
 import argparse
 import warnings
 import numpy as np
 import psutil
...
@@ -23,64 +21,7 @@ def h5shuffle2(
     datasets=("x", "y"),
     max_ram_fraction=0.25,
     max_ram=None,
 ):
-    if output_file is None:
-        output_file = get_filepath_output(input_file, shuffle=True)
-    if iterations is None:
-        iterations = get_n_iterations(
-            input_file,
-            datasets=datasets,
-            max_ram_fraction=max_ram_fraction,
-            max_ram=max_ram,
-        )
-    np.random.seed(42)
-    for i in range(iterations):
-        print(f"\nIteration {i+1}/{iterations}")
-        if iterations == 1:
-            # special case if theres only one iteration
-            stgs = {
-                "input_file": input_file,
-                "output_file": output_file,
-                "delete": False,
-            }
-        elif i == 0:
-            # first iteration
-            stgs = {
-                "input_file": input_file,
-                "output_file": f"{output_file}_temp_{i}",
-                "delete": False,
-            }
-        elif i == iterations - 1:
-            # last iteration
-            stgs = {
-                "input_file": f"{output_file}_temp_{i-1}",
-                "output_file": output_file,
-                "delete": True,
-            }
-        else:
-            # intermediate iterations
-            stgs = {
-                "input_file": f"{output_file}_temp_{i-1}",
-                "output_file": f"{output_file}_temp_{i}",
-                "delete": True,
-            }
-        shuffle_file(
-            datasets=datasets,
-            max_ram=max_ram,
-            max_ram_fraction=max_ram_fraction,
-            chunks=True,
-            **stgs,
-        )
-
-
-def shuffle_file(
-    input_file,
-    datasets=("x", "y"),
-    output_file=None,
-    max_ram=None,
-    max_ram_fraction=0.25,
-    chunks=False,
-    delete=False,
-    seed=42,
-):
     """
     Shuffle datasets in a h5file that have the same length.
...
@@ -89,24 +30,24 @@ def shuffle_file(
     ----------
     input_file : str
         Path of the file that will be shuffle.
-    datasets : tuple
-        Which datasets to include in output.
     output_file : str, optional
         If given, this will be the name of the output file.
        Otherwise, a name is auto generated.
+    iterations : int, optional
+        Shuffle the file this many times. For each additional iteration,
+        a temporary file will be created and then deleted afterwards.
+        Default: Auto choose best number based on available RAM.
+    datasets : tuple
+        Which datasets to include in output.
     max_ram : int, optional
         Available ram in bytes. Default: Use fraction of
         maximum available (see max_ram_fraction).
     max_ram_fraction : float
-        in [0, 1]. Fraction of ram to use for reading one batch of data
+        in [0, 1]. Fraction of RAM to use for reading one batch of data
         when max_ram is None. Note: when using chunks, this should
         be <=~0.25, since lots of ram is needed for in-memory shuffling.
-    chunks : bool
-        Use chunk-wise readout. Large speed boost, but will
-        only quasi-randomize order! Needs lots of ram
-        to be accurate! (use a node with at least 32gb, the more the better)
-    delete : bool
-        Delete the original file afterwards?
     seed : int or None
         Seed for randomness.

     Returns
     -------
...
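For illustration (not part of this commit): a minimal usage sketch of the entry point documented above, based only on the signature and docstring shown in this diff; the file paths are made up.

    from orcasong.tools.shuffle2 import h5shuffle2

    # Shuffle the "x" and "y" datasets of an HDF5 file, using at most 25% of the
    # available RAM per batch; the number of iterations is chosen automatically.
    output_path = h5shuffle2(
        "train_unshuffled.h5",            # made-up input path
        output_file="train_shuffled.h5",  # made-up output path
        datasets=("x", "y"),
        max_ram_fraction=0.25,
    )
    print(output_path)  # path to the shuffled output file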
@@ -114,38 +55,68 @@ def shuffle_file(
         Path to the output file.
     """
-    start_time = time.time()
     if output_file is None:
         output_file = get_filepath_output(input_file, shuffle=True)
+    if iterations is None:
+        iterations = get_n_iterations(
+            input_file,
+            datasets=datasets,
+            max_ram_fraction=max_ram_fraction,
+            max_ram=max_ram,
+        )
+    # filenames of all iterations, in the right order
+    filenames = (
+        input_file,
+        *_get_temp_filenames(output_file, number=iterations - 1),
+        output_file,
+    )
     if seed:
         np.random.seed(seed)
+    for i in range(iterations):
+        print(f"\nIteration {i+1}/{iterations}")
+        _shuffle_file(
+            input_file=filenames[i],
+            output_file=filenames[i + 1],
+            delete=i > 0,
+            datasets=datasets,
+            max_ram=max_ram,
+            max_ram_fraction=max_ram_fraction,
+        )
+    return output_file
+
+
+def _shuffle_file(
+    input_file,
+    output_file,
+    datasets=("x", "y"),
+    max_ram=None,
+    max_ram_fraction=0.25,
+    delete=False,
+):
+    start_time = time.time()
+    if os.path.exists(output_file):
+        raise FileExistsError(output_file)
     if max_ram is None:
         max_ram = get_max_ram(max_ram_fraction)
     # create file with temp name first, then rename afterwards
     temp_output_file = (
         output_file + "_temp_" + time.strftime("%d-%m-%Y-%H-%M-%S", time.gmtime())
     )
     with h5py.File(input_file, "r") as f_in:
-        dset_infos, n_lines = get_dset_infos(f_in, datasets, max_ram)
-        print(f"Shuffling datasets {datasets} with {n_lines} lines each")
-        if not chunks:
-            indices = np.arange(n_lines)
-            np.random.shuffle(indices)
-            with h5py.File(temp_output_file, "x") as f_out:
-                for dset_info in dset_infos:
-                    print("Creating dataset", dset_info["name"])
-                    make_dset(f_out, dset_info, indices)
-                    print("Done!")
-        else:
-            indices_chunked = get_indices_largest(dset_infos)
-            with h5py.File(temp_output_file, "x") as f_out:
-                for dset_info in dset_infos:
-                    print("Creating dataset", dset_info["name"])
-                    make_dset_chunked(f_out, dset_info, indices_chunked)
-                    print("Done!")
+        _check_dsets(f_in, datasets)
+        dset_info = _get_largest_dset(f_in, datasets, max_ram)
+        print(f"Shuffling datasets {datasets}")
+        indices_per_batch = _get_indices_per_batch(
+            dset_info["n_batches"],
+            dset_info["n_chunks"],
+            dset_info["chunksize"],
+        )
+        with h5py.File(temp_output_file, "x") as f_out:
+            for dset_name in datasets:
+                print("Creating dataset", dset_name)
+                _shuffle_dset(f_out, f_in, dset_name, indices_per_batch)
+                print("Done!")

     copy_used_files(input_file, temp_output_file)
     copy_attrs(input_file, temp_output_file)
...
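For illustration (not part of this commit): how the filenames chain built in the new h5shuffle2 lines up with the per-iteration calls to _shuffle_file. The paths below are made up; the temp-name pattern follows the _get_temp_filenames helper added further down in this diff.

    # Chain for iterations=3: the original file is only ever read; each call writes
    # filenames[i + 1] from filenames[i], and delete=i > 0 removes only the temp files.
    filenames = (
        "in.h5",
        "temp_iteration_0_out.h5",
        "temp_iteration_1_out.h5",
        "out.h5",
    )
    for i in range(3):
        print(f"iteration {i + 1}: {filenames[i]} -> {filenames[i + 1]}, delete={i > 0}")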
@@ -164,21 +135,6 @@ def get_max_ram(max_ram_fraction):
     return max_ram


-def get_indices_largest(dset_infos):
-    largest_dset = np.argmax([v["n_batches_chunkwise"] for v in dset_infos])
-    dset_info = dset_infos[largest_dset]
-
-    print(f"Total chunks: {dset_info['n_chunks']}")
-    ratio = dset_info["chunks_per_batch"] / dset_info["n_chunks"]
-    print(f"Chunks per batch: {dset_info['chunks_per_batch']} ({ratio:.2%})")
-
-    return get_indices_chunked(
-        dset_info["n_batches_chunkwise"],
-        dset_info["n_chunks"],
-        dset_info["chunksize"],
-    )
-
-
 def get_n_iterations(
     input_file, datasets=("x", "y"), max_ram=None, max_ram_fraction=0.25
 ):
...
@@ -186,9 +142,7 @@ def get_n_iterations(
     if max_ram is None:
         max_ram = get_max_ram(max_ram_fraction=max_ram_fraction)
     with h5py.File(input_file, "r") as f_in:
-        dset_infos, n_lines = get_dset_infos(f_in, datasets, max_ram)
-    largest_dset = np.argmax([v["n_batches_chunkwise"] for v in dset_infos])
-    dset_info = dset_infos[largest_dset]
+        dset_info = _get_largest_dset(f_in, datasets, max_ram)
     n_iterations = int(
         np.ceil(np.log(dset_info["n_chunks"]) / np.log(dset_info["chunks_per_batch"]))
     )
...
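For illustration (not part of this commit): the auto-chosen number of iterations grows only logarithmically with the number of chunks. A worked example with made-up numbers:

    import numpy as np

    # Say the largest dataset has 10_000 chunks and 150 chunks fit into one batch of RAM.
    n_chunks, chunks_per_batch = 10_000, 150
    n_iterations = int(np.ceil(np.log(n_chunks) / np.log(chunks_per_batch)))
    print(n_iterations)  # 2 -> two chunkwise shuffle passes are enough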
@@ -198,137 +152,117 @@ def get_n_iterations(
     return n_iterations


-def get_indices_chunked(n_batches, n_chunks, chunksize):
-    """
-    Return a list with the chunkwise shuffled indices of each batch.
-    """
+def _get_indices_per_batch(n_batches, n_chunks, chunksize):
+    """
+    Return a list with the shuffled indices for each batch.
+
+    Returns
+    -------
+    indices_per_batch : List
+        Length n_batches, each element is a np.array[int].
+        Element i of the list are the indices of each sample in batch number i.
+
+    """
     chunk_indices = np.arange(n_chunks)
     np.random.shuffle(chunk_indices)
     chunk_batches = np.array_split(chunk_indices, n_batches)

-    index_batches = []
+    indices_per_batch = []
     for bat in chunk_batches:
         idx = (bat[:, None] * chunksize + np.arange(chunksize)[None, :]).flatten()
         np.random.shuffle(idx)
-        index_batches.append(idx)
-    return index_batches
+        indices_per_batch.append(idx)
+    return indices_per_batch
+
+
+def _get_largest_dset(f, datasets, max_ram):
+    """
+    Get infos about the dset that needs the most batches.
+    This is the dset that determines how many samples are shuffled at a time.
+    """
+    dset_infos = _get_dset_infos(f, datasets, max_ram)
+    return dset_infos[np.argmax([v["n_batches"] for v in dset_infos])]
+
+
+def _check_dsets(f, datasets):
+    # check if all datasets have the same number of lines
+    n_lines_list = [len(f[dset_name]) for dset_name in datasets]
+    if not all([n == n_lines_list[0] for n in n_lines_list]):
+        raise ValueError(
+            f"Datasets have different lengths! "
+            f"{n_lines_list}"
+        )


-def get_dset_infos(f, datasets, max_ram):
-    """
-    Check datasets and retrieve relevant infos for each.
-    """
+def _get_dset_infos(f, datasets, max_ram):
+    """
+    Retrieve infos for each dataset.
+    """
     dset_infos = []
-    n_lines = None
     for i, name in enumerate(datasets):
         dset = f[name]
-        if i == 0:
-            n_lines = len(dset)
-        else:
-            if len(dset) != n_lines:
-                raise ValueError(
-                    f"dataset {name} has different length! "
-                    f"{len(dset)} vs {n_lines}"
-                )
+        n_lines = len(dset)
         chunksize = dset.chunks[0]
         n_chunks = int(np.ceil(n_lines / chunksize))
         # TODO in h5py 3.X, use .nbytes to get uncompressed size
         bytes_per_line = np.asarray(dset[0]).nbytes
         bytes_per_chunk = bytes_per_line * chunksize
-        lines_per_batch = int(np.floor(max_ram / bytes_per_line))
         chunks_per_batch = int(np.floor(max_ram / bytes_per_chunk))

-        dset_infos.append(
-            {
-                "name": name,
-                "dset": dset,
-                "chunksize": chunksize,
-                "n_lines": n_lines,
-                "n_chunks": n_chunks,
-                "bytes_per_line": bytes_per_line,
-                "bytes_per_chunk": bytes_per_chunk,
-                "lines_per_batch": lines_per_batch,
-                "chunks_per_batch": chunks_per_batch,
-                "n_batches_linewise": int(np.ceil(n_lines / lines_per_batch)),
-                "n_batches_chunkwise": int(np.ceil(n_chunks / chunks_per_batch)),
-            }
-        )
-
-    return dset_infos, n_lines
-
-
-def make_dset(f_out, dset_info, indices):
-    """
-    Create a shuffled dataset in the output file.
-    """
-    for batch_index in range(dset_info["n_batches_linewise"]):
-        print(f"Processing batch {batch_index+1}/{dset_info['n_batches_linewise']}")
-        slc = slice(
-            batch_index * dset_info["lines_per_batch"],
-            (batch_index + 1) * dset_info["lines_per_batch"],
-        )
-        to_read = indices[slc]
-        # reading has to be done with linearly increasing index,
-        # so sort -> read -> undo sorting
-        sort_ix = np.argsort(to_read)
-        unsort_ix = np.argsort(sort_ix)
-        data = dset_info["dset"][to_read[sort_ix]][unsort_ix]
-
-        if batch_index == 0:
-            in_dset = dset_info["dset"]
-            out_dset = f_out.create_dataset(
-                dset_info["name"],
-                data=data,
-                maxshape=in_dset.shape,
-                chunks=in_dset.chunks,
-                compression=in_dset.compression,
-                compression_opts=in_dset.compression_opts,
-                shuffle=in_dset.shuffle,
-            )
-            out_dset.resize(len(in_dset), axis=0)
-        else:
-            f_out[dset_info["name"]][slc] = data
+        dset_infos.append({
+            "name": name,
+            "dset": dset,
+            "n_chunks": n_chunks,
+            "chunks_per_batch": chunks_per_batch,
+            "n_batches": int(np.ceil(n_chunks / chunks_per_batch)),
+            "chunksize": chunksize,
+        })
+
+    return dset_infos


-def make_dset_chunked(f_out, dset_info, indices_chunked):
-    """
-    Create a shuffled dataset in the output file.
-    """
+def _shuffle_dset(f_out, f_in, dset_name, indices_per_batch):
+    """
+    Create a batchwise-shuffled dataset in the output file using given indices.
+    """
+    dset_in = f_in[dset_name]
     start_idx = 0
-    for batch_index, to_read in enumerate(indices_chunked):
-        print(f"Processing batch {batch_index+1}/{len(indices_chunked)}")
+    for batch_number, indices in enumerate(indices_per_batch):
+        print(f"Processing batch {batch_number+1}/{len(indices_per_batch)}")
         # remove indices outside of dset
-        to_read = to_read[to_read < len(dset_info["dset"])]
+        indices = indices[indices < len(dset_in)]
         # reading has to be done with linearly increasing index
         # fancy indexing is super slow
         # so sort -> turn to slices -> read -> conc -> undo sorting
-        sort_ix = np.argsort(to_read)
+        sort_ix = np.argsort(indices)
         unsort_ix = np.argsort(sort_ix)
-        fancy_indices = to_read[sort_ix]
-        slices = slicify(fancy_indices)
-        data = np.concatenate([dset_info["dset"][slc] for slc in slices])
+        fancy_indices = indices[sort_ix]
+        slices = _slicify(fancy_indices)
+        data = np.concatenate([dset_in[slc] for slc in slices])
         data = data[unsort_ix]

-        if batch_index == 0:
-            in_dset = dset_info["dset"]
+        if batch_number == 0:
             out_dset = f_out.create_dataset(
-                dset_info["name"],
+                dset_name,
                 data=data,
-                maxshape=in_dset.shape,
-                chunks=in_dset.chunks,
-                compression=in_dset.compression,
-                compression_opts=in_dset.compression_opts,
-                shuffle=in_dset.shuffle,
+                maxshape=dset_in.shape,
+                chunks=dset_in.chunks,
+                compression=dset_in.compression,
+                compression_opts=dset_in.compression_opts,
+                shuffle=dset_in.shuffle,
             )
-            out_dset.resize(len(in_dset), axis=0)
+            out_dset.resize(len(dset_in), axis=0)
             start_idx = len(data)
         else:
             end_idx = start_idx + len(data)
-            f_out[dset_info["name"]][start_idx:end_idx] = data
+            f_out[dset_name][start_idx:end_idx] = data
             start_idx = end_idx

         print("Memory peak: {0:.3f} MB".format(peak_memory_usage()))

-    if start_idx != len(dset_info["dset"]):
-        print(f"Warning: last index was {start_idx} not {len(dset_info['dset'])}")
+    if start_idx != len(dset_in):
+        print(f"Warning: last index was {start_idx} not {len(dset_in)}")


-def slicify(fancy_indices):
+def _slicify(fancy_indices):
     """
     [0,1,2, 6,7,8] --> [0:3, 6:9]
     """
     steps = np.diff(fancy_indices) != 1
     slice_starts = np.concatenate([fancy_indices[:1], fancy_indices[1:][steps]])
...
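For illustration (not part of this commit): what the _slicify helper does to a sorted index array. The construction of slice_ends is not visible in this diff, so the body below is only a sketch that reproduces the behaviour described by the docstring ("[0,1,2, 6,7,8] --> [0:3, 6:9]"); _shuffle_dset relies on this to replace slow fancy indexing with a few contiguous slice reads.

    import numpy as np

    def _slicify(fancy_indices):
        # Sketch: turn runs of consecutive indices into slices (slice_ends is assumed here).
        steps = np.diff(fancy_indices) != 1
        slice_starts = np.concatenate([fancy_indices[:1], fancy_indices[1:][steps]])
        slice_ends = np.concatenate([fancy_indices[:-1][steps], fancy_indices[-1:]]) + 1
        return [slice(slice_starts[i], slice_ends[i]) for i in range(len(slice_starts))]

    for s in _slicify(np.array([0, 1, 2, 6, 7, 8])):
        print(f"{s.start}:{s.stop}")  # prints 0:3 and 6:9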
@@ -336,50 +270,6 @@ def slicify(fancy_indices):
     return [slice(slice_starts[i], slice_ends[i]) for i in range(len(slice_starts))]


-def run_parser():
-    # TODO deprecated
-    warnings.warn("h5shuffle2 is deprecated and has been renamed to orcasong h5shuffle2")
-    parser = argparse.ArgumentParser(
-        description="Shuffle datasets in a h5file that have the same length. "
-        "Uses chunkwise readout for speed-up."
-    )
-    parser.add_argument(
-        "input_file", type=str, help="Path of the file that will be shuffled."
-    )
-    parser.add_argument(
-        "--output_file",
-        type=str,
-        default=None,
-        help="If given, this will be the name of the output file. "
-        "Default: input_file + suffix.",
-    )
-    parser.add_argument(
-        "--datasets",
-        type=str,
-        nargs="*",
-        default=("x", "y"),
-        help="Which datasets to include in output. Default: x, y",
-    )
-    parser.add_argument(
-        "--max_ram_fraction",
-        type=float,
-        default=0.25,
-        help="in [0, 1]. Fraction of all available ram to use for reading one batch of data "
-        "Note: this should "
-        "be <=~0.25 or so, since lots of ram is needed for in-memory shuffling. "
-        "Default: 0.25",
-    )
-    parser.add_argument(
-        "--iterations",
-        type=int,
-        default=None,
-        help="Shuffle the file this many times. Default: Auto choose best number.",
-    )
-    parser.add_argument(
-        "--max_ram",
-        type=int,
-        default=None,
-        help="Available ram in bytes. Default: Use fraction of maximum "
-        "available instead (see max_ram_fraction).",
-    )
-    h5shuffle2(**vars(parser.parse_args()))
+def _get_temp_filenames(output_file, number):
+    path, file = os.path.split(output_file)
+    return [os.path.join(path, f"temp_iteration_{i}_{file}") for i in range(number)]
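The removed run_parser was the standalone h5shuffle2 command line wrapper; per its own deprecation warning, that entry point is superseded by "orcasong h5shuffle2". For illustration (not part of this commit), the temporary file names produced by the new _get_temp_filenames helper, with a made-up output path:

    import os

    def _get_temp_filenames(output_file, number):
        # Copy of the helper added above, shown here with example input.
        path, file = os.path.split(output_file)
        return [os.path.join(path, f"temp_iteration_{i}_{file}") for i in range(number)]

    print(_get_temp_filenames("/data/out_shuffled.h5", number=2))
    # ['/data/temp_iteration_0_out_shuffled.h5', '/data/temp_iteration_1_out_shuffled.h5']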