Commit f9acd679, authored 5 years ago by Stefan Reck
Changed concatenate so that group "bin_stats" is ignored, too.
parent fd12fd58
Showing 1 changed file with 42 additions and 16 deletions:

orcasong_contrib/data_tools/concatenate/concatenate_h5.py (+42, −16)
...
@@ -105,7 +105,8 @@ def parse_input():
 def get_cum_number_of_rows(file_list):
     """
-    Returns the cumulative number of rows (axis_0) in a list based on the specified input .h5 files.
+    Returns the cumulative number of rows (axis_0) in a list based on the
+    specified input .h5 files.
 
     Parameters
     ----------
...
@@ -115,31 +116,55 @@ def get_cum_number_of_rows(file_list):
     Returns
     -------
     cum_number_of_rows_list : list
-        List that contains the cumulative number of rows (i.e. [0,100,200,300,...] if each file has 100 rows).
+        List that contains the cumulative number of rows
+        (i.e. [0,100,200,300,...] if each file has 100 rows).
 
     """
     total_number_of_rows = 0
     cum_number_of_rows_list = [0]
     number_of_rows_list = []  # used for approximating the chunksize
 
-    # Get total number of rows for the files in the list, faster than resizing the dataset in each iteration of the file loop in concatenate_h5_files()
+    # Get total number of rows for the files in the list, faster than resizing
+    # the dataset in each iteration of the file loop in concatenate_h5_files()
     for file_name in file_list:
-        f = h5py.File(file_name, 'r')
+        with h5py.File(file_name, 'r') as f:
+            # get number of rows from the first folder of the file
+            # -> each folder needs to have the same number of rows
+            f_keys = list(f.keys())
+            f_keys_stripped = strip_keys(f_keys)
 
-        # get number of rows from the first folder of the file -> each folder needs to have the same number of rows
-        f_keys = list(f.keys())
-        # remove pytables folders starting with '_i_', because the shape of its first axis does not correspond to the number of events in the file.
-        # all other folders normally have an axis_0 shape that is equal to the number of events in the file.
-        f_keys_stripped = [x for x in f_keys if '_i_' not in x]
+            total_number_of_rows += f[f_keys_stripped[0]].shape[0]
+            cum_number_of_rows_list.append(total_number_of_rows)
+            number_of_rows_list.append(f[f_keys_stripped[0]].shape[0])
 
-        total_number_of_rows += f[f_keys_stripped[0]].shape[0]
-        cum_number_of_rows_list.append(total_number_of_rows)
-        number_of_rows_list.append(f[f_keys_stripped[0]].shape[0])
+    return cum_number_of_rows_list
 
-        f.close()
 
-    return cum_number_of_rows_list
+def strip_keys(f_keys):
+    """
+    Remove pytables folders starting with '_i_', because the shape
+    of its first axis does not correspond to the number of events
+    in the file. All other folders normally have an axis_0 shape
+    that is equal to the number of events in the file.
+    Also remove bin_stats.
+    """
+    f_keys_stripped = []
+    for x in f_keys:
+        if is_folder_ignored(x):
+            continue
+        f_keys_stripped.append(x)
+    return f_keys_stripped
+
+
+def is_folder_ignored(folder_name):
+    """
+    Defines pytable folders which should be ignored during concat.
+    """
+    if '_i_' in folder_name or "bin_stats" in folder_name:
+        return True
+    else:
+        return False
 
 
 def get_f_compression_and_chunking(filepath):
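Taken together, the two new helpers centralize the skip logic that was previously inlined as a list comprehension. A minimal standalone sketch of how they behave, written as a condensed equivalent of the committed functions (the key names below are hypothetical examples, not taken from the commit):

    def is_folder_ignored(folder_name):
        # skip pytables index groups ('_i_...') and the 'bin_stats' group
        return '_i_' in folder_name or 'bin_stats' in folder_name

    def strip_keys(f_keys):
        # keep only the folders that should take part in the concatenation
        return [x for x in f_keys if not is_folder_ignored(x)]

    print(strip_keys(['x', 'y', '_i_x', 'bin_stats']))  # -> ['x', 'y']

Note that the check is a substring match, not a prefix match, so any group whose name merely contains '_i_' or 'bin_stats' is skipped as well.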
...
@@ -165,7 +190,8 @@ def get_f_compression_and_chunking(filepath):
     f = h5py.File(filepath, 'r')
 
     # remove any keys to pytables folders that may be in the file
-    f_keys_stripped = [x for x in list(f.keys()) if '_i_' not in x]
+    f_keys = list(f.keys())
+    f_keys_stripped = strip_keys(f_keys)
 
     compression = f[f_keys_stripped[0]].compression  # compression filter
     compression_opts = f[f_keys_stripped[0]].compression_opts  # filter strength
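For reference, h5py exposes the settings this function reads as plain attributes on the dataset object. A minimal sketch of inspecting them on the first kept dataset; the file name is hypothetical and strip_keys is the helper defined in the sketch above:

    import h5py

    with h5py.File('some_file.h5', 'r') as f:  # hypothetical input file
        first_key = strip_keys(list(f.keys()))[0]
        dset = f[first_key]
        print(dset.compression)       # e.g. 'gzip', 'lzf', or None
        print(dset.compression_opts)  # filter strength, e.g. 1-9 for gzip
        print(dset.chunks)            # chunk shape of the dataset, or None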
...
@@ -226,7 +252,7 @@ def concatenate_h5_files(output_filepath, file_list, cum_rows_list, chunksize, c
         for folder_name in input_file:
-            if folder_name.startswith('_i_'):
+            if is_folder_ignored(folder_name):
                 # we ignore datasets that have been created by pytables, don't need them anymore
                 continue
...
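This hunk is where the commit message takes effect: the old condition only skipped folders starting with '_i_', while the new one also skips 'bin_stats'. A small hypothetical comparison of the two predicates (group names invented for illustration):

    old_skip = lambda name: name.startswith('_i_')
    new_skip = lambda name: '_i_' in name or 'bin_stats' in name

    for name in ('hits', '_i_hits', 'bin_stats'):
        print(name, old_skip(name), new_skip(name))
    # hits      False False  -> copied to the output in both versions
    # _i_hits   True  True   -> skipped in both versions
    # bin_stats False True   -> copied before this commit, skipped now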