Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
OrcaSong
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Machine Learning
OrcaSong
Commits
34ee2fea
Commit
34ee2fea
authored
5 years ago
by
Stefan Reck
Browse files
Options
Downloads
Patches
Plain Diff
Bin stats are now added to each h5 file.
parent
2b130099
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
orcasong_plag/core.py
+51
-30
51 additions, 30 deletions
orcasong_plag/core.py
orcasong_plag/modules.py
+37
-91
37 additions, 91 deletions
orcasong_plag/modules.py
orcasong_plag/util/bin_stats_plot.py
+172
-0
172 additions, 0 deletions
orcasong_plag/util/bin_stats_plot.py
with
260 additions
and
121 deletions
orcasong_plag/core.py
+
51
−
30
View file @
34ee2fea
...
...
@@ -2,13 +2,18 @@ import km3pipe as kp
import
km3modules
as
km
import
os
from
orcasong_plag.modules
import
TimePreproc
,
ImageMaker
,
McInfoMaker
,
Binning
Plotter
,
plot_hists
from
orcasong_plag.modules
import
TimePreproc
,
ImageMaker
,
McInfoMaker
,
Binning
StatsMaker
from
orcasong_plag.mc_info_types
import
get_mc_info_extr
from
orcasong_plag.util.bin_stats_plot
import
plot_hists
,
add_hists_to_h5file
,
plot_hist_of_files
class
FileBinner
:
"""
For making binned images.
For making binned images and mc_infos, which can be used for conv. nets.
Can also add statistics of the binning to the h5 files, which can
be plotted to show the distribution of hits among the bins and how
many hits were cut off.
Attributes
----------
...
...
@@ -54,10 +59,14 @@ class FileBinner:
but it increases the RAM usage as well.
"""
def
__init__
(
self
,
bin_edges_list
,
mc_info_extr
=
None
):
def
__init__
(
self
,
bin_edges_list
,
mc_info_extr
=
None
,
add_bin_stats
=
True
):
self
.
bin_edges_list
=
bin_edges_list
self
.
mc_info_extr
=
mc_info_extr
self
.
bin_plot_freq
=
20
if
add_bin_stats
:
self
.
bin_plot_freq
=
1
else
:
self
.
bin_plot_freq
=
None
self
.
n_statusbar
=
1000
self
.
n_memory_observer
=
1000
...
...
@@ -69,52 +78,70 @@ class FileBinner:
self
.
complevel
=
1
self
.
flush_frequency
=
1000
def
run
(
self
,
infile
,
outfile
):
def
run
(
self
,
infile
,
outfile
,
save_plot
=
False
):
"""
Build the pipeline to m
ake images for
the given
file.
M
ake images for
a
file.
Parameters
----------
infile : str
Path to the input file.
outfile : str
Path to the output file.
Path to the output file (will be created).
save_plot : bool
Save the binning hists as a pdf. Only possible if bin_plot_freq
is not None.
"""
if
save_plot
and
self
.
bin_plot_freq
is
None
:
raise
ValueError
(
"
Can not make plot when add_bin_stats is False
"
)
name
,
shape
=
self
.
get_names_and_shape
()
print
(
"
Generating {} images with shape {}
"
.
format
(
name
,
shape
))
plot_hists
=
self
.
bin_plot_freq
is
not
None
pipe
=
self
.
build_pipe
(
infile
,
outfile
)
smry
=
pipe
.
drain
()
if
self
.
bin_plot_freq
is
not
None
:
hists
=
smry
[
"
BinningStatsMaker
"
]
add_hists_to_h5file
(
hists
,
outfile
)
pipe
=
self
.
build_pipe
(
infile
,
outfile
,
plot_hists
=
plot_hists
)
pipe
.
drain
()
if
save_plot
:
save_as
=
os
.
path
.
splitext
(
outfile
)[
0
]
+
"
_hists.pdf
"
plot_hists
(
hists
,
save_as
)
def
run_multi
(
self
,
infiles
,
outfolder
):
def
run_multi
(
self
,
infiles
,
outfolder
,
save_plot
=
False
):
"""
Bin multiple files into their own output files each.
Will also generate a summary binning plot for all of the files.
Parameters
----------
infiles : List
The path to infiles as str.
outfolder : str
The output folder to place them in.
The output folder to place them in. The output file name will
be generated automatically.
save_plot : bool
Save the binning hists as a pdf. Only possible if bin_plot_freq
is not None.
"""
hists
=
None
if
save_plot
and
self
.
bin_plot_freq
is
None
:
raise
ValueError
(
"
Can not make plot when add_bin_stats is False
"
)
outfiles
=
[]
for
infile
in
infiles
:
outfile_name
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
infile
))[
0
]
+
"
_hist.h5
"
outfile_name
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
infile
))[
0
]
\
+
"
_hist.h5
"
outfile
=
outfolder
+
outfile_name
outfiles
.
append
(
outfile
)
pipe
=
self
.
build_pipe
(
infile
,
outfile
,
plot_hists
=
False
,
hists_start
=
hists
)
smry
=
pipe
.
drain
()
hists
=
smry
[
"
BinningPlotter
"
]
plot_hists
(
hists
,
pdf_path
=
outfolder
+
"
binning_summary.pdf
"
)
self
.
run
(
infile
,
outfile
,
save_plot
=
False
)
def
build_pipe
(
self
,
infile
,
outfile
,
plot_hists
=
True
,
hists_start
=
None
):
if
save_plot
:
plot_hist_of_files
(
outfiles
,
save_as
=
outfolder
+
"
binning_hist.pdf
"
)
def
build_pipe
(
self
,
infile
,
outfile
):
"""
Build the pipe to generate images and mc_info for a file.
"""
...
...
@@ -138,15 +165,9 @@ class FileBinner:
# pipe.attach(EventSkipper, data_cuts=self.data_cuts)
if
self
.
bin_plot_freq
is
not
None
:
if
plot_hists
:
pdf_name
=
os
.
path
.
splitext
(
outfile
)[
0
]
+
"
_hists.pdf
"
else
:
pdf_name
=
None
pipe
.
attach
(
BinningPlotter
,
pipe
.
attach
(
BinningStatsMaker
,
bin_plot_freq
=
self
.
bin_plot_freq
,
bin_edges_list
=
self
.
bin_edges_list
,
hists_start
=
hists_start
,
pdf_path
=
pdf_name
)
bin_edges_list
=
self
.
bin_edges_list
)
pipe
.
attach
(
ImageMaker
,
bin_edges_list
=
self
.
bin_edges_list
,
...
...
This diff is collapsed.
Click to expand it.
orcasong_plag/modules.py
+
37
−
91
View file @
34ee2fea
...
...
@@ -4,9 +4,6 @@ Custom km3pipe modules for making nn input files.
import
km3pipe
as
kp
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
matplotlib.backends.backend_pdf
import
PdfPages
import
matplotlib.ticker
as
ticker
class
McInfoMaker
(
kp
.
Module
):
...
...
@@ -119,17 +116,17 @@ class ImageMaker(kp.Module):
return
blob
class
Binning
Plott
er
(
kp
.
Module
):
class
Binning
StatsMak
er
(
kp
.
Module
):
"""
Sav
e a histogram of the number of hits for each binning field name.
Generat
e a histogram of the number of hits for each binning field name.
E.g. if the bin_edges_list contains
"
pos_z
"
, this will
sav
e a histogram
of #Hits vs.
"
pos_z
"
as a pdf
, together with how many hits were outside
E.g. if the bin_edges_list contains
"
pos_z
"
, this will
mak
e a histogram
of #Hits vs.
"
pos_z
"
, together with how many hits were outside
of the bin edges in both directions.
Per default, the resolution of the histogram (width of bins) will be
higher then the given bin edges, and the edges will be
plotted as horizontal
lines.
The time is the exception: The plotted bins have exactly the
higher then the given bin edges, and the edges will be
stored seperatly.
The time is the exception: The plotted bins have exactly the
given bin edges.
Attributes
...
...
@@ -137,48 +134,37 @@ class BinningPlotter(kp.Module):
bin_edges_list : List
List with the names of the fields to bin, and the respective bin edges,
including the left- and right-most bin edge.
pdf_path : str, optional
Where to save the hists to. This pdf will contain all the field names
on their own page each.
bin_plot_freq : int
Extract data for the hitograms only every given number of blobs
Extract data for the hi
s
tograms only every given number of blobs
(reduces time the pipeline takes to complete).
res_increase : int
Increase the number of bins by this much in the
plot
(so that one
Increase the number of bins by this much in the
hists
(so that one
can see if the edges have been placed correctly). Is never used
for the time binning (field name
"
time
"
).
plot_bin_edges : bool
If true, will plot the bin edges as horizontal lines. Is never used
for the time binning (field name
"
time
"
).
hists_start : dict, optional
Starting values for the statistics.
"""
def
configure
(
self
):
self
.
bin_edges_list
=
self
.
require
(
'
bin_edges_list
'
)
self
.
pdf_path
=
self
.
get
(
'
pdf_path
'
,
default
=
None
)
self
.
bin_plot_freq
=
self
.
get
(
"
bin_plot_freq
"
,
default
=
20
)
self
.
bin_plot_freq
=
self
.
get
(
"
bin_plot_freq
"
,
default
=
1
)
self
.
res_increase
=
self
.
get
(
'
res_increase
'
,
default
=
5
)
self
.
plot_bin_edges
=
self
.
get
(
'
plot_bin_edges
'
,
default
=
True
)
self
.
hists_start
=
self
.
get
(
'
hists_start
'
,
default
=
None
)
if
self
.
hists_start
is
None
:
self
.
hists
=
{}
for
bin_name
,
org_bin_edges
in
self
.
bin_edges_list
:
if
bin_name
==
"
time
"
:
bin_edges
=
org_bin_edges
else
:
bin_edges
=
self
.
_space_bin_edges
(
org_bin_edges
)
self
.
hists
[
bin_name
]
=
{
"
hist
"
:
np
.
zeros
(
len
(
bin_edges
)
-
1
),
"
hist_bin_edges
"
:
bin_edges
,
"
bin_edges
"
:
org_bin_edges
,
"
out_pos
"
:
0
,
"
out_neg
"
:
0
,
}
else
:
self
.
hists
=
self
.
hists_start
self
.
hists
=
{}
for
bin_name
,
org_bin_edges
in
self
.
bin_edges_list
:
if
bin_name
==
"
time
"
:
bin_edges
=
org_bin_edges
else
:
bin_edges
=
self
.
_space_bin_edges
(
org_bin_edges
)
self
.
hists
[
bin_name
]
=
{
"
hist
"
:
np
.
zeros
(
len
(
bin_edges
)
-
1
),
"
hist_bin_edges
"
:
bin_edges
,
"
bin_edges
"
:
org_bin_edges
,
# below smallest edge, above largest edge:
"
cut_off
"
:
np
.
zeros
(
2
),
}
self
.
i
=
0
...
...
@@ -207,64 +193,24 @@ class BinningPlotter(kp.Module):
out_neg
=
data
[
data
<
np
.
min
(
hist_bin_edges
)].
size
self
.
hists
[
bin_name
][
"
hist
"
]
+=
hist
self
.
hists
[
bin_name
][
"
out_pos
"
]
+=
out_pos
self
.
hists
[
bin_name
][
"
out_neg
"
]
+=
out_neg
self
.
hists
[
bin_name
][
"
cut_off
"
]
+=
np
.
array
([
out_neg
,
out_pos
])
self
.
i
+=
1
return
blob
def
finish
(
self
):
"""
Make and save the histograms to pdf.
"""
if
self
.
pdf_path
is
not
None
:
plot_hists
(
self
.
hists
,
self
.
pdf_path
,
plot_bin_edges
=
self
.
plot_bin_edges
)
return
self
.
hists
Get the hists, which are the stats of the binning.
Its a dict with each binning field name containing the following
ndarrays:
def
plot_hists
(
hists
,
pdf_path
,
plot_bin_edges
=
True
):
"""
Plot histograms made by the BinningPlotter to the given pdf path.
bin_edges : The actual bin edges.
cut_off : How many events were cut off in positive and negative
direction due to this binning.
hist_bin_edges : The bin edges for the plot in finer resolution then
the actual bin edges.
hist : The number of hist in each bin of the hist_bin_edges.
"""
with
PdfPages
(
pdf_path
)
as
pdf_file
:
for
bin_name
,
hists_data
in
hists
.
items
():
hist_bin_edges
=
hists_data
[
"
hist_bin_edges
"
]
bin_edges
=
hists_data
[
"
bin_edges
"
]
hist
=
hists_data
[
"
hist
"
]
out_pos
=
hists_data
[
"
out_pos
"
]
out_neg
=
hists_data
[
"
out_neg
"
]
hist_frac
=
hist
/
(
np
.
sum
(
hist
)
+
out_pos
+
out_neg
)
bin_spacing
=
hist_bin_edges
[
1
]
-
hist_bin_edges
[
0
]
fig
,
ax
=
plt
.
subplots
()
plt
.
bar
(
hist_bin_edges
[:
-
1
],
hist_frac
,
align
=
"
edge
"
,
width
=
0.9
*
bin_spacing
,
)
ax
.
yaxis
.
set_major_formatter
(
ticker
.
PercentFormatter
(
xmax
=
1
))
if
plot_bin_edges
and
bin_name
!=
"
time
"
:
for
bin_edge
in
bin_edges
:
plt
.
axvline
(
x
=
bin_edge
,
color
=
'
grey
'
,
linestyle
=
'
-
'
,
linewidth
=
1
,
alpha
=
0.9
)
# place a text box in upper left in axes coords
out_pos_rel
=
out_pos
/
np
.
sum
(
hist
)
out_neg_rel
=
out_neg
/
np
.
sum
(
hist
)
textstr
=
"
Hits cut off:
\n
Left: {:.1%}
\n
"
\
"
Right: {:.1%}
"
.
format
(
out_neg_rel
,
out_pos_rel
)
props
=
dict
(
boxstyle
=
'
round
'
,
facecolor
=
'
white
'
,
alpha
=
0.9
)
ax
.
text
(
0.05
,
0.95
,
textstr
,
transform
=
ax
.
transAxes
,
verticalalignment
=
'
top
'
,
bbox
=
props
)
plt
.
xlabel
(
bin_name
)
plt
.
ylabel
(
"
Fraction of hits
"
)
pdf_file
.
savefig
(
fig
)
print
(
"
Saved binning plot to
"
+
pdf_path
)
"""
return
self
.
hists
This diff is collapsed.
Click to expand it.
orcasong_plag/util/bin_stats_plot.py
0 → 100644
+
172
−
0
View file @
34ee2fea
"""
Functions for plotting the bin stats made by the BinningStatsMaker module.
"""
import
matplotlib.pyplot
as
plt
from
matplotlib.backends.backend_pdf
import
PdfPages
import
matplotlib.ticker
as
ticker
import
h5py
import
numpy
as
np
def
plot_hists
(
hists
,
save_to
,
plot_bin_edges
=
True
):
"""
Plot histograms made by the BinningStatsMaker to the given pdf path.
Parameters
----------
hists : dict or List
Dicts with the info about the binning, generated by the BinningStatsMaker.
Can also be a list of dicts (for multiple files), which will be
combined to one plot.
save_to : str
Where to save the plot to.
plot_bin_edges : bool
If true, will plot the bin edges as horizontal lines. Is never used
for the time binning (field name
"
time
"
).
"""
if
isinstance
(
hists
,
list
):
hists
=
combine_hists
(
hists
)
with
PdfPages
(
save_to
)
as
pdf_file
:
for
bin_name
,
hists_data
in
hists
.
items
():
hist_bin_edges
=
hists_data
[
"
hist_bin_edges
"
]
bin_edges
=
hists_data
[
"
bin_edges
"
]
hist
=
hists_data
[
"
hist
"
]
cut_off
=
hists_data
[
"
cut_off
"
]
total_hits
=
np
.
sum
(
hist
)
+
np
.
sum
(
cut_off
)
hist_frac
=
hist
/
total_hits
bin_spacing
=
hist_bin_edges
[
1
]
-
hist_bin_edges
[
0
]
fig
,
ax
=
plt
.
subplots
()
plt
.
bar
(
hist_bin_edges
[:
-
1
],
hist_frac
,
align
=
"
edge
"
,
width
=
0.9
*
bin_spacing
,
)
ax
.
yaxis
.
set_major_formatter
(
ticker
.
PercentFormatter
(
xmax
=
1
))
if
plot_bin_edges
and
bin_name
!=
"
time
"
:
for
bin_edge
in
bin_edges
:
plt
.
axvline
(
x
=
bin_edge
,
color
=
'
grey
'
,
linestyle
=
'
-
'
,
linewidth
=
1
,
alpha
=
0.9
)
# place a text box in upper left in axes coords
out_neg_rel
=
cut_off
[
0
]
/
total_hits
out_pos_rel
=
cut_off
[
1
]
/
total_hits
textstr
=
"
Hits cut off:
\n
Left: {:.1%}
\n
"
\
"
Right: {:.1%}
"
.
format
(
out_neg_rel
,
out_pos_rel
)
props
=
dict
(
boxstyle
=
'
round
'
,
facecolor
=
'
white
'
,
alpha
=
0.9
)
ax
.
text
(
0.05
,
0.95
,
textstr
,
transform
=
ax
.
transAxes
,
verticalalignment
=
'
top
'
,
bbox
=
props
)
plt
.
xlabel
(
bin_name
)
plt
.
ylabel
(
"
Fraction of hits
"
)
pdf_file
.
savefig
(
fig
)
print
(
"
Saved binning plot to
"
+
save_to
)
def
combine_hists
(
hists_list
):
"""
Combine the hists for multiple files into a single big one.
Parameters
----------
hists_list : List
A list of dicts (hists from the BinningStatsMaker for different files).
Returns
-------
combined_hists : Dict
The combined hist.
"""
# initialize combined dict according to first dict in list
combined_hists
=
{}
for
bin_name
,
hists_data
in
hists_list
[
0
].
items
():
hist_bin_edges
=
hists_data
[
"
hist_bin_edges
"
]
bin_edges
=
hists_data
[
"
bin_edges
"
]
bin_name
=
bin_name
combined_hists
[
bin_name
]
=
{
"
hist
"
:
np
.
zeros
(
len
(
hist_bin_edges
)
-
1
),
"
hist_bin_edges
"
:
hist_bin_edges
,
"
bin_edges
"
:
bin_edges
,
# below smallest edge, above largest edge:
"
cut_off
"
:
np
.
zeros
(
2
),
}
# add them together
for
hists
in
hists_list
:
for
bin_name
,
hists_data
in
hists
.
items
():
# name and bin edges must be the same for all hists
if
bin_name
not
in
combined_hists
:
raise
NameError
(
"
Hists dont all have the same binning field name:
"
"
{}
"
.
format
(
bin_name
))
bin_edges
=
hists_data
[
"
bin_edges
"
]
comb_edges
=
combined_hists
[
bin_name
][
"
bin_edges
"
]
if
len
(
bin_edges
)
!=
len
(
comb_edges
):
raise
ValueError
(
"
Hists have different hist bin edges: {} vs {}
"
.
format
(
bin_edges
,
comb_edges
))
hist_bin_edges
=
hists_data
[
"
hist_bin_edges
"
]
comb_hist_edges
=
combined_hists
[
bin_name
][
"
hist_bin_edges
"
]
if
len
(
hist_bin_edges
)
!=
len
(
comb_hist_edges
):
raise
ValueError
(
"
Hists have different bin edges: {} vs {}
"
.
format
(
hist_bin_edges
,
comb_hist_edges
))
combined_hists
[
bin_name
][
"
hist
"
]
+=
hists_data
[
"
hist
"
]
combined_hists
[
bin_name
][
"
cut_off
"
]
+=
hists_data
[
"
cut_off
"
]
return
combined_hists
def
add_hists_to_h5file
(
hists
,
file
):
"""
Add the binning stats as groups to a h5 file.
Parameters
----------
hists : dict
The histst from the BinningStatsMaker module.
file : str
Path to the h5 file.
"""
base_name
=
"
bin_stats/
"
with
h5py
.
File
(
file
,
"
a
"
)
as
f
:
for
bin_name
,
hists_data
in
hists
.
items
():
for
key
,
val
in
hists_data
.
items
():
h5_folder
=
base_name
+
bin_name
+
"
/
"
+
key
f
.
create_dataset
(
h5_folder
,
data
=
val
)
def
plot_hist_of_files
(
files
,
save_as
):
"""
Plot the binning stats of a list of files.
Parameters
----------
files : List
Path of the files as str.
save_as : str
Where to save the plot to.
"""
hists_list
=
[]
opened_files
=
[]
for
file
in
files
:
f
=
h5py
.
File
(
file
,
"
r
"
)
hists_list
.
append
(
f
[
"
bin_stats/
"
])
opened_files
.
append
(
f
)
plot_hists
(
hists_list
,
save_as
)
for
file
in
opened_files
:
file
.
close
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment