Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimization of multiplicity, theta and gh cut #204

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 217 additions & 31 deletions pyirf/cut_optimization.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,205 @@
from itertools import product
import numpy as np
from astropy.table import QTable
import astropy.units as u
from tqdm import tqdm
from tqdm.auto import tqdm
import operator

from .cuts import evaluate_binned_cut, calculate_percentile_cut

from .cuts import evaluate_binned_cut_by_index, calculate_percentile_cut, evaluate_binned_cut
from .sensitivity import calculate_sensitivity, estimate_background
from .binning import create_histogram_table, bin_center
from .binning import create_histogram_table, bin_center, calculate_bin_indices


__all__ = [
"optimize_gh_cut",
]


def optimize_cuts(
    signal,
    background,
    reco_energy_bins,
    multiplicity_cuts,
    gh_cut_efficiencies,
    theta_cut_efficiencies,
    fov_offset_min=0 * u.deg,
    fov_offset_max=1 * u.deg,
    alpha=1.0,
    progress=True,
    **kwargs
):
    """
    Optimize the gamma/hadronness, theta and multiplicity cut in every bin of
    reconstructed energy for best sensitivity.

    A brute-force grid scan over all combinations of the given multiplicity
    cuts, gh cut efficiencies and theta cut efficiencies is performed and the
    combination yielding the best relative sensitivity is selected
    independently in each energy bin.

    Parameters
    ----------
    signal: astropy.table.QTable
        event list of simulated signal events.
        Required columns are `theta`, `reco_energy`, 'weight', `gh_score`,
        `multiplicity`.
        No directional (theta) or gamma/hadron cut should already be applied.
    background: astropy.table.QTable
        event list of simulated background events.
        Required columns are `reco_source_fov_offset`, `reco_energy`,
        'weight', `gh_score`, `multiplicity`.
        No directional (theta) or gamma/hadron cut should already be applied.
    reco_energy_bins: astropy.units.Quantity[energy]
        Bins in reconstructed energy to use for sensitivity computation
    multiplicity_cuts: np.ndarray[int, ndim=1]
        The multiplicity cut values to scan for best sensitivity.
        Events pass if ``multiplicity >= cut``.
    gh_cut_efficiencies: np.ndarray[float, ndim=1]
        The gamma/hadron cut efficiencies to scan for best sensitivity.
        A gammaness-like score is assumed, i.e. events pass if
        ``gh_score >= cut``.
    theta_cut_efficiencies: np.ndarray[float, ndim=1]
        The theta cut efficiencies to scan for best sensitivity.
    fov_offset_min: astropy.units.Quantity[angle]
        Minimum distance from the fov center for background events to be taken into account
    fov_offset_max: astropy.units.Quantity[angle]
        Maximum distance from the fov center for background events to be taken into account
    alpha: float
        Size ratio of off region / on region. Will be used to
        scale the background rate.
    progress: bool
        If True, show a progress bar during cut optimization
    **kwargs are passed to ``calculate_sensitivity``

    Returns
    -------
    best_sensitivity: astropy.table.QTable
        Sensitivity of the best cut combination per energy bin.
    best_multiplicity_cut: astropy.table.QTable
        Binned multiplicity cut table of the best combination.
    best_theta_cut: astropy.table.QTable
        Binned theta cut table of the best combination.
    best_gh_cut: astropy.table.QTable
        Binned gh cut table of the best combination.
    """
    gh_cut_efficiencies = np.asanyarray(gh_cut_efficiencies)
    # the gh cut is applied with >=, percentile is defined via <=,
    # so the efficiency needs to be inverted
    gh_cut_percentiles = 100 * (1 - gh_cut_efficiencies)
    # bins with too few events get a cut at the maximum score,
    # effectively discarding their events
    fill_value = signal['gh_score'].max()

    sensitivities = []
    cut_indices = []
    # the percentile cuts depend on which events survive the multiplicity cut,
    # so remember the cut tables per multiplicity cut to be able to look up
    # the values belonging to the best combination later
    gh_cuts_per_multiplicity = []
    theta_cuts_per_multiplicity = []

    n_theta_cuts = len(theta_cut_efficiencies)
    n_gh_cuts = len(gh_cut_efficiencies)
    n_cuts = len(multiplicity_cuts) * n_theta_cuts * n_gh_cuts

    # the energy binning of the events is the same for all cut combinations,
    # compute it only once
    signal_bin_index, signal_valid = calculate_bin_indices(
        signal['reco_energy'], reco_energy_bins,
    )
    background_bin_index, background_valid = calculate_bin_indices(
        background['reco_energy'], reco_energy_bins,
    )

    with tqdm(total=n_cuts, disable=not progress) as bar:
        for multiplicity_index, multiplicity_cut in enumerate(multiplicity_cuts):
            signal_mask_multiplicity = signal['multiplicity'] >= multiplicity_cut
            background_mask_multiplicity = background["multiplicity"] >= multiplicity_cut

            # computing all percentiles in one call is much faster than
            # one call per efficiency
            gh_cuts = calculate_percentile_cut(
                signal['gh_score'][signal_mask_multiplicity],
                signal['reco_energy'][signal_mask_multiplicity],
                bins=reco_energy_bins,
                fill_value=fill_value,
                percentile=gh_cut_percentiles,
            )
            gh_cuts_per_multiplicity.append(gh_cuts)

            theta_cuts = calculate_percentile_cut(
                signal['theta'][signal_mask_multiplicity],
                signal['reco_energy'][signal_mask_multiplicity],
                bins=reco_energy_bins,
                fill_value=0.3 * u.deg,
                max_value=0.3 * u.deg,
                min_value=0.02 * u.deg,
                percentile=100 * theta_cut_efficiencies,
            )
            theta_cuts_per_multiplicity.append(theta_cuts)

            for gh_index, theta_index in product(range(n_gh_cuts), range(n_theta_cuts)):

                # apply the current cuts
                theta_cut = theta_cuts.copy()
                theta_cut["cut"] = theta_cuts["cut"][:, theta_index]
                signal_mask_theta = evaluate_binned_cut_by_index(
                    signal["theta"], signal_bin_index, signal_valid, theta_cut, operator.le,
                )

                gh_cut = gh_cuts.copy()
                gh_cut["cut"] = gh_cuts["cut"][:, gh_index]
                signal_mask_gh = evaluate_binned_cut_by_index(
                    signal["gh_score"], signal_bin_index, signal_valid, gh_cut, operator.ge,
                )

                signal_selected = signal_mask_gh & signal_mask_theta & signal_mask_multiplicity

                # no theta cut applied to background here, the size of the
                # on region is taken into account by ``estimate_background``
                background_mask_gh = evaluate_binned_cut_by_index(
                    background["gh_score"], background_bin_index, background_valid, gh_cut, operator.ge,
                )
                background_selected = background_mask_gh & background_mask_multiplicity

                # create the histograms
                signal_hist = create_histogram_table(
                    signal[signal_selected], reco_energy_bins, "reco_energy"
                )

                background_hist = estimate_background(
                    events=background[background_selected],
                    reco_energy_bins=reco_energy_bins,
                    theta_cuts=theta_cut,
                    alpha=alpha,
                    fov_offset_min=fov_offset_min,
                    fov_offset_max=fov_offset_max,
                )

                sensitivity = calculate_sensitivity(
                    signal_hist, background_hist, alpha=alpha,
                    **kwargs,
                )
                cut_indices.append((multiplicity_index, theta_index, gh_index))
                sensitivities.append(sensitivity)
                bar.update(1)

    # prepare the output tables, filled bin by bin below
    best_gh_cut = QTable()
    best_gh_cut["low"] = reco_energy_bins[0:-1]
    best_gh_cut["center"] = bin_center(reco_energy_bins)
    best_gh_cut["high"] = reco_energy_bins[1:]
    best_gh_cut["cut"] = np.nan
    best_gh_cut["efficiency"] = np.nan

    best_multiplicity_cut = QTable()
    best_multiplicity_cut["low"] = reco_energy_bins[0:-1]
    best_multiplicity_cut["center"] = bin_center(reco_energy_bins)
    best_multiplicity_cut["high"] = reco_energy_bins[1:]
    best_multiplicity_cut["cut"] = np.nan

    best_theta_cut = QTable()
    best_theta_cut["low"] = reco_energy_bins[0:-1]
    best_theta_cut["center"] = bin_center(reco_energy_bins)
    best_theta_cut["high"] = reco_energy_bins[1:]
    best_theta_cut["cut"] = np.nan
    best_theta_cut["cut"].unit = theta_cuts_per_multiplicity[-1]["cut"].unit
    best_theta_cut["efficiency"] = np.nan

    best_sensitivity = sensitivities[0].copy()
    for bin_id in range(len(reco_energy_bins) - 1):
        sensitivities_bin = [s["relative_sensitivity"][bin_id] for s in sensitivities]

        if not np.all(np.isnan(sensitivities_bin)):
            # nanargmin won't return the index of nan entries
            best = np.nanargmin(sensitivities_bin)
        else:
            # if all are invalid, just use the first one
            best = 0

        multiplicity_index, theta_index, gh_index = cut_indices[best]

        # look up the cut tables belonging to the winning multiplicity cut;
        # the percentile cuts differ between multiplicity cuts, using the
        # tables of the last loop iteration here would give wrong values
        gh_cuts = gh_cuts_per_multiplicity[multiplicity_index]
        theta_cuts = theta_cuts_per_multiplicity[multiplicity_index]

        best_sensitivity[bin_id] = sensitivities[best][bin_id]

        best_gh_cut["cut"][bin_id] = gh_cuts["cut"][bin_id][gh_index]
        best_multiplicity_cut["cut"][bin_id] = multiplicity_cuts[multiplicity_index]
        best_theta_cut["cut"][bin_id] = theta_cuts["cut"][bin_id][theta_index]

        best_gh_cut["efficiency"][bin_id] = gh_cut_efficiencies[gh_index]
        best_theta_cut["efficiency"][bin_id] = theta_cut_efficiencies[theta_index]

    return best_sensitivity, best_multiplicity_cut, best_theta_cut, best_gh_cut


def optimize_gh_cut(
signal,
background,
Expand All @@ -28,9 +214,6 @@ def optimize_gh_cut(
**kwargs
):
"""
Optimize the gh-score cut in every energy bin of reconstructed energy
for best sensitivity.

This procedure is EventDisplay-like, since it only applies a
pre-computed theta cut and then optimizes only the gamma/hadron separation
cut.
Expand All @@ -50,9 +233,6 @@ def optimize_gh_cut(
Bins in reconstructed energy to use for sensitivity computation
gh_cut_efficiencies: np.ndarray[float, ndim=1]
The cut efficiencies to scan for best sensitivity.
theta_cuts: astropy.table.QTable
cut definition of the energy dependent theta cut,
e.g. as created by ``calculate_percentile_cut``
op: comparison function with signature f(a, b) -> bool
The comparison function to use for the gamma hadron score.
Returning true means an event passes the cut, so is not discarded.
Expand Down Expand Up @@ -80,27 +260,31 @@ def optimize_gh_cut(
)

sensitivities = []
gh_cuts = []
for efficiency in tqdm(gh_cut_efficiencies, disable=not progress):

# calculate necessary percentile needed for
# ``calculate_percentile_cut`` with the correct efficiency.
# Depends on the operator, since we need to invert the
# efficiency if we compare using >=, since percentile is
# defines as <=.
if op(-1, 1): # if operator behaves like "<=", "<" etc:
percentile = 100 * efficiency
fill_value = signal['gh_score'].min()
else: # operator behaves like ">=", ">"
percentile = 100 * (1 - efficiency)
fill_value = signal['gh_score'].max()

gh_cut = calculate_percentile_cut(
signal['gh_score'], signal['reco_energy'],
bins=reco_energy_bins,
fill_value=fill_value, percentile=percentile,
)
gh_cuts.append(gh_cut)
# calculate necessary percentile needed for
# ``calculate_percentile_cut`` with the correct efficiency.
# Depends on the operator, since we need to invert the
# efficiency if we compare using >=, since percentile is
# defines as <=.
gh_cut_efficiencies = np.asanyarray(gh_cut_efficiencies)
if op(-1, 1):
# if operator behaves like "<=", "<" etc:
percentiles = 100 * gh_cut_efficiencies
fill_value = signal['gh_score'].min()
else:
# operator behaves like ">=", ">"
percentiles = 100 * (1 - gh_cut_efficiencies)
fill_value = signal['gh_score'].max()

gh_cuts = calculate_percentile_cut(
signal['gh_score'], signal['reco_energy'],
bins=reco_energy_bins,
fill_value=fill_value,
percentile=percentiles,
)

for col in tqdm(range(len(gh_cut_efficiencies)), disable=not progress):
gh_cut = gh_cuts.copy()
gh_cut["cut"] = gh_cuts["cut"][:, col]

# apply the current cut
signal_selected = evaluate_binned_cut(
Expand Down Expand Up @@ -136,6 +320,7 @@ def optimize_gh_cut(
best_cut_table["center"] = bin_center(reco_energy_bins)
best_cut_table["high"] = reco_energy_bins[1:]
best_cut_table["cut"] = np.nan
best_cut_table["efficiency"] = np.nan

best_sensitivity = sensitivities[0].copy()
for bin_id in range(len(reco_energy_bins) - 1):
Expand All @@ -149,6 +334,7 @@ def optimize_gh_cut(
best = 0

best_sensitivity[bin_id] = sensitivities[best][bin_id]
best_cut_table["cut"][bin_id] = gh_cuts[best]["cut"][bin_id]
best_cut_table["cut"][bin_id] = gh_cuts["cut"][bin_id][best]
best_cut_table["efficiency"][bin_id] = gh_cut_efficiencies[best]

return best_sensitivity, best_cut_table
54 changes: 48 additions & 6 deletions pyirf/cuts.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,34 @@ def calculate_percentile_cut(
bin_index, valid = calculate_bin_indices(bin_values, bins)
by_bin = table[valid].group_by(bin_index[valid])

n_bins = len(bins) - 1
cut_table = QTable()
cut_table["low"] = bins[:-1]
cut_table["high"] = bins[1:]
cut_table["center"] = bin_center(bins)
cut_table["n_events"] = 0
cut_table["cut"] = np.asanyarray(fill_value, values.dtype)

unit = None
if hasattr(fill_value, 'unit'):
unit = fill_value.unit
fill_value = fill_value.value

percentile = np.asanyarray(percentile)
if percentile.shape == ():
cut_table["cut"] = np.asanyarray(fill_value, values.dtype)
else:
cut_table["cut"] = np.full((n_bins, len(percentile)), fill_value, dtype=values.dtype)

if unit is not None:
cut_table["cut"].unit = unit

for bin_idx, group in zip(by_bin.groups.keys, by_bin.groups):
# replace bins with too few events with fill_value
n_events = len(group)
cut_table["n_events"][bin_idx] = n_events

if n_events < min_events:
cut_table["cut"][bin_idx] = fill_value
cut_table["cut"].value[bin_idx] = fill_value
else:
value = np.nanpercentile(group["values"], percentile)
if min_value is not None or max_value is not None:
Expand All @@ -93,6 +107,37 @@ def calculate_percentile_cut(
return cut_table


def evaluate_binned_cut_by_index(values, bin_index, valid, cut_table, op):
    """
    Evaluate a binned cut as defined in cut_table using pre-computed bin indices.

    Events with bin_values outside the bin edges defined in cut table
    will be set to False.

    Parameters
    ----------
    values: ``~numpy.ndarray`` or ``~astropy.units.Quantity``
        The values on which the cut should be evaluated
    bin_index: ``~numpy.ndarray``
        Index of each event into the rows of ``cut_table``,
        e.g. as returned by ``~pyirf.binning.calculate_bin_indices``.
    valid: ``~numpy.ndarray``
        Boolean mask of events with a valid bin index.
        Events that are not valid never pass the cut.
    cut_table: ``~astropy.table.Table``
        A table describing the binned cuts, e.g. as created by
        ``~pyirf.cuts.calculate_percentile_cut``.
        Required columns:
        - `low`: lower edges of the bins
        - `high`: upper edges of the bins,
        - `cut`: cut value
    op: callable(a, b) -> bool
        A function taking two arguments, comparing element-wise and
        returning an array of booleans.
        Must support vectorized application.

    Returns
    -------
    result: ``~numpy.ndarray``
        Boolean mask of the events passing the cut.
    """
    # NOTE(review): QTable keeps quantity columns as Quantities, which a plain
    # Table would not, presumably why it is required here — confirm
    if not isinstance(cut_table, QTable):
        raise ValueError('cut_table needs to be an astropy.table.QTable')

    # events outside the binning stay False
    result = np.zeros(len(bin_index), dtype=bool)
    result[valid] = op(values[valid], cut_table["cut"][bin_index[valid]])
    return result


def evaluate_binned_cut(values, bin_values, cut_table, op):
"""
Evaluate a binned cut as defined in cut_table on given events.
Expand Down Expand Up @@ -123,10 +168,7 @@ def evaluate_binned_cut(values, bin_values, cut_table, op):

bins = np.append(cut_table["low"], cut_table["high"][-1])
bin_index, valid = calculate_bin_indices(bin_values, bins)

result = np.zeros(len(values), dtype=bool)
result[valid] = op(values[valid], cut_table["cut"][bin_index[valid]])
return result
return evaluate_binned_cut_by_index(values, bin_index, valid, cut_table, op)


def compare_irf_cuts(cuts):
Expand Down
3 changes: 2 additions & 1 deletion pyirf/spectral.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ def __init__(self, normalization, index, e_ref, f, mu, sigma):
@u.quantity_input(energy=u.TeV)
def __call__(self, energy):
power = super().__call__(energy)
log10_e = np.log10(energy / self.e_ref)
e = (energy / self.e_ref).to_value(u.one)
log10_e = np.log10(e)
# ROOT's TMath::Gauss does not add the normalization
# this is missing from the IRFDocs
# the code used for the plot can be found here:
Expand Down
Loading