-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
optimization of multiplicity, theta and gh cut #204
base: main
Are you sure you want to change the base?
Changes from all commits
caa226b
67188b1
db27011
3eb0221
c4640e9
5bc7488
46ec05f
9bef4fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,205 @@ | ||
from itertools import product | ||
import numpy as np | ||
from astropy.table import QTable | ||
import astropy.units as u | ||
from tqdm import tqdm | ||
from tqdm.auto import tqdm | ||
import operator | ||
|
||
from .cuts import evaluate_binned_cut, calculate_percentile_cut | ||
|
||
from .cuts import evaluate_binned_cut_by_index, calculate_percentile_cut, evaluate_binned_cut | ||
from .sensitivity import calculate_sensitivity, estimate_background | ||
from .binning import create_histogram_table, bin_center | ||
from .binning import create_histogram_table, bin_center, calculate_bin_indices | ||
|
||
|
||
__all__ = [ | ||
"optimize_gh_cut", | ||
] | ||
|
||
|
||
def optimize_cuts( | ||
signal, | ||
background, | ||
reco_energy_bins, | ||
multiplicity_cuts, | ||
gh_cut_efficiencies, | ||
theta_cut_efficiencies, | ||
fov_offset_min=0 * u.deg, | ||
fov_offset_max=1 * u.deg, | ||
alpha=1.0, | ||
progress=True, | ||
**kwargs | ||
): | ||
""" | ||
Optimize the gamma/hadronness, theta and multiplicity cut in every energy bin of reconstructed energy | ||
for best sensitivity. | ||
|
||
Parameters | ||
---------- | ||
signal: astropy.table.QTable | ||
event list of simulated signal events. | ||
Required columns are `theta`, `reco_energy`, 'weight', `gh_score` | ||
No directional (theta) or gamma/hadron cut should already be applied. | ||
background: astropy.table.QTable | ||
event list of simulated background events. | ||
Required columns are `reco_source_fov_offset`, `reco_energy`, | ||
'weight', `gh_score`. | ||
No directional (theta) or gamma/hadron cut should already be applied. | ||
reco_energy_bins: astropy.units.Quantity[energy] | ||
Bins in reconstructed energy to use for sensitivity computation | ||
gh_cut_efficiencies: np.ndarray[float, ndim=1] | ||
The cut efficiencies to scan for best sensitivity. | ||
theta_cuts: astropy.table.QTable | ||
cut definition of the energy dependent theta cut, | ||
e.g. as created by ``calculate_percentile_cut`` | ||
op: comparison function with signature f(a, b) -> bool | ||
The comparison function to use for the gamma hadron score. | ||
Returning true means an event passes the cut, so is not discarded. | ||
E.g. for gammaness-like score, use `operator.ge` (>=) and for a | ||
hadronness-like score use `operator.le` (<=). | ||
fov_offset_min: astropy.units.Quantity[angle] | ||
Minimum distance from the fov center for background events to be taken into account | ||
fov_offset_max: astropy.units.Quantity[angle] | ||
Maximum distance from the fov center for background events to be taken into account | ||
alpha: float | ||
Size ratio of off region / on region. Will be used to | ||
scale the background rate. | ||
progress: bool | ||
If True, show a progress bar during cut optimization | ||
**kwargs are passed to ``calculate_sensitivity`` | ||
""" | ||
gh_cut_efficiencies = np.asanyarray(gh_cut_efficiencies) | ||
gh_cut_percentiles = 100 * (1 - gh_cut_efficiencies) | ||
fill_value = signal['gh_score'].max() | ||
|
||
|
||
sensitivities = [] | ||
cut_indicies = [] | ||
n_theta_cuts = len(theta_cut_efficiencies) | ||
n_gh_cuts = len(gh_cut_efficiencies) | ||
n_cuts = len(multiplicity_cuts) * n_theta_cuts * n_gh_cuts | ||
|
||
signal_bin_index, signal_valid = calculate_bin_indices( | ||
signal['reco_energy'], reco_energy_bins, | ||
) | ||
background_bin_index, background_valid = calculate_bin_indices( | ||
background['reco_energy'], reco_energy_bins, | ||
) | ||
|
||
with tqdm(total=n_cuts, disable=not progress) as bar: | ||
for multiplicity_index, multiplicity_cut in enumerate(multiplicity_cuts): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be useful to eventually adapt this to use scipy.optimize.brute internally rather than implement the grid-search manually, as that would make it easier to replace it with more efficient optimizers later There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using the manual grid search, we can be a bit cleverer than brute force: make the cuts in the outermost loop possible rather than computing all cuts for all computations. |
||
|
||
signal_mask_multiplicity = signal['multiplicity'] >= multiplicity_cut | ||
background_mask_multiplicity = background["multiplicity"] >= multiplicity_cut | ||
|
||
gh_cuts = calculate_percentile_cut( | ||
signal['gh_score'][signal_mask_multiplicity], | ||
signal['reco_energy'][signal_mask_multiplicity], | ||
bins=reco_energy_bins, | ||
fill_value=fill_value, | ||
percentile=gh_cut_percentiles, | ||
) | ||
|
||
theta_cuts = calculate_percentile_cut( | ||
signal['theta'][signal_mask_multiplicity], | ||
signal['reco_energy'][signal_mask_multiplicity], | ||
bins=reco_energy_bins, | ||
fill_value=0.3 * u.deg, | ||
max_value=0.3 * u.deg, | ||
min_value=0.02 * u.deg, | ||
percentile=100 * theta_cut_efficiencies, | ||
) | ||
|
||
|
||
for gh_index, theta_index in product(range(n_gh_cuts), range(n_theta_cuts)): | ||
|
||
# apply the current cuts | ||
theta_cut = theta_cuts.copy() | ||
theta_cut["cut"] = theta_cuts["cut"][:, theta_index] | ||
signal_mask_theta = evaluate_binned_cut_by_index( | ||
signal["theta"], signal_bin_index, signal_valid, theta_cut, operator.le, | ||
) | ||
|
||
gh_cut = gh_cuts.copy() | ||
gh_cut["cut"] = gh_cuts["cut"][:, gh_index] | ||
signal_mask_gh = evaluate_binned_cut_by_index( | ||
signal["gh_score"], signal_bin_index, signal_valid, gh_cut, operator.ge, | ||
) | ||
|
||
signal_selected = signal_mask_gh & signal_mask_theta & signal_mask_multiplicity | ||
|
||
background_mask_gh = evaluate_binned_cut_by_index( | ||
background["gh_score"], background_bin_index, background_valid, gh_cut, operator.ge, | ||
) | ||
background_selected = background_mask_gh & background_mask_multiplicity | ||
|
||
# create the histograms | ||
signal_hist = create_histogram_table( | ||
signal[signal_selected], reco_energy_bins, "reco_energy" | ||
) | ||
|
||
background_hist = estimate_background( | ||
events=background[background_selected], | ||
reco_energy_bins=reco_energy_bins, | ||
theta_cuts=theta_cut, | ||
alpha=alpha, | ||
fov_offset_min=fov_offset_min, | ||
fov_offset_max=fov_offset_max, | ||
) | ||
|
||
sensitivity = calculate_sensitivity( | ||
signal_hist, background_hist, alpha=alpha, | ||
**kwargs, | ||
) | ||
cut_indicies.append((multiplicity_index, theta_index, gh_index)) | ||
sensitivities.append(sensitivity) | ||
bar.update(1) | ||
|
||
best_gh_cut = QTable() | ||
best_gh_cut["low"] = reco_energy_bins[0:-1] | ||
best_gh_cut["center"] = bin_center(reco_energy_bins) | ||
best_gh_cut["high"] = reco_energy_bins[1:] | ||
best_gh_cut["cut"] = np.nan | ||
best_gh_cut["efficiency"] = np.nan | ||
|
||
best_multiplicity_cut = QTable() | ||
best_multiplicity_cut["low"] = reco_energy_bins[0:-1] | ||
best_multiplicity_cut["center"] = bin_center(reco_energy_bins) | ||
best_multiplicity_cut["high"] = reco_energy_bins[1:] | ||
best_multiplicity_cut["cut"] = np.nan | ||
|
||
best_theta_cut = QTable() | ||
best_theta_cut["low"] = reco_energy_bins[0:-1] | ||
best_theta_cut["center"] = bin_center(reco_energy_bins) | ||
best_theta_cut["high"] = reco_energy_bins[1:] | ||
best_theta_cut["cut"] = np.nan | ||
best_theta_cut["cut"].unit = theta_cuts["cut"].unit | ||
best_theta_cut["efficiency"] = np.nan | ||
|
||
best_sensitivity = sensitivities[0].copy() | ||
for bin_id in range(len(reco_energy_bins) - 1): | ||
sensitivities_bin = [s["relative_sensitivity"][bin_id] for s in sensitivities] | ||
|
||
if not np.all(np.isnan(sensitivities_bin)): | ||
# nanargmin won't return the index of nan entries | ||
best = np.nanargmin(sensitivities_bin) | ||
else: | ||
# if all are invalid, just use the first one | ||
best = 0 | ||
|
||
multiplicity_index, theta_index, gh_index = cut_indicies[best] | ||
|
||
best_sensitivity[bin_id] = sensitivities[best][bin_id] | ||
|
||
best_gh_cut["cut"][bin_id] = gh_cuts["cut"][bin_id][gh_index] | ||
best_multiplicity_cut["cut"][bin_id] = multiplicity_cuts[multiplicity_index] | ||
best_theta_cut["cut"][bin_id] = theta_cuts["cut"][bin_id][theta_index] | ||
|
||
best_gh_cut["efficiency"][bin_id] = gh_cut_efficiencies[gh_index] | ||
best_theta_cut["efficiency"][bin_id] = theta_cut_efficiencies[theta_index] | ||
|
||
return best_sensitivity, best_multiplicity_cut, best_theta_cut, best_gh_cut | ||
|
||
|
||
def optimize_gh_cut( | ||
signal, | ||
background, | ||
|
@@ -28,9 +214,6 @@ def optimize_gh_cut( | |
**kwargs | ||
): | ||
""" | ||
Optimize the gh-score cut in every energy bin of reconstructed energy | ||
for best sensitivity. | ||
|
||
This procedure is EventDisplay-like, since it only applies a | ||
pre-computed theta cut and then optimizes only the gamma/hadron separation | ||
cut. | ||
|
@@ -50,9 +233,6 @@ def optimize_gh_cut( | |
Bins in reconstructed energy to use for sensitivity computation | ||
gh_cut_efficiencies: np.ndarray[float, ndim=1] | ||
The cut efficiencies to scan for best sensitivity. | ||
theta_cuts: astropy.table.QTable | ||
cut definition of the energy dependent theta cut, | ||
e.g. as created by ``calculate_percentile_cut`` | ||
op: comparison function with signature f(a, b) -> bool | ||
The comparison function to use for the gamma hadron score. | ||
Returning true means an event passes the cut, so is not discarded. | ||
|
@@ -80,27 +260,31 @@ def optimize_gh_cut( | |
) | ||
|
||
sensitivities = [] | ||
gh_cuts = [] | ||
for efficiency in tqdm(gh_cut_efficiencies, disable=not progress): | ||
|
||
# calculate necessary percentile needed for | ||
# ``calculate_percentile_cut`` with the correct efficiency. | ||
# Depends on the operator, since we need to invert the | ||
# efficiency if we compare using >=, since percentile is | ||
# defined as <=. | ||
if op(-1, 1): # if operator behaves like "<=", "<" etc: | ||
percentile = 100 * efficiency | ||
fill_value = signal['gh_score'].min() | ||
else: # operator behaves like ">=", ">" | ||
percentile = 100 * (1 - efficiency) | ||
fill_value = signal['gh_score'].max() | ||
|
||
gh_cut = calculate_percentile_cut( | ||
signal['gh_score'], signal['reco_energy'], | ||
bins=reco_energy_bins, | ||
fill_value=fill_value, percentile=percentile, | ||
) | ||
gh_cuts.append(gh_cut) | ||
# calculate necessary percentile needed for | ||
# ``calculate_percentile_cut`` with the correct efficiency. | ||
# Depends on the operator, since we need to invert the | ||
# efficiency if we compare using >=, since percentile is | ||
# defined as <=. | ||
gh_cut_efficiencies = np.asanyarray(gh_cut_efficiencies) | ||
if op(-1, 1): | ||
# if operator behaves like "<=", "<" etc: | ||
percentiles = 100 * gh_cut_efficiencies | ||
fill_value = signal['gh_score'].min() | ||
else: | ||
# operator behaves like ">=", ">" | ||
percentiles = 100 * (1 - gh_cut_efficiencies) | ||
fill_value = signal['gh_score'].max() | ||
|
||
gh_cuts = calculate_percentile_cut( | ||
signal['gh_score'], signal['reco_energy'], | ||
bins=reco_energy_bins, | ||
fill_value=fill_value, | ||
percentile=percentiles, | ||
) | ||
|
||
for col in tqdm(range(len(gh_cut_efficiencies)), disable=not progress): | ||
gh_cut = gh_cuts.copy() | ||
gh_cut["cut"] = gh_cuts["cut"][:, col] | ||
|
||
# apply the current cut | ||
signal_selected = evaluate_binned_cut( | ||
|
@@ -136,6 +320,7 @@ def optimize_gh_cut( | |
best_cut_table["center"] = bin_center(reco_energy_bins) | ||
best_cut_table["high"] = reco_energy_bins[1:] | ||
best_cut_table["cut"] = np.nan | ||
best_cut_table["efficiency"] = np.nan | ||
|
||
best_sensitivity = sensitivities[0].copy() | ||
for bin_id in range(len(reco_energy_bins) - 1): | ||
|
@@ -149,6 +334,7 @@ def optimize_gh_cut( | |
best = 0 | ||
|
||
best_sensitivity[bin_id] = sensitivities[best][bin_id] | ||
best_cut_table["cut"][bin_id] = gh_cuts[best]["cut"][bin_id] | ||
best_cut_table["cut"][bin_id] = gh_cuts["cut"][bin_id][best] | ||
best_cut_table["efficiency"][bin_id] = gh_cut_efficiencies[best] | ||
|
||
return best_sensitivity, best_cut_table |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,20 +62,34 @@ def calculate_percentile_cut( | |
bin_index, valid = calculate_bin_indices(bin_values, bins) | ||
by_bin = table[valid].group_by(bin_index[valid]) | ||
|
||
n_bins = len(bins) - 1 | ||
cut_table = QTable() | ||
cut_table["low"] = bins[:-1] | ||
cut_table["high"] = bins[1:] | ||
cut_table["center"] = bin_center(bins) | ||
cut_table["n_events"] = 0 | ||
cut_table["cut"] = np.asanyarray(fill_value, values.dtype) | ||
|
||
unit = None | ||
if hasattr(fill_value, 'unit'): | ||
unit = fill_value.unit | ||
fill_value = fill_value.value | ||
|
||
percentile = np.asanyarray(percentile) | ||
if percentile.shape == (): | ||
cut_table["cut"] = np.asanyarray(fill_value, values.dtype) | ||
else: | ||
cut_table["cut"] = np.full((n_bins, len(percentile)), fill_value, dtype=values.dtype) | ||
|
||
if unit is not None: | ||
cut_table["cut"].unit = unit | ||
|
||
for bin_idx, group in zip(by_bin.groups.keys, by_bin.groups): | ||
# replace bins with too few events with fill_value | ||
n_events = len(group) | ||
cut_table["n_events"][bin_idx] = n_events | ||
|
||
if n_events < min_events: | ||
cut_table["cut"][bin_idx] = fill_value | ||
cut_table["cut"].value[bin_idx] = fill_value | ||
else: | ||
value = np.nanpercentile(group["values"], percentile) | ||
if min_value is not None or max_value is not None: | ||
|
@@ -93,6 +107,37 @@ def calculate_percentile_cut( | |
return cut_table | ||
|
||
|
||
def evaluate_binned_cut_by_index(values, bin_index, valid, cut_table, op): | ||
""" | ||
Evaluate a binned cut as defined in cut_table on given events. | ||
|
||
Events with bin_values outside the bin edges defined in cut table | ||
will be set to False. | ||
|
||
Parameters | ||
---------- | ||
values: ``~numpy.ndarray`` or ``~astropy.units.Quantity`` | ||
The values on which the cut should be evaluated | ||
cut_table: ``~astropy.table.Table`` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing doc for bin_index |
||
A table describing the binned cuts, e.g. as created by | ||
``~pyirf.cuts.calculate_percentile_cut``. | ||
Required columns: | ||
- `low`: lower edges of the bins | ||
- `high`: upper edges of the bins, | ||
- `cut`: cut value | ||
op: callable(a, b) -> bool | ||
A function taking two arguments, comparing element-wise and | ||
returning an array of booleans. | ||
Must support vectorized application. | ||
""" | ||
if not isinstance(cut_table, QTable): | ||
raise ValueError('cut_table needs to be an astropy.table.QTable') | ||
|
||
result = np.zeros(len(bin_index), dtype=bool) | ||
result[valid] = op(values[valid], cut_table["cut"][bin_index[valid]]) | ||
return result | ||
|
||
|
||
def evaluate_binned_cut(values, bin_values, cut_table, op): | ||
""" | ||
Evaluate a binned cut as defined in cut_table on given events. | ||
|
@@ -123,10 +168,7 @@ def evaluate_binned_cut(values, bin_values, cut_table, op): | |
|
||
bins = np.append(cut_table["low"], cut_table["high"][-1]) | ||
bin_index, valid = calculate_bin_indices(bin_values, bins) | ||
|
||
result = np.zeros(len(values), dtype=bool) | ||
result[valid] = op(values[valid], cut_table["cut"][bin_index[valid]]) | ||
return result | ||
return evaluate_binned_cut_by_index(values, bin_index, valid, cut_table, op) | ||
|
||
|
||
def compare_irf_cuts(cuts): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doc is wrong: it's not the cuts being passed here, but the array of values over which the brute-force optimization scan is done, right? Also, for symmetry, shouldn't there be a similar input for multiplicity?