Skip to content

Commit

Permalink
fixed bug in detection of pathological cases in devPAV (#92)
Browse files Browse the repository at this point in the history
- made new detection of pathological cases
- created output for pathological cases
- adjusted tests

---------

Co-authored-by: Camille van Dijk <c.van.dijk@nfi.nl>
Co-authored-by: Kim de Bie (DBS) <kim@7zj9vz2-l.holmes.nl>
Co-authored-by: camillevandijk <144999965+camillevandijk@users.noreply.github.com>
Co-authored-by: wb <20113294+wowtor@users.noreply.github.com>
  • Loading branch information
5 people authored Nov 8, 2023
1 parent cd8e3a1 commit 732266e
Show file tree
Hide file tree
Showing 18 changed files with 45,283 additions and 45,431 deletions.
10 changes: 6 additions & 4 deletions lir/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from typing import Optional, Tuple, Union, Iterable, Callable, Sized
from typing import Optional, Tuple, Union, Callable, Sized

from .bayeserror import elub
from .loss_functions import negative_log_likelihood_balanced
Expand Down Expand Up @@ -500,6 +500,8 @@ def __init__(self, n_components_H0=1, n_components_H1=1):
warnings.warn(f"the class {type(self).__name__} will be removed in the future")
self.n_components_H1 = n_components_H1
self.n_components_H0 = n_components_H0
self._model0 = None
self._model1 = None

def fit(self, X, y):
X0, X1 = Xy_to_Xn(X, y)
Expand Down Expand Up @@ -582,21 +584,21 @@ def fit(self, X, y):
# then define 4PL-logistic model
self.model = self._four_pl_model
bounds.extend([(10**-10, 1-10**-10), (10**-10, np.inf)])
LOG.warning("There were -Inf lrs for the same source samples and Inf lrs for the different source samples "
LOG.debug("There were -Inf lrs for the same source samples and Inf lrs for the different source samples "
", therefore a 4pl calibrator was fitted.")
elif estimate_c:
# then define 3-PL logistic model. Set 'd' to 0
self.model = partial(self._four_pl_model, d=0)
# use very small values since limits result in -inf llh
bounds.append((10**-10, 1-10**-10))
LOG.warning("There were -Inf lrs for the same source samples, therefore a 3pl calibrator was fitted.")
LOG.debug("There were -Inf lrs for the same source samples, therefore a 3pl calibrator was fitted.")
elif estimate_d:
# then define 3-PL logistic model. Set 'c' to 0
# use bind since 'c' is intermediate variable. In that case partial does not work.
self.model = Bind(self._four_pl_model, ..., ..., ..., 0, ...)
# use very small value since limits result in -inf llh
bounds.append((10**-10, np.inf))
LOG.warning("There were Inf lrs for the different source samples, therefore a 3pl calibrator was fitted.")
LOG.debug("There were Inf lrs for the different source samples, therefore a 3pl calibrator was fitted.")
else:
# define ordinary logistic model (no regularization, so maximum likelihood estimates)
self.model = partial(self._four_pl_model, c=0, d=0)
Expand Down
128 changes: 36 additions & 92 deletions lir/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,57 +62,7 @@ def cllr_min(lrs, y, weights=(1, 1)):
return cllr(lrmin, y, weights)


def devpav_estimated(lrs, y, resolution=1000):
"""
Estimate devPAV, a metric for calibration.
devPAV is the cumulative deviation of the PAV transformation from
the identity line. It is calculated in the LR range where misleading LRs
occur.
See also: P. Vergeer, Measuring calibration of likelihood ratio systems: a
comparison of four systems, including a new metric devPAV, to appear
This implementation estimates devPAV by calculating the average deviation
for a large number of LRs.
Parameters
----------
lrs : a numpy array of LRs
y : a numpy array of labels (0 or 1)
resolution : the number of measurements in the range of misleading evidence; a higher value yields a more accurate estimation
Returns
-------
devPAV
an estimation of devPAV
"""
lrs0, lrs1 = Xy_to_Xn(lrs, y)
if len(lrs0) == 0 or len(lrs1) == 0:
raise ValueError('devpav: illegal input: at least one value is required for each class')

# find misleading LR extremes
first_misleading = np.min(lrs1)
last_misleading = np.max(lrs0)
if first_misleading > last_misleading: # test for perfect discrimination
return 0

if np.isinf(first_misleading) or np.isinf(last_misleading): # test for infinitely misleading LRs
return np.inf

# calibrate on the input LRs
cal = IsotonicCalibrator()
cal.fit_transform(to_probability(lrs), y)

# take `resolution` points evenly divided along the range of misleading LRs
xlr = np.exp(np.linspace(np.log(first_misleading), np.log(last_misleading), resolution))
pavlr = cal.transform(to_probability(xlr))

devlr = np.absolute(np.log10(xlr) - np.log10(pavlr))
return (np.sum(devlr) / resolution) * (np.log10(last_misleading) - np.log10(first_misleading))


def calcsurface_f(c1, c2):
def _calcsurface(c1: (float, float), c2: (float, float)) -> float:
"""
Helperfunction that calculates the desired surface for two xy-coordinates
"""
Expand All @@ -125,8 +75,10 @@ def calcsurface_f(c1, c2):

if a == 1:
# dan xs equals +/- Infinite en is er there is no intersection with the identity line
# since condition 1 holds the product below is always positive
surface = (y2 - y1) * (x2 - x1)

# the surface of the parallellogram is:
surface = (x2 - x1) * np.abs(y1 - x1)

elif (a < 0):
raise ValueError(f"slope is negative; impossible for PAV-transform. Coordinates are {c1} and {c2}. Calculated slope is {a}")
else:
Expand Down Expand Up @@ -168,46 +120,36 @@ def _devpavcalculator(lrs, pav_lrs, y):
Output: devPAV value
"""
DSLRs, SSLRs = Xy_to_Xn(lrs,y)
DSLRs, SSLRs = Xy_to_Xn(lrs, y)
DSPAVLRs, SSPAVLRs = Xy_to_Xn(pav_lrs, y)
PAVresult = np.concatenate([SSPAVLRs, DSPAVLRs])
Xen = np.concatenate([SSLRs, DSLRs])

# order coordinates based on x's then y's and filtering out identical datapoints
data = np.unique(np.array([Xen, PAVresult]), axis=1)
Xen = data[0,:]
Yen = data[1,:]
Xen = data[0, :]
Yen = data[1, :]

# pathological cases
# check if min(Xen) = 0 or max(Xen) = Inf. First min(Xen)
# eerst van drie: als Xen[0] == 0 en Xen[len(Xen)-1] != Inf
if Xen[0] == 0 and Xen[-1] != np.inf:
if Yen[0] == 0 and Yen[1] != 0:
# dan loopt er een lijn in de PAV transform tot {inf, -Inf} evenwijdig aan de lijn y=x
return (np.absolute(np.log10(Xen[1]) - np.log10(Yen[1])))
else:
# dan is Yen[0] finite of Yen[1] gelijk 0 en loopt er ergens een horizontale lijn tot Log(Xen[0]) = -Inf. devPAV wordt oneindig
return np.inf
# tweede van drie: als Xen[len(Xen)-1] == Inf en Xen[0] != 0
elif Xen[0] != 0 and Xen[-1] == np.inf:
if Yen[len(Yen) - 1] == np.inf and Yen[len(Yen) - 2] != np.inf:
# dan loopt er een lijn in de PAV transform tot {inf, -Inf} evenwijdig aan de lijn y=x
return (np.absolute(np.log10(Xen[len(Xen) - 2]) - np.log10(Yen[len(Yen) - 2])))
else:
# dan is Yen[len(Yen] finite of Yen[len(Yen-2] gelijk inf en loopt er ergens een horizontale lijn tot Log(Xen[len(Xen)]) = Inf. devPAV wordt oneindig
return np.inf
# derde van drie: als Xen[0] = 0 en Xen[len(Xen)-1] == Inf
elif Xen[0] == 0 and Xen[-1] == np.inf:
if Yen[len(Yen) - 1] == np.inf and Yen[len(Yen) - 2] != np.inf and Yen[0] == 0 and Yen[1] != 0:
# dan zijn de lijnen aan beide uiteinden evenwijdig met de lijn Y=X. Het berekenen van devPAV is het gemiddelde van twee coordinaten
devPAV = ((np.absolute(np.log10(Xen[len(Xen) - 2]) - np.log10(Yen[len(Yen) - 2]))) + np.absolute(
np.log10(Xen[1]) - np.log10(Yen[1]))) / 2
return (devPAV)
else:
# dan loopt er ergens een horizontale lijn met oneindig bereik is is devPAV Inf
return np.inf
# first one of four: PAV-transform has a horizonal line to log(X) = -Inf as to log(X) = Inf
if Yen[0] != 0 and Yen[-1] != np.inf and Xen[-1] == np.inf and Xen[-1] == np.inf:
return np.Inf

# second of four: PAV-transform has a horizontal line to log(X) = -Inf
if Yen[0] != 0 and Xen[0] == 0 and Yen[-1] == np.inf:
return np.Inf

# third of four: PAV-transform has a horizontal line to log(X) = Inf
if Yen[0] == 0 and Yen[-1] != np.inf and Xen[-1] == np.inf:
return np.Inf

# forth of four: PAV-transform has one vertical line from log(Y) = -Inf to log(Y) = Inf
wh = (Yen == 0) | (Yen == np.inf)
if np.sum(wh) == len(Yen):
return np.nan

else:
# dan is het geen pathological case met rare X-waarden en kan devPAV berekend worden
# then it is not a pathological case with weird X-values and devPAV can be calculated

# filtering out -Inf or 0 Y's
wh = (Yen > 0) & (Yen < np.inf)
Expand All @@ -219,24 +161,26 @@ def _devpavcalculator(lrs, pav_lrs, y):
if len(Xen) == 0:
return np.nan
elif len(Xen) == 1:
return (abs(Xen - Yen))
return abs(Xen - Yen)
# than calculate devPAV
else:
deltaX = Xen[-1] - Xen[0]
surface = (0)
surface = 0
for i in range(1, (len(Xen))):
surface = surface + calcsurface_f((Xen[i - 1], Yen[i - 1]), (Xen[i], Yen[i]))
devPAVs[i - 1] = calcsurface_f((Xen[i - 1], Yen[i - 1]), (Xen[i], Yen[i]))
surface = surface + _calcsurface((Xen[i - 1], Yen[i - 1]), (Xen[i], Yen[i]))
devPAVs[i - 1] = _calcsurface((Xen[i - 1], Yen[i - 1]), (Xen[i], Yen[i]))
# return(list(surface/a, PAVresult, Xen, Yen, devPAVs))
return (surface / deltaX)
return surface / deltaX


def devpav(lrs, y):
def devpav(lrs: np.ndarray, y: np.ndarray) -> float:
"""
calculates PAV transform of LR data under H1 and H2.
calculates devPAV for LR data under H1 and H2.
"""
if sum(y) == len(y) or sum(y) == 0:
raise ValueError('devpav: illegal input: at least one value is required for each class')
cal = IsotonicCalibrator()
pavlrs = cal.fit_transform(to_probability(lrs), y)
pavlrs = cal.fit_transform(lrs, y)
return _devpavcalculator(lrs, pavlrs, y)


Expand Down
2 changes: 0 additions & 2 deletions lir/multiclass/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from . import metrics
from . import util
from . import calibration as calibration
from .calibration import DummyCalibrator, LogitCalibrator, BalancedPriorCalibrator
from .lr import CalibratedScorer
from .util import to_probability, to_odds
6 changes: 1 addition & 5 deletions lir/multiclass/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,13 @@
returns a two dimensional array of lrs; same dimensions as X
"""
import logging
import math
import warnings

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KernelDensity

from ..calibration import DummyCalibrator
from .util import to_odds, get_classes_from_scores_Xy, to_probability
from ..util import to_odds, to_probability

LOG = logging.getLogger(__name__)

Expand Down
4 changes: 1 addition & 3 deletions lir/multiclass/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import numpy as np

from ..util import to_probability, to_odds, get_classes_from_Xy, Xy_to_Xn
from ..util import get_classes_from_Xy, Xy_to_Xn


def get_classes_from_scores_Xy(scores, y, classes=None):
Expand Down
2 changes: 0 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
author="Netherlands Forensic Institute",
author_email="fbda@nfi.nl",
packages=find_packages(),
setup_requires=["nose"],
test_suite="nose.collector",
install_requires=["matplotlib", "numpy", "scipy", "scikit-learn", "tqdm"],
classifiers=[
"License :: OSI Approved :: Apache Software License",
Expand Down
Empty file added tests/__init__.py
Empty file.
9 changes: 0 additions & 9 deletions tests/context.py

This file was deleted.

Loading

0 comments on commit 732266e

Please sign in to comment.