23 changes: 8 additions & 15 deletions .gitignore
@@ -1,20 +1,13 @@
syntax: glob
.python-version
.venv
env/*
venv/*
ENV/*
.idea
.vscode
.venv*
.vscode*
.DS_Store
dython.egg*/*
*.egg-info*
*__pycache__*
*run_stuff.py*

build/*
dist/*
build_deploy.sh
site/*
debug.py
.coverage
.hypothesis
.pytest_cache*

*.coverage*
*.hypothesis*
*.pytest_cache*
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,10 @@
# Change Log

## 0.7.12 (dev)
## 0.7.12
* _Dython now officially supports Python 3.14_
* Added new tests (issue [#172](https://github.com/shakedzy/dython/issues/172))
* `examples` module removed (all examples exist in the [official documentation](https://shakedzy.xyz/dython/getting_started/examples/))
* Added [Youden's J](https://en.wikipedia.org/wiki/Youden%27s_J_statistic) statistic to `model_utils.metric_graph` ROC Curve option _(breaking change: function signature has changed)_.

## 0.7.11
* Fixing dependency issue ([#170](https://github.com/shakedzy/dython/issues/170))
21 changes: 19 additions & 2 deletions docs/modules/model_utils.md
@@ -118,10 +118,15 @@ Plots true-positive rate as a function of the false-positive rate of the positiv
where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
(0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.

Computes the estimated optimal threshold using two methods:
* Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
* Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)

**Precision-Recall:**
Plots precision as a function of recall of the positive label in a binary classification, where
$Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
line with precision of the ratio of positive examples in the dataset.
Estimated optimal threshold is computed using Euclidean (geometric) distance.
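For reference, both threshold estimates can be reproduced directly from scikit-learn's curve outputs. The following is a minimal standalone sketch with toy data, not dython's internal implementation:

```python
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve

# Toy binary labels and predicted scores (stand-ins for real model output)
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.55, 0.5])

fpr, tpr, roc_thresholds = roc_curve(y_true, y_score)

# Geometric distance: closest curve point to the ROC optimum (0, 1)
geo_idx = np.argmin(fpr ** 2 + (1 - tpr) ** 2)
geo_threshold = roc_thresholds[geo_idx]

# Youden's J: maximize TPR - FPR (the point furthest above the diagonal)
j_idx = np.argmax(tpr - fpr)
youden_threshold = roc_thresholds[j_idx]

# Precision-Recall: closest curve point to the PR optimum (1, 1);
# the last precision/recall pair has no threshold, hence the [:-1] slicing
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_score)
pr_idx = np.argmin((1 - recall[:-1]) ** 2 + (1 - precision[:-1]) ** 2)
pr_threshold = pr_thresholds[pr_idx]

print(geo_threshold, youden_threshold, pr_threshold)
```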

Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) (as was seen on April 2018):

@@ -258,8 +263,20 @@ Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/mo
consider the data as a multiclass data rather than binary (useful when plotting
curves of different models one against the other)

**Returns:** A dictionary, one key for each class. Each value is another dictionary,
holding AUC and eOpT values.
**Returns:**
A dictionary with these keys:
- `ax`: the Matplotlib plot axis
- `metrics`: each key is a class name from the list of provided classes.
  For each class, another dict holds the AUC results and the results of
  each measurement method.
  The AUC key holds both the measured area-under-curve (under `val`)
  and the AUC of a random-guess classifier (under `naive`) for
  comparison.
  Each measurement-method key contains three values, `x`, `y` and `val`,
  corresponding to the (x,y) coordinates of the threshold on the metric
  graph and its value.
  If only one class exists, the measurement-method keys and AUC
  will be placed directly under `metrics`.
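
For illustration, a sketch of reading this structure with toy binary data. The exact call arguments are assumptions based on the signature shown in this change, and the `geo`/`youden_j` keys are present when estimated optimal thresholds are computed:

```python
import numpy as np
from dython.model_utils import metric_graph

# Toy binary labels and scores
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
y_pred = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.55, 0.5])

result = metric_graph(y_true, y_pred, metric="roc", plot=False)

ax = result["ax"]                  # the Matplotlib plot axis
m = result["metrics"]              # single class -> results sit directly here
print(m["auc"]["val"], m["auc"]["naive"])      # measured AUC vs. random guess
print(m["youden_j"]["val"])                    # Youden's J optimal threshold
print(m["youden_j"]["x"], m["youden_j"]["y"])  # its (x, y) point on the curve
# With multiclass input, result["metrics"]["<class name>"] holds the same keys
```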

**Example:** See [examples](../getting_started/examples.md).

8 changes: 4 additions & 4 deletions dython/_private.py
@@ -2,7 +2,7 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Any, Literal, cast, overload, Type
from typing import Any, cast, overload, Type
from .typing import OneDimArray, TwoDimArray


@@ -92,16 +92,16 @@ def convert(
)
)
else:
return converted
return converted # pyright: ignore[reportReturnType]


def remove_incomplete_samples(
x: OneDimArray,
y: OneDimArray,
) -> tuple[OneDimArray, OneDimArray]:

x = [v if v is not None else np.nan for v in x]
y = [v if v is not None else np.nan for v in y]
x = [v if v is not None else np.nan for v in x] # pyright: ignore[reportAssignmentType]
y = [v if v is not None else np.nan for v in y] # pyright: ignore[reportAssignmentType]
arr = np.array([x, y]).transpose()
arr = arr[~np.isnan(arr).any(axis=1)].transpose()
if isinstance(x, list):
143 changes: 98 additions & 45 deletions dython/model_utils.py
@@ -5,7 +5,7 @@
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.preprocessing import LabelEncoder
from typing import Any, Iterable
from .typing import Number, OneDimArray
from .typing import Number, OneDimArray, MetricGraphResult, SingleCurveResult, SingleMethodResult
from ._private import convert, plot_or_not

__all__ = ["random_forest_feature_importance", "metric_graph", "ks_abc"]
@@ -53,29 +53,38 @@ def _draw_estimated_optimal_threshold_mark(
ms: int,
fmt: str,
ax: Axes,
) -> tuple[Number, Number, Number]:
) -> list[tuple[Number, Number, Number]]:
annotation_offset = (-0.027, 0.03)
a = np.zeros((len(x_axis), 2))
a[:, 0] = x_axis
a[:, 1] = y_axis
a = a[a[:, 0] != a[:, 1]]
if metric == "roc":
dist = lambda row: row[0] ** 2 + (1 - row[1]) ** 2 # optimal: (0,1)
dists = [ # optimal: (0,1)
lambda row: row[0] ** 2 + (1 - row[1]) ** 2, # geo
lambda row: row[0] - row[1] # Inverse Youden's J (X-Y instead of Y-X) as later on we're finding the min value, and Youden's J needs to be maximized
]
else: # metric == 'pr'
dist = (
lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2
) # optimal: (1,1)
amin = np.apply_along_axis(dist, 1, a).argmin()
ax.plot(x_axis[amin], y_axis[amin], color=color, marker="o", ms=ms) # pyright: ignore[reportCallIssue, reportArgumentType]
ax.annotate(
"{th:{fmt}}".format(th=thresholds[amin], fmt=fmt), # pyright: ignore[reportCallIssue, reportArgumentType]
xy=(x_axis[amin], y_axis[amin]), # pyright: ignore[reportCallIssue, reportArgumentType]
color=color,
xytext=(
x_axis[amin] + annotation_offset[0], # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
y_axis[amin] + annotation_offset[1], # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
),
)
return thresholds[amin], x_axis[amin], y_axis[amin] # pyright: ignore[reportCallIssue, reportArgumentType, reportReturnType]
dists = [ # optimal: (1,1)
lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2 # geo
]
output_tuples = []
for dist, marker in zip(dists, ['o','x']):
amin = np.apply_along_axis(dist, 1, a).argmin()
ax.plot(x_axis[amin], y_axis[amin], color=color, marker=marker, ms=ms) # pyright: ignore[reportCallIssue, reportArgumentType]
ax.annotate(
"{th:{fmt}}".format(th=thresholds[amin], fmt=fmt), # pyright: ignore[reportCallIssue, reportArgumentType]
xy=(x_axis[amin], y_axis[amin]), # pyright: ignore[reportCallIssue, reportArgumentType]
color=color,
xytext=(
x_axis[amin] + annotation_offset[0], # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
y_axis[amin] + annotation_offset[1], # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
),
)
output_tuples.append(
(thresholds[amin], x_axis[amin], y_axis[amin]) # pyright: ignore[reportArgumentType, reportCallIssue]
)
return output_tuples


def _plot_macro_metric(
@@ -141,39 +150,58 @@ def _binary_metric_graph(
metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
)
if metric == "pr":
label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
label += ", naive = {ytr:{fmt}})".format(ytr=y_t_ratio, fmt=fmt)
if eoptimal:
eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
eopts = _draw_estimated_optimal_threshold_mark(
metric, x_axis, y_axis, th, color, ms, fmt, ax
)
label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
if len(eopts) == 1:
eopts.append((None, None, None)) # pyright: ignore[reportArgumentType]
else:
eopt = None
eopt_x = None
eopt_y = None
label += ")"
eopts = [
(None, None, None),
(None, None, None)
]
ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
return {
"x": x_axis,
"y": y_axis,
"thresholds": th,
"auc": auc_score,
"eopt": eopt,
"eopt_x": eopt_x,
"eopt_y": eopt_y,
"eopts": [
{
"eopt": eopts[0][0],
"eopt_x": eopts[0][1],
"eopt_y": eopts[0][2],
"name": "geo"
},
{
"eopt": eopts[1][0],
"eopt_x": eopts[1][1],
"eopt_y": eopts[1][2],
"name": "youden_j"
},
],
"y_t_ratio": y_t_ratio,
}


def _build_metric_graph_output_dict(
metric: str,
d: dict[str, Any]
) -> dict[str, dict[str, Any]]:
) -> SingleCurveResult:
naive = d["y_t_ratio"] if metric == "pr" else 0.5
return {
"auc": {"val": d["auc"], "naive": naive},
"eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
}
output: dict = {'auc': {"val": d["auc"], "naive": naive}}
for eopt in d['eopts']:
if eopt['eopt'] is None:
continue
method_result = SingleMethodResult(
x=eopt['eopt_x'],
y=eopt['eopt_y'],
val=eopt['eopt']
)
output[eopt['name']] = method_result
return output # pyright: ignore[reportReturnType]


def metric_graph(
@@ -199,15 +227,25 @@
title: str | None = None,
filename: str | None = None,
force_multiclass: bool = False,
) -> dict[str, Any]:
) -> MetricGraphResult:
"""
Plot a ROC graph of predictor's results (including AUC scores), where each
Plot a metric graph of predictor's results (including AUC scores), where each
row of y_true and y_pred represent a single example.
If there are 1 or two columns only, the data is treated as a binary
classification (see input example below).
If there are more then 2 columns, each column is considered a
unique class, and a ROC graph and AUC score will be computed for each.
A Macro-ROC and Micro-ROC are computed and plotted too by default.

**ROC:**
Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification,
where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
(0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.

Computes the estimated optimal threshold using two methods:
* Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
* Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)

**Precision-Recall:**
Plots precision as a function of recall of the positive label in a binary classification, where
$Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
line with precision of the ratio of positive examples in the dataset.
Estimated optimal threshold is computed using Euclidean (geometric) distance.

Based on sklearn examples (as was seen on April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -270,8 +308,20 @@

Returns:
--------
A dictionary, one key for each class. Each value is another dictionary,
holding AUC and eOpT values.
A dictionary with these keys:
- `ax`: the Matplotlib plot axis
- `metrics`: each key is a class name from the list of provided classes.
  For each class, another dict holds the AUC results and the results of
  each measurement method.
  The AUC key holds both the measured area-under-curve (under `val`)
  and the AUC of a random-guess classifier (under `naive`) for
  comparison.
  Each measurement-method key contains three values, `x`, `y` and `val`,
  corresponding to the (x,y) coordinates of the threshold on the metric
  graph and its value.
  If only one class exists, the measurement-method keys and AUC
  will be placed directly under `metrics`.


Binary Classification Input Example:
------------------------------------
@@ -325,7 +375,7 @@
else:
colors_list: list[str] = colors or _ROC_PLOT_COLORS

output_dict = dict()
output_dict: dict[str, SingleCurveResult] = {}
pr_naives = list()
if (
len(y_pred_array.shape) == 1
@@ -422,8 +472,11 @@
filename=filename,
plot=plot,
)
output_dict["ax"] = axis
return output_dict
metric_graph_result = MetricGraphResult(
ax=axis,
metrics=output_dict if len(output_dict) > 1 else output_dict[list(output_dict.keys())[0]]
)
return metric_graph_result


def random_forest_feature_importance(
9 changes: 2 additions & 7 deletions dython/nominal.py
@@ -13,10 +13,10 @@
from collections import Counter
from matplotlib.colors import Colormap
from matplotlib.axes._axes import Axes
from typing import Any, Callable, Iterable, Literal, TypedDict, cast, overload
from typing import Any, Callable, Iterable, Literal, cast, overload
from ._private import convert, remove_incomplete_samples, replace_nan_with_value, plot_or_not
from .data_utils import identify_columns_by_type
from .typing import Number, OneDimArray, TwoDimArray
from .typing import Number, OneDimArray, TwoDimArray, AssociationsResult


__all__ = [
@@ -53,11 +53,6 @@
NomNomAssocStr = Literal["cramer", "theil"]


class AssociationsResult(TypedDict):
corr: pd.DataFrame
ax: Axes | None


def _inf_nan_str(x: Number) -> str:
if np.isnan(x):
return "NaN"
24 changes: 23 additions & 1 deletion dython/typing.py
@@ -1,8 +1,30 @@
import numpy as np
import pandas as pd
from typing import Sequence, Any
from typing import Sequence, Any, TypedDict, Protocol
from matplotlib.axes._axes import Axes


Number = int | float
OneDimArray = Sequence[Number | str] | pd.Series | np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]]
TwoDimArray = np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]] | pd.DataFrame


class AssociationsResult(TypedDict):
corr: pd.DataFrame
ax: Axes | None


class SingleMethodResult(TypedDict):
x: float
y: float
val: float


class SingleCurveResult(Protocol):
auc: dict[str, float]
def __getitem__(self, key: str) -> SingleMethodResult: ...


class MetricGraphResult(TypedDict):
metrics: dict[str, SingleCurveResult] | SingleCurveResult
ax: Axes
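
As a rough, made-up illustration of how these result types nest at runtime (the class key `class_a` is hypothetical, and the per-curve dict is left unannotated because `SingleCurveResult` is a structural protocol rather than a TypedDict):

```python
import matplotlib.pyplot as plt
from dython.typing import SingleMethodResult  # available on this branch

_, ax = plt.subplots()

method: SingleMethodResult = {"x": 0.12, "y": 0.91, "val": 0.47}
curve = {
    "auc": {"val": 0.88, "naive": 0.5},  # measured AUC vs. random-guess AUC
    "geo": method,                       # geometric-distance threshold
    "youden_j": method,                  # Youden's J threshold
}
result = {"metrics": {"class_a": curve}, "ax": ax}  # shape of a MetricGraphResult
```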
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@
EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}

min_minor = 10
max_minor = 13
max_minor = 14
CLASSIFIERS = [
f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
]