Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ export(mtr_tpr)
export(mtr_true_negative_rate)
export(mtr_true_positive_rate)
export(mtr_youden_index)
export(mtr_mutual_info_score)
export(mtr_adjusted_rand_score)
importFrom(Rcpp,evalCpp)
importFrom(stats,complete.cases)
importFrom(stats,median)
Expand Down
96 changes: 96 additions & 0 deletions R/clustering.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
##' @title
##' Clustering Metrics Parameters
##'
##' @description
##' Documentation for shared parameters of functions that compute clustering
##' metrics.
##'
##' @param actual \code{[numeric]} The ground truth numeric vector.
##' @param predicted \code{[numeric]} The predicted numeric vector, where each
##' element in the vector is a prediction of the corresponding elements in
##' \code{actual}.
##' @name clustering_params
##' @include helper-functions.r
NULL


##' @title
##' Adjusted Mutual Information Score / Mututal Information Score
##'
##'
##' @description
##'
##' \code{mtr_mutual_info_score} measures the similarity, or mutual dependence
##' between two variable. The worst possible score is 0, higher values are
##' better.
##'
##'
##' @inheritParams clustering_params
##' @importFrom stats var
##' @seealso \code{\link{mtr_adjusted_rand_score}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##'
##' act <- sample(1:10, 100, replace = T)
##' pred <- sample(1:10, 100, replace = T)
##' mtr_mutual_info_score(act, pred)
##'
##' act <- rep(c('a', 'b', 'c'), times = 4)
##' pred <- rep(c('a', 'b', 'c'), each = 4)
##' mtr_mutual_info_score(act, pred)
##'
##' @export
mtr_mutual_info_score <- function(actual, predicted) {
chec_empty_vec(actual)
check_equal_length(actual, predicted)
entropy(actual) + entropy(predicted) - joint_entropy(vec_1 = actual,
vec_2 = predicted)
}

mtr_normalized_mutual_info_score <- function(actual, predicted) {
mtr_mutual_info_score(actual = actual, predicted = predicted) /
mean(c(entropy(vec = actual), entropy(vec = predicted)))
}

mtr_adjusted_mutual_info_score <- function(actual, predicted) {
(mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) /
(mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted))
}

##' @title
##' Adjusted Rand Score
##'
##'
##' @description
##'
##' \code{mtr_adjusted_rand_score} measures the similarity, or mutual dependence
##' between two variable. Perfect score is 1. Score between total random vectors
##' is close to 0. Score can be negative.
##'
##'
##' @inheritParams clustering_params
##' @importFrom base choose
##' @seealso \code{\link{mtr_mutual_info_score}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##'
##' act <- sample(1:10, 100, replace = T)
##' pred <- sample(1:10, 100, replace = T)
##' mtr_adjusted_rand_score(act, pred)
##'
##' act <- rep(c('a', 'b', 'c'), times = 4)
##' pred <- rep(c('a', 'b', 'c'), each = 4)
##' mtr_adjusted_rand_score(act, pred)
##'
##' @export

mtr_adjusted_rand_score <- function(actual, predicted) {
check_equal_length(actual, predicted)
N = length(actual)
a = sum(choose(table(actual, predicted), 2))
b = sum(choose(table(actual), 2)) * sum(choose(table(predicted), 2)) / choose(N, 2)
c = 1/2 * (sum(choose(table(actual), 2)) + sum(choose(table(predicted), 2)))
(a - b) / (c - b)
}
65 changes: 64 additions & 1 deletion R/helper-functions.r
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@


chec_empty_vec <- function(vec) {
if (length(vec) == 0) {
stop("vector must have positive length.", call. = FALSE)
}

invisible()
}

check_equal_length <- function(actual, predicted) {

Expand Down Expand Up @@ -60,3 +66,60 @@ trapezoid <- function(x, y) {

sum(dx * height)
}

class_prob <- function(vec, class) {
chec_empty_vec(vec)
length(which(vec == class)) / length(vec = vec)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is changed as a mistake and would be fixed in next commit

}

entropy <- function(vec) {
chec_empty_vec(vec)
li = c()
for (cl in unique(vec)) {
m = class_prob(vec = vec, class = cl)
li = c(li, -1 * m * log(m))
}
etp = sum(li, na.rm = TRUE)
etp
}

joint_class_prob <- function(vec_1, vec_2, class_1, class_2) {
chec_empty_vec(vec_1)
check_equal_length(vec_1, vec_2)
length(which(vec_1 == class_1 & vec_2 == class_2)) / length(vec_1)
}

joint_entropy <- function(vec_1, vec_2) {
check_equal_length(vec_1, vec_2)
li = c()
for(cl_1 in unique(vec_1)) {
for(cl_2 in unique(vec_2)) {
m = joint_class_prob(vec_1 = vec_1, vec_2 = vec_2,
class_1 = cl_1, class_2 = cl_2)
li = c(li, - 1 * m * log(m))
}
}
joint_etp = sum(li, na.rm = TRUE)
joint_etp
}

expected_mutual_info <- function(vec_1, vec_2) {
check_equal_length(vec_1, vec_2)
N = length(vec_1)
li = c()
for (i in unique(vec_1)) {
a = length(which(vec_1 == i))
for (j in unique(vec_2)) {
b = length(which(vec_2 == j))
for (nij in max(a + b - N, 0, na.rm = TRUE): min(a, b, na.rm = TRUE)) {
li = c(li, (nij / N) *
log((N * nij) / (a * b)) *
(factorial(a) * factorial(b) * factorial(N - a) * factorial(N - b)) /
(factorial(N) * factorial(nij) * factorial(a - nij) * factorial(b - nij) * factorial(N - a - b + nij)))
}
}
}
emi = sum(li, na.rm = TRUE)
emi
}

24 changes: 22 additions & 2 deletions TODO.org
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@




* List of performance metrics
Expand All @@ -23,6 +23,8 @@ Metrics that built around confusion matrix:

- [X] Balanced Accuracy

- [ ] Balanced Error Rate

- [X] Positive Predicted Value (PPV) / Precision

- [ ] Average Precision
Expand All @@ -31,12 +33,18 @@ Metrics that built around confusion matrix:

- [X] False Omission Rate (FOR)

- [ ] Positive Likelihood

- [ ] Negative Likelihood

- [X] Prevalence

- [X] F1 Score

- [X] Matthews Correlation Coefficient (MCC)

- [ ] Discriminant Power

- [X] Informedness (Bookmaker Informedness - BM) / Youden Index (Youden's J Statistic)

- [X] Markedness (MK)
Expand Down Expand Up @@ -77,19 +85,31 @@ Proper scoring rule:

- [X] Mean Squared Error

- [ ] Normalized Mean Squared Error

- [X] Root Mean Squared Error

- [X] Mean Squared Logarithmic Error

- [X] Median Absolute Error

- [ ] Mean Absolute Percentage Error

- [ ] Mean Absolute Scaled Error

- [ ] Median Squared Error

- [X] R2 Score

- [ ] Adjusted R2 Score

- [ ] M-Estimators

** Clustering tasks

- [ ] Adjusted Mututal Information Score / Mutual Information Score

- [ ] Adjusted Rand Score
- [X] Adjusted Rand Score

- [ ] Calinski-Harabasz Score

Expand Down
29 changes: 29 additions & 0 deletions inst/tinytest/test-clustering.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

## test correctness ------------------------------------------------------------

vec_a = c(0, 1, 2, 0, 3, 4, 5, 1)
vec_b = c(1, 1, 0, 0, 2, 2, 2, 2)

tinytest::expect_equal(
mtr_mutual_info_score(vec_a, vec_b),
target = 0.693147180559945,
tol = 1e-7
)

tinytest::expect_equal(
mtr_normalized_mutual_info_score(vec_a, vec_b),
target = 0.5163977794943221,
tol = 1e-7
)

tinytest::expect_equal(
mtr_adjusted_mutual_info_score(vec_a, vec_b),
target = -0.10526315789473674,
tol = 1e-7
)

tinytest::expect_equal(
mtr_adjusted_rand_score(vec_a, vec_b),
target = -0.12903225806451613,
tol = 1e-7
)