7 changes: 7 additions & 0 deletions NAMESPACE
@@ -67,6 +67,13 @@ export(mtr_tpr)
export(mtr_true_negative_rate)
export(mtr_true_positive_rate)
export(mtr_youden_index)
export(mtr_mutual_info_score)
export(mtr_adjusted_rand_score)
export(mtr_homogeneity)
export(mtr_completeness)
export(mtr_v_measure)
export(mtr_calinski_harabasz)
export(mtr_davies_bouldin_score)
importFrom(Rcpp,evalCpp)
importFrom(stats,complete.cases)
importFrom(stats,median)
242 changes: 242 additions & 0 deletions R/clustering.r
@@ -0,0 +1,242 @@
##' @title
##' Clustering Metrics Parameters
##'
##' @description
##' Documentation for shared parameters of functions that compute clustering
##' metrics.
##'
##' @param actual \code{[numeric]} The ground truth numeric vector.
##' @param predicted \code{[numeric]} The predicted numeric vector, where each
##' element in the vector is a prediction of the corresponding elements in
##' \code{actual}.
##' @name clustering_params
##' @include helper-functions.r
NULL


##' @title
##' Mutual Information Score
##'
##' @description
##' \code{mtr_mutual_info_score} measures the similarity, or mutual dependence,
##' between two labelings of the same data. The worst possible score is 0;
##' higher values are better.
##'
##' @inheritParams clustering_params
##' @seealso \code{\link{mtr_adjusted_rand_score}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##'
##' act <- sample(1:10, 100, replace = TRUE)
##' pred <- sample(1:10, 100, replace = TRUE)
##' mtr_mutual_info_score(act, pred)
##'
##' act <- rep(c('a', 'b', 'c'), times = 4)
##' pred <- rep(c('a', 'b', 'c'), each = 4)
##' mtr_mutual_info_score(act, pred)
##'
##' @export
mtr_mutual_info_score <- function(actual, predicted) {
  check_empty_vec(actual)
  check_equal_length(actual, predicted)
  entropy(actual) + entropy(predicted) - joint_entropy(vec_1 = actual,
                                                       vec_2 = predicted)
}
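
A quick sanity check of the identity implemented above, MI = H(actual) + H(predicted) - H(actual, predicted). This is an illustrative sketch, not part of the diff, assuming the package is loaded: the score of a labeling with itself reduces to its entropy, and the score is symmetric in its arguments.

act <- rep(c("a", "b", "c"), times = 4)
pred <- rep(c("a", "b", "c"), each = 4)
mtr_mutual_info_score(act, act)    # equals entropy(act) = log(3), about 1.099
all.equal(mtr_mutual_info_score(act, pred),
          mtr_mutual_info_score(pred, act))   # TRUE: MI is symmetric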

# Internal helpers (not exported): normalized and chance-adjusted variants
# of the mutual information score.
mtr_normalized_mutual_info_score <- function(actual, predicted) {
  mtr_mutual_info_score(actual = actual, predicted = predicted) /
    mean(c(entropy(vec = actual), entropy(vec = predicted)))
}

mtr_adjusted_mutual_info_score <- function(actual, predicted) {
  emi <- expected_mutual_info(actual, predicted)
  (mtr_mutual_info_score(actual, predicted) - emi) /
    (mean(c(entropy(actual), entropy(predicted))) - emi)
}
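
Neither variant above is exported yet. A rough illustration of why the chance adjustment matters, assuming the internals are callable (for example from a devtools::load_all() session): two independent random labelings still share raw mutual information, while the adjusted score sits near 0.

set.seed(42)
act <- sample(1:10, 100, replace = TRUE)
pred <- sample(1:10, 100, replace = TRUE)
mtr_mutual_info_score(act, pred)            # clearly above 0
mtr_adjusted_mutual_info_score(act, pred)   # close to 0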

##' @title
##' Adjusted Rand Score
##'
##' @description
##' \code{mtr_adjusted_rand_score} measures the similarity between two
##' labelings of the same data, adjusted for chance. A perfect score is 1;
##' the expected score for two random labelings is close to 0, and the score
##' can be negative.
##'
##' @inheritParams clustering_params
##' @seealso \code{\link{mtr_mutual_info_score}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##'
##' act <- sample(1:10, 100, replace = TRUE)
##' pred <- sample(1:10, 100, replace = TRUE)
##' mtr_adjusted_rand_score(act, pred)
##'
##' act <- rep(c('a', 'b', 'c'), times = 4)
##' pred <- rep(c('a', 'b', 'c'), each = 4)
##' mtr_adjusted_rand_score(act, pred)
##'
##' @export
mtr_adjusted_rand_score <- function(actual, predicted) {
  check_empty_vec(actual)
  check_equal_length(actual, predicted)
  n <- length(actual)
  index <- sum(choose(table(actual, predicted), 2))
  expected_index <- sum(choose(table(actual), 2)) *
    sum(choose(table(predicted), 2)) / choose(n, 2)
  max_index <- (sum(choose(table(actual), 2)) +
                  sum(choose(table(predicted), 2))) / 2
  (index - expected_index) / (max_index - expected_index)
}
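
Two properties worth eyeballing, as a sketch assuming the package is loaded: identical labelings score exactly 1, and the score depends only on the induced partition, not on the label names.

act <- rep(c("a", "b", "c"), times = 4)
mtr_adjusted_rand_score(act, act)          # 1
relabeled <- c(a = "x", b = "y", c = "z")[act]
mtr_adjusted_rand_score(act, relabeled)    # also 1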

##' @title
##' Homogeneity, Completeness, V-measure
##'
##' @description
##' \code{mtr_homogeneity} and \code{mtr_completeness} each measure one aspect
##' of clustering quality: the former is highest when every cluster contains
##' only members of a single label, and the latter is highest when all elements
##' of a given label are assigned to the same cluster.
##' Both scores range from 0 (worst) to 1 (best).
##' \code{mtr_v_measure} is the harmonic mean of the homogeneity and
##' completeness scores.
##'
##' @inheritParams clustering_params
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##'
##' act <- sample(1:10, 100, replace = TRUE)
##' pred <- sample(1:10, 100, replace = TRUE)
##'
##' mtr_homogeneity(act, pred)
##' mtr_completeness(act, pred)
##' mtr_v_measure(act, pred)
##'
##' @export
mtr_homogeneity <- function(actual, predicted) {
  check_equal_length(actual, predicted)
  1 - conditional_entropy(actual, predicted) / entropy(actual)
}

##' @rdname mtr_homogeneity
##' @export
mtr_completeness <- function(actual, predicted) {
  check_equal_length(actual, predicted)
  1 - conditional_entropy(predicted, actual) / entropy(predicted)
}

##' @rdname mtr_homogeneity
##' @export
mtr_v_measure <- function(actual, predicted) {
  h <- mtr_homogeneity(actual, predicted)
  cp <- mtr_completeness(actual, predicted)
  2 * h * cp / (h + cp)
}
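
An illustrative sketch of the trade-off these three scores capture, assuming the package is loaded: splitting one true class across two clusters keeps homogeneity at 1 but drags completeness, and with it the v-measure, below 1.

act <- rep(c("a", "b", "c"), each = 4)
pred <- c("a1", "a1", "a2", "a2", rep("b", 4), rep("c", 4))
mtr_homogeneity(act, pred)    # 1: every cluster is pure
mtr_completeness(act, pred)   # < 1: class "a" is spread over two clusters
mtr_v_measure(act, pred)      # harmonic mean, also < 1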

##' @title
##' Calinski-Harabasz Score (Variance Ratio Criterion)
##'
##' @description
##' \code{mtr_calinski_harabasz} measures the 'goodness' of a clustering when
##' the ground truth is unknown. A higher score means the clusters are dense
##' and well separated.
##'
##' @param matrix_feature \code{[data.frame|matrix]} The features the
##'   clustering was computed on, one row per observation.
##' @inheritParams clustering_params
##' @seealso \code{\link{mtr_davies_bouldin_score}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##' dt <- iris[,-5]
##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)
##' mtr_calinski_harabasz(dt, pred)
##'
##' @export
mtr_calinski_harabasz <- function(matrix_feature, predicted) {
  check_equal_cluster_length(matrix_feature, predicted)
  dt_center <- apply(matrix_feature, 2, FUN = mean)
  N <- length(predicted)
  num_cluster <- length(unique(predicted))

  check_number_of_labels(n_labels = num_cluster, n_samples = N)

  dispersion_between_cluster <- 0
  dispersion_within_cluster <- 0

  for (cluster_val in unique(predicted)) {
    in_cluster <- which(predicted == cluster_val)
    size_cluster <- length(in_cluster)
    # drop = FALSE keeps a single-row cluster as a data frame
    dt_cluster <- matrix_feature[in_cluster, , drop = FALSE]
    cluster_center <- apply(dt_cluster, 2, FUN = mean)

    dispersion_between_cluster <- dispersion_between_cluster +
      size_cluster * sum((cluster_center - dt_center) ^ 2)
    dispersion_within_cluster <- dispersion_within_cluster +
      sum((t(dt_cluster) - cluster_center) ^ 2)
  }

  (dispersion_between_cluster / dispersion_within_cluster) *
    ((N - num_cluster) / (num_cluster - 1))
}
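
A typical use is comparing candidate cluster counts when no labels exist. A minimal sketch with stats::kmeans on the iris features, assuming the package is loaded; on this data the score usually favors the well-separated low-k solutions.

dt <- iris[, -5]
set.seed(123)
for (k in 2:5) {
  km <- kmeans(dt, centers = k, nstart = 10)
  cat("k =", k, "CH =", mtr_calinski_harabasz(dt, km$cluster), "\n")
}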

##' @title
##' Davies-Bouldin Score
##'
##' @description
##' \code{mtr_davies_bouldin_score} measures the 'goodness' of a clustering
##' when the ground truth is unknown. The lowest and best possible score is 0;
##' lower scores mean the clusters are dense and well separated.
##'
##' @param matrix_feature \code{[data.frame|matrix]} The features the
##'   clustering was computed on, one row per observation.
##' @inheritParams clustering_params
##' @seealso \code{\link{mtr_calinski_harabasz}}
##' @return A numeric scalar output
##' @author Phuc Nguyen
##' @examples
##' dt <- iris[,-5]
##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)
##' mtr_davies_bouldin_score(dt, pred)
##'
##' @export
mtr_davies_bouldin_score <- function(matrix_feature, predicted) {
  check_equal_cluster_length(matrix_feature, predicted)
  similarity_sum <- 0
  for (cluster_val in unique(predicted)) {
    dt_cluster <- matrix_feature[which(predicted == cluster_val), , drop = FALSE]
    cluster_center <- apply(dt_cluster, 2, FUN = mean)
    cluster_diameter <- mean_distance(dt_cluster, cluster_center)
    max_similarity <- 0

    for (other_cluster_val in unique(predicted)[unique(predicted) != cluster_val]) {
      dt_other_cluster <- matrix_feature[which(predicted == other_cluster_val), , drop = FALSE]
      other_cluster_center <- apply(dt_other_cluster, 2, FUN = mean)
      other_cluster_diameter <- mean_distance(dt_other_cluster, other_cluster_center)
      between_distance <- pairwise_distance(cluster_center, other_cluster_center)
      similarity <- (cluster_diameter + other_cluster_diameter) / between_distance
      max_similarity <- max(max_similarity, similarity)
    }
    similarity_sum <- similarity_sum + max_similarity
  }

  similarity_sum / length(unique(predicted))
}
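
The same kind of model selection works here, with the direction reversed: lower is better. A sketch assuming the package is loaded; an over-split k-means solution will typically score worse than the natural three-cluster one.

dt <- iris[, -5]
set.seed(123)
km3 <- kmeans(dt, centers = 3, nstart = 10)$cluster
km8 <- kmeans(dt, centers = 8, nstart = 10)$cluster
mtr_davies_bouldin_score(dt, km3)   # lower (better)
mtr_davies_bouldin_score(dt, km8)   # typically higher (worse)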
117 changes: 116 additions & 1 deletion R/helper-functions.r
@@ -1,5 +1,11 @@


check_empty_vec <- function(vec) {
  if (length(vec) == 0) {
    stop("vector must have positive length.", call. = FALSE)
  }

  invisible()
}

check_equal_length <- function(actual, predicted) {

@@ -10,6 +16,16 @@ check_equal_length <- function(actual, predicted) {
invisible()
}

check_equal_cluster_length <- function(dt, predicted) {

  if (nrow(dt) != length(predicted)) {
    stop("`dt` must have exactly one row per element of `predicted`.", call. = FALSE)
  }

  invisible()
}


check_cutoff_range <- function(cutoff) {

if (cutoff > 1 || cutoff < 0) {
@@ -28,6 +44,15 @@ check_binary <- function(actual) {
invisible()
}

check_number_of_labels <- function(n_labels, n_samples) {

  if (!(n_labels > 1 && n_labels < n_samples)) {
    stop("Number of labels is invalid. Valid values are 2 to n_samples - 1.", call. = FALSE)
  }

  invisible()
}

clip <- function(x, mi, ma) {
clip_(x, mi, ma)
}
@@ -60,3 +85,93 @@ trapezoid <- function(x, y) {

sum(dx * height)
}

class_prob <- function(vec, class) {
  # Proportion of elements in `vec` equal to `class`
  check_empty_vec(vec)
  length(which(vec == class)) / length(vec)
}

entropy <- function(vec) {
  # Shannon entropy (natural log) of the label distribution in `vec`
  check_empty_vec(vec)
  li <- c()
  for (cl in unique(vec)) {
    p <- class_prob(vec = vec, class = cl)
    li <- c(li, -p * log(p))
  }
  sum(li, na.rm = TRUE)
}
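
Two boundary cases that pin down this helper's behavior (note it uses the natural log); a sketch, assuming a development session where this internal function is visible:

entropy(rep("a", 10))          # 0: a single class has no uncertainty
entropy(rep(c("a", "b"), 5))   # log(2), about 0.693: two equally likely classes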

joint_class_prob <- function(vec_1, vec_2, class_1, class_2) {
  check_empty_vec(vec_1)
  check_equal_length(vec_1, vec_2)
  length(which(vec_1 == class_1 & vec_2 == class_2)) / length(vec_1)
}

joint_entropy <- function(vec_1, vec_2) {
  # Shannon entropy of the joint label distribution of `vec_1` and `vec_2`
  check_equal_length(vec_1, vec_2)
  li <- c()
  for (cl_1 in unique(vec_1)) {
    for (cl_2 in unique(vec_2)) {
      p <- joint_class_prob(vec_1 = vec_1, vec_2 = vec_2,
                            class_1 = cl_1, class_2 = cl_2)
      # p == 0 yields NaN here and is dropped by na.rm below
      li <- c(li, -p * log(p))
    }
  }
  sum(li, na.rm = TRUE)
}
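
Two identities that make for cheap regression tests of this internal helper, assuming it is visible in a development session:

x <- rep(c("a", "b"), each = 5)
y <- rep(c("u", "v"), times = 5)
all.equal(joint_entropy(x, x), entropy(x))        # H(X, X) = H(X)
joint_entropy(x, y) <= entropy(x) + entropy(y)    # subadditivity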

expected_mutual_info <- function(vec_1, vec_2) {
  # Expected mutual information of two labelings under the hypergeometric
  # model of random labelings with fixed marginals
  check_equal_length(vec_1, vec_2)
  N <- length(vec_1)
  li <- c()
  for (i in unique(vec_1)) {
    a <- length(which(vec_1 == i))
    for (j in unique(vec_2)) {
      b <- length(which(vec_2 == j))
      # nij = 0 contributes nothing, so the sum starts at 1
      for (nij in max(a + b - N, 1):min(a, b)) {
        # hypergeometric term in log space, so factorial() cannot overflow
        log_term <- lfactorial(a) + lfactorial(b) +
          lfactorial(N - a) + lfactorial(N - b) -
          lfactorial(N) - lfactorial(nij) - lfactorial(a - nij) -
          lfactorial(b - nij) - lfactorial(N - a - b + nij)
        li <- c(li, (nij / N) * log((N * nij) / (a * b)) * exp(log_term))
      }
    }
  }
  sum(li, na.rm = TRUE)
}

conditional_entropy <- function(vec_1, vec_2) {
  # Conditional entropy H(vec_1 | vec_2)
  check_equal_length(vec_1, vec_2)
  N <- length(vec_1)
  li <- c()
  for (i in unique(vec_1)) {
    for (j in unique(vec_2)) {
      b <- length(which(vec_2 == j))
      nij <- length(which(vec_1 == i & vec_2 == j))
      # nij == 0 yields NaN here and is dropped by na.rm below
      li <- c(li, -nij / N * log(nij / b))
    }
  }
  sum(li, na.rm = TRUE)
}
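
This ties the helpers together through the chain rule H(X | Y) = H(X, Y) - H(Y); a quick check, assuming the internals are visible:

x <- rep(c("a", "b"), each = 5)
y <- rep(c("u", "v"), times = 5)
all.equal(conditional_entropy(x, y),
          joint_entropy(x, y) - entropy(y))   # TRUE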

pairwise_distance <- function(vec_1, vec_2) {
  # Euclidean distance between two points
  check_equal_length(vec_1, vec_2)
  sqrt(sum((vec_1 - vec_2) ^ 2))
}
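
A one-line cross-check against base R's stats::dist, as a sketch:

p1 <- c(0, 0)
p2 <- c(3, 4)
pairwise_distance(p1, p2)                     # 5
all.equal(pairwise_distance(p1, p2),
          as.numeric(dist(rbind(p1, p2))))    # TRUE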

mean_distance <- function(set, pt) {
  # Mean Euclidean distance from each row of `set` (a data frame or matrix
  # of points) to the single point `pt`
  check_equal_length(set, pt)  # length() of a data frame is its column count
  total_dist <- 0
  for (i in seq_len(nrow(set))) {
    total_dist <- total_dist + pairwise_distance(set[i, ], pt)
  }
  total_dist / nrow(set)
}
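
A small worked example, assuming the helper is visible: both rows below sit at distance 3 from the origin, so the mean distance is exactly 3.

pts <- data.frame(x = c(0, 0), y = c(3, -3))
mean_distance(pts, c(0, 0))   # 3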
