Skip to content

Commit d6bb874

Browse files
committed
Fixes #242
Fixes #243
1 parent 95757c7 commit d6bb874

15 files changed

+337
-10
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export(":=")
55
export(.data)
66
export(as_label)
77
export(as_name)
8+
export(check_duplicate_rows)
89
export(core_packages)
910
export(create_model_spec)
1011
export(create_splits)
@@ -35,6 +36,7 @@ export(make_regression_base_tbl)
3536
export(match_args)
3637
export(plot_regression_predictions)
3738
export(plot_regression_residuals)
39+
export(quantile_normalize)
3840
importFrom(magrittr,"%>%")
3941
importFrom(parsnip,cubist_rules)
4042
importFrom(parsnip,linear_reg)

NEWS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
None
55

66
## New Features
7-
None
7+
1. Fix #242 - Add function `quantile_normalize()`.
8+
2. Fix #243 - Add function `check_duplicate_rows()`.
89

910
## Minor Fixes and Improvements
1011
None

R/utils-check-dupe-rows.R

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#' Check for Duplicate Rows in a Data Frame
2+
#'
3+
#' @family Utility
4+
#'
5+
#' @author Steven P. Sanderson II, MPH
6+
#'
7+
#' @description
8+
#' This function checks for duplicate rows in a data frame.
9+
#'
10+
#' @param .data A data frame.
11+
#' @return A logical vector indicating whether each row is a duplicate or not.
12+
#' @details This function checks for duplicate rows by comparing each row in the
13+
#' data frame to every other row. If a row is identical to another row, it is
14+
#' considered a duplicate.
15+
#' @examples
16+
#' data <- data.frame(
17+
#' x = c(1, 2, 3, 1),
18+
#' y = c(2, 3, 4, 2),
19+
#' z = c(3, 2, 5, 3)
20+
#' )
21+
#'
22+
#' check_duplicate_rows(data)
23+
#'
24+
#' @seealso \code{\link{duplicated}}, \code{\link{anyDuplicated}}
25+
#'
26+
#' @name check_duplicate_rows
27+
NULL
28+
29+
#' @rdname check_duplicate_rows
30+
#' @export
31+
check_duplicate_rows <- function(.data) {
32+
!apply(.data, 1, function(x) length(unique(x)) == ncol(.data))
33+
}

R/utils-quantile-normalize.R

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#' Perform quantile normalization on a numeric matrix/data.frame
2+
#'
3+
#' @family Utility
4+
#'
5+
#' @author Steven P. Sanderson II, MPH
6+
#'
7+
#' @description This function will perform quantile normalization on two or more
8+
#' distributions of equal length. Quantile normalization is a technique used to make the distribution of values across different samples
9+
#' more similar. It ensures that the distributions of values for each sample have the same quantiles.
10+
#' This function takes a numeric matrix as input and returns a quantile-normalized matrix.
11+
#'
12+
#' @param .data A numeric matrix where each column represents a sample.
13+
#' @param .return_tibble A logical value that determines if the output should be a tibble. Default is 'FALSE'.
14+
#'
15+
#' @return A list object that has the following:
16+
#' \enumerate{
17+
#' \item A numeric matrix that has been quantile normalized.
18+
#' \item The row means of the quantile normalized matrix.
19+
#' \item The sorted data
20+
#' \item The ranked indices
21+
#' }
22+
#'
23+
#' @details
24+
#' This function performs quantile normalization on a numeric matrix by following these steps:
25+
#' \enumerate{
26+
#' \item Sort each column of the input matrix.
27+
#' \item Calculate the mean of each row across the sorted columns.
28+
#' \item Replace each column's sorted values with the row means.
29+
#' \item Unsort the columns to their original order.
30+
#' }
31+
#'
32+
#' @examples
33+
#' # Create a sample numeric matrix
34+
#' data <- matrix(rnorm(20), ncol = 4)
35+
#'
36+
#' # Perform quantile normalization
37+
#' normalized_data <- quantile_normalize(data)
38+
#' normalized_data
39+
#'
40+
#' as.data.frame(normalized_data$normalized_data) |>
41+
#' sapply(function(x) quantile(x, probs = seq(0, 1, 1 / 4)))
42+
#'
43+
#' quantile_normalize(data, .return_tibble = TRUE)
44+
#'
45+
#' @seealso
46+
#' \code{\link{rowMeans}}: Calculate row means.
47+
#'
48+
#' \code{\link{apply}}: Apply a function over the margins of an array.
49+
#'
50+
#' \code{\link{order}}: Order the elements of a vector.
51+
#'
52+
#' @name quantile_normalize
53+
NULL
54+
55+
#' @export
56+
#' @rdname quantile_normalize
57+
# Perform quantile normalization on a numeric matrix 'data_matrix'
58+
quantile_normalize <- function(.data, .return_tibble = FALSE) {
59+
# Checks ----
60+
if (!inherits(.data, c("matrix", "data.frame"))) {
61+
rlang::abort(
62+
message = "The input data must be a numeric matrix or data.frame.",
63+
use_cli_format = TRUE
64+
)
65+
}
66+
67+
if (!all(sapply(.data, is.numeric))) {
68+
rlang::abort(
69+
message = "The input data must be a numeric matrix or data.frame.",
70+
use_cli_format = TRUE
71+
)
72+
}
73+
74+
# Data ----
75+
# Get col_nms
76+
col_nms <- colnames(.data)
77+
data_matrix <- as.matrix(.data)
78+
79+
# Step 1: Sort each column
80+
sorted_data <- apply(data_matrix, 2, sort)
81+
82+
# Step 2: Calculate the mean of each row across sorted columns
83+
row_means <- rowMeans(sorted_data)
84+
85+
# Step 3: Replace each column's sorted values with the row means
86+
sorted_data <- matrix(
87+
row_means,
88+
nrow = nrow(sorted_data),
89+
ncol = ncol(sorted_data),
90+
byrow = TRUE
91+
)
92+
93+
# Step 4: Unsort the columns to their original order
94+
# Get rank index
95+
rank_indices <- apply(data_matrix, 2, order)
96+
97+
# Get duplicated rank indices, get the complete data for rows that have a
98+
# duplicated rank
99+
duplicated_ranks <- rank_indices[check_duplicate_rows(rank_indices), ]
100+
101+
# Get duplicated rank vector, get the row indices that have duplicated ranks
102+
duplicated_rank_vector <- which(check_duplicate_rows(rank_indices))
103+
104+
# Get duplicated rank data
105+
duplicated_rank_data <- data_matrix[duplicated_rank_vector, ]
106+
107+
# Normalize the data
108+
normalized_data <- matrix(nrow = nrow(data_matrix), ncol = ncol(data_matrix))
109+
for (i in 1:ncol(data_matrix)) {
110+
normalized_data[, i] <- sorted_data[rank_indices[, i], i]
111+
}
112+
113+
# Add Column Names to all items
114+
colnames(normalized_data) <- col_nms
115+
colnames(sorted_data) <- col_nms
116+
117+
# Should output be a tibble?
118+
if (.return_tibble) {
119+
normalized_data <- dplyr::as_tibble(normalized_data, .name_repair = "universal")
120+
row_means <- dplyr::as_tibble(row_means)
121+
#sorted_data <- dplyr::as_tibble(sorted_data)
122+
#rank_indices <- dplyr::as_tibble(rank_indices)
123+
duplicated_ranks <- dplyr::as_tibble(duplicated_ranks)
124+
duplicated_rank_data <- dplyr::as_tibble(duplicated_rank_data)
125+
duplicated_rank_vector <- dplyr::as_tibble(duplicated_rank_vector) |>
126+
dplyr::rename(row_index = value)
127+
}
128+
129+
# Return ----
130+
if (length(duplicated_rank_vector > 0)) {
131+
rlang::warn(
132+
message = "There are duplicated ranks the input data.",
133+
use_cli_format = TRUE
134+
)
135+
}
136+
137+
return(
138+
list(
139+
normalized_data = normalized_data,
140+
row_means = row_means,
141+
#sorted_data = sorted_data,
142+
#column_rank_indices = rank_indices,
143+
duplicated_ranks = duplicated_ranks,
144+
duplicated_rank_row_indices = duplicated_rank_vector,
145+
duplicated_rank_data = duplicated_rank_data
146+
)
147+
)
148+
}

man/check_duplicate_rows.Rd

Lines changed: 51 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/core_packages.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/create_splits.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/create_workflow_set.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/fast_classification_parsnip_spec_tbl.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/fast_regression_parsnip_spec_tbl.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)