spsanderson
diff --git a/‎NAMESPACE
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE
Lines changed: 2 additions & 0 deletions
diff --git a/‎NEWS.md
Lines changed: 2 additions & 1 deletion b/‎NEWS.md
Lines changed: 2 additions & 1 deletion
diff --git a/‎R/utils-check-dupe-rows.R
Lines changed: 33 additions & 0 deletions b/‎R/utils-check-dupe-rows.R
Lines changed: 33 additions & 0 deletions
diff --git a/‎R/utils-quantile-normalize.R
Lines changed: 148 additions & 0 deletions b/‎R/utils-quantile-normalize.R
Lines changed: 148 additions & 0 deletions
diff --git a/‎man/check_duplicate_rows.Rd
Lines changed: 51 additions & 0 deletions b/‎man/check_duplicate_rows.Rd
Lines changed: 51 additions & 0 deletions
diff --git a/‎man/core_packages.Rd
Lines changed: 3 additions & 1 deletion b/‎man/core_packages.Rd
Lines changed: 3 additions & 1 deletion
diff --git a/‎man/create_splits.Rd
Lines changed: 3 additions & 1 deletion b/‎man/create_splits.Rd
Lines changed: 3 additions & 1 deletion
diff --git a/‎man/create_workflow_set.Rd
Lines changed: 3 additions & 1 deletion b/‎man/create_workflow_set.Rd
Lines changed: 3 additions & 1 deletion
diff --git a/‎man/fast_classification_parsnip_spec_tbl.Rd
Lines changed: 3 additions & 1 deletion b/‎man/fast_classification_parsnip_spec_tbl.Rd
Lines changed: 3 additions & 1 deletion
diff --git a/‎man/fast_regression_parsnip_spec_tbl.Rd
Lines changed: 3 additions & 1 deletion b/‎man/fast_regression_parsnip_spec_tbl.Rd
Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,7 @@ export(":=")
 export(.data)
 export(as_label)
 export(as_name)
+export(check_duplicate_rows)
 export(core_packages)
 export(create_model_spec)
 export(create_splits)
@@ -35,6 +36,7 @@ export(make_regression_base_tbl)
 export(match_args)
 export(plot_regression_predictions)
 export(plot_regression_residuals)
+export(quantile_normalize)
 importFrom(magrittr,"%>%")
 importFrom(parsnip,cubist_rules)
 importFrom(parsnip,linear_reg)
 
@@ -4,7 +4,8 @@
 None
 
 ## New Features
-None
+1. Fix #242 - Add function `quantile_normalize()`.
+2. Fix #243 - Add function `check_duplicate_rows()`.
 
 ## Minor Fixes and Improvements
 None
 
@@ -0,0 +1,33 @@
+#' Check for Duplicate Rows in a Data Frame
+#'
+#' @family Utility
+#'
+#' @author Steven P. Sanderson II, MPH
+#'
+#' @description
+#' This function checks for duplicate rows in a data frame.
+#'
+#' @param .data A data frame.
+#' @return A logical vector indicating whether each row is a duplicate or not.
+#' @details This function checks for duplicate rows by comparing each row in the
+#' data frame to every other row. If a row is identical to another row, it is
+#' considered a duplicate.
+#' @examples
+#' data <- data.frame(
+#'   x = c(1, 2, 3, 1),
+#'   y = c(2, 3, 4, 2),
+#'   z = c(3, 2, 5, 3)
+#' )
+#'
+#' check_duplicate_rows(data)
+#'
+#' @seealso \code{\link{duplicated}}, \code{\link{anyDuplicated}}
+#'
+#' @name check_duplicate_rows
+NULL
+
+#' @rdname check_duplicate_rows
+#' @export
+check_duplicate_rows <- function(.data) {
+  !apply(.data, 1, function(x) length(unique(x)) == ncol(.data))
+}
@@ -0,0 +1,148 @@
+#' Perform quantile normalization on a numeric matrix/data.frame
+#'
+#' @family Utility
+#'
+#' @author Steven P. Sanderson II, MPH
+#'
+#' @description This function will perform quantile normalization on two or more
+#' distributions of equal length. Quantile normalization is a technique used to make the distribution of values across different samples
+#' more similar. It ensures that the distributions of values for each sample have the same quantiles.
+#' This function takes a numeric matrix as input and returns a quantile-normalized matrix.
+#'
+#' @param .data A numeric matrix where each column represents a sample.
+#' @param .return_tibble A logical value that determines if the output should be a tibble. Default is 'FALSE'.
+#'
+#' @return A list object that has the following:
+#' \enumerate{
+#'  \item A numeric matrix that has been quantile normalized.
+#'  \item The row means of the quantile normalized matrix.
+#'  \item The sorted data
+#'  \item The ranked indices
+#' }
+#'
+#' @details
+#' This function performs quantile normalization on a numeric matrix by following these steps:
+#' \enumerate{
+#'   \item Sort each column of the input matrix.
+#'   \item Calculate the mean of each row across the sorted columns.
+#'   \item Replace each column's sorted values with the row means.
+#'   \item Unsort the columns to their original order.
+#' }
+#'
+#' @examples
+#' # Create a sample numeric matrix
+#' data <- matrix(rnorm(20), ncol = 4)
+#'
+#' # Perform quantile normalization
+#' normalized_data <- quantile_normalize(data)
+#' normalized_data
+#'
+#' as.data.frame(normalized_data$normalized_data) |>
+#'   sapply(function(x) quantile(x, probs = seq(0, 1, 1 / 4)))
+#'
+#' quantile_normalize(data, .return_tibble = TRUE)
+#'
+#' @seealso
+#' \code{\link{rowMeans}}: Calculate row means.
+#'
+#' \code{\link{apply}}: Apply a function over the margins of an array.
+#'
+#' \code{\link{order}}: Order the elements of a vector.
+#'
+#' @name quantile_normalize
+NULL
+
+#' @export
+#' @rdname quantile_normalize
+# Perform quantile normalization on a numeric matrix 'data_matrix'
+quantile_normalize <- function(.data, .return_tibble = FALSE) {
+  # Checks ----
+  if (!inherits(.data, c("matrix", "data.frame"))) {
+    rlang::abort(
+      message = "The input data must be a numeric matrix or data.frame.",
+      use_cli_format = TRUE
+    )
+  }
+
+  if (!all(sapply(.data, is.numeric))) {
+    rlang::abort(
+      message = "The input data must be a numeric matrix or data.frame.",
+      use_cli_format = TRUE
+    )
+  }
+
+  # Data ----
+  # Get col_nms
+  col_nms <- colnames(.data)
+  data_matrix <- as.matrix(.data)
+
+  # Step 1: Sort each column
+  sorted_data <- apply(data_matrix, 2, sort)
+
+  # Step 2: Calculate the mean of each row across sorted columns
+  row_means <- rowMeans(sorted_data)
+
+  # Step 3: Replace each column's sorted values with the row means
+  sorted_data <- matrix(
+    row_means,
+    nrow = nrow(sorted_data),
+    ncol = ncol(sorted_data),
+    byrow = TRUE
+  )
+
+  # Step 4: Unsort the columns to their original order
+  # Get rank index
+  rank_indices <- apply(data_matrix, 2, order)
+
+  # Get duplicated rank indices, get the complete data for rows that have a
+  # duplicated rank
+  duplicated_ranks <- rank_indices[check_duplicate_rows(rank_indices), ]
+
+  # Get duplicated rank vector, get the row indices that have duplicated ranks
+  duplicated_rank_vector <- which(check_duplicate_rows(rank_indices))
+
+  # Get duplicated rank data
+  duplicated_rank_data <- data_matrix[duplicated_rank_vector, ]
+
+  # Normalize the data
+  normalized_data <- matrix(nrow = nrow(data_matrix), ncol = ncol(data_matrix))
+  for (i in 1:ncol(data_matrix)) {
+    normalized_data[, i] <- sorted_data[rank_indices[, i], i]
+  }
+
+  # Add Column Names to all items
+  colnames(normalized_data) <- col_nms
+  colnames(sorted_data) <- col_nms
+
+  # Should output be a tibble?
+  if (.return_tibble) {
+    normalized_data <- dplyr::as_tibble(normalized_data, .name_repair = "universal")
+    row_means <- dplyr::as_tibble(row_means)
+    #sorted_data <- dplyr::as_tibble(sorted_data)
+    #rank_indices <- dplyr::as_tibble(rank_indices)
+    duplicated_ranks <- dplyr::as_tibble(duplicated_ranks)
+    duplicated_rank_data <- dplyr::as_tibble(duplicated_rank_data)
+    duplicated_rank_vector <- dplyr::as_tibble(duplicated_rank_vector) |>
+      dplyr::rename(row_index = value)
+  }
+
+  # Return ----
+  if (length(duplicated_rank_vector > 0)) {
+    rlang::warn(
+      message = "There are duplicated ranks the input data.",
+      use_cli_format = TRUE
+    )
+  }
+
+  return(
+    list(
+      normalized_data = normalized_data,
+      row_means = row_means,
+      #sorted_data = sorted_data,
+      #column_rank_indices = rank_indices,
+      duplicated_ranks = duplicated_ranks,
+      duplicated_rank_row_indices = duplicated_rank_vector,
+      duplicated_rank_data = duplicated_rank_data
+    )
+  )
+}