|
| 1 | +#' Perform quantile normalization on a numeric matrix/data.frame |
| 2 | +#' |
| 3 | +#' @family Utility |
| 4 | +#' |
| 5 | +#' @author Steven P. Sanderson II, MPH |
| 6 | +#' |
| 7 | +#' @description This function will perform quantile normalization on two or more |
| 8 | +#' distributions of equal length. Quantile normalization is a technique used to make the distribution of values across different samples |
| 9 | +#' more similar. It ensures that the distributions of values for each sample have the same quantiles. |
| 10 | +#' This function takes a numeric matrix as input and returns a quantile-normalized matrix. |
| 11 | +#' |
| 12 | +#' @param .data A numeric matrix where each column represents a sample. |
| 13 | +#' @param .return_tibble A logical value that determines if the output should be a tibble. Default is 'FALSE'. |
| 14 | +#' |
| 15 | +#' @return A list object that has the following: |
| 16 | +#' \enumerate{ |
| 17 | +#' \item A numeric matrix that has been quantile normalized. |
| 18 | +#' \item The row means of the quantile normalized matrix. |
| 19 | +#' \item The sorted data |
| 20 | +#' \item The ranked indices |
| 21 | +#' } |
| 22 | +#' |
| 23 | +#' @details |
| 24 | +#' This function performs quantile normalization on a numeric matrix by following these steps: |
| 25 | +#' \enumerate{ |
| 26 | +#' \item Sort each column of the input matrix. |
| 27 | +#' \item Calculate the mean of each row across the sorted columns. |
| 28 | +#' \item Replace each column's sorted values with the row means. |
| 29 | +#' \item Unsort the columns to their original order. |
| 30 | +#' } |
| 31 | +#' |
| 32 | +#' @examples |
| 33 | +#' # Create a sample numeric matrix |
| 34 | +#' data <- matrix(rnorm(20), ncol = 4) |
| 35 | +#' |
| 36 | +#' # Perform quantile normalization |
| 37 | +#' normalized_data <- quantile_normalize(data) |
| 38 | +#' normalized_data |
| 39 | +#' |
| 40 | +#' as.data.frame(normalized_data$normalized_data) |> |
| 41 | +#' sapply(function(x) quantile(x, probs = seq(0, 1, 1 / 4))) |
| 42 | +#' |
| 43 | +#' quantile_normalize(data, .return_tibble = TRUE) |
| 44 | +#' |
| 45 | +#' @seealso |
| 46 | +#' \code{\link{rowMeans}}: Calculate row means. |
| 47 | +#' |
| 48 | +#' \code{\link{apply}}: Apply a function over the margins of an array. |
| 49 | +#' |
| 50 | +#' \code{\link{order}}: Order the elements of a vector. |
| 51 | +#' |
| 52 | +#' @name quantile_normalize |
| 53 | +NULL |
| 54 | + |
| 55 | +#' @export |
| 56 | +#' @rdname quantile_normalize |
| 57 | +# Perform quantile normalization on a numeric matrix 'data_matrix' |
| 58 | +quantile_normalize <- function(.data, .return_tibble = FALSE) { |
| 59 | + # Checks ---- |
| 60 | + if (!inherits(.data, c("matrix", "data.frame"))) { |
| 61 | + rlang::abort( |
| 62 | + message = "The input data must be a numeric matrix or data.frame.", |
| 63 | + use_cli_format = TRUE |
| 64 | + ) |
| 65 | + } |
| 66 | + |
| 67 | + if (!all(sapply(.data, is.numeric))) { |
| 68 | + rlang::abort( |
| 69 | + message = "The input data must be a numeric matrix or data.frame.", |
| 70 | + use_cli_format = TRUE |
| 71 | + ) |
| 72 | + } |
| 73 | + |
| 74 | + # Data ---- |
| 75 | + # Get col_nms |
| 76 | + col_nms <- colnames(.data) |
| 77 | + data_matrix <- as.matrix(.data) |
| 78 | + |
| 79 | + # Step 1: Sort each column |
| 80 | + sorted_data <- apply(data_matrix, 2, sort) |
| 81 | + |
| 82 | + # Step 2: Calculate the mean of each row across sorted columns |
| 83 | + row_means <- rowMeans(sorted_data) |
| 84 | + |
| 85 | + # Step 3: Replace each column's sorted values with the row means |
| 86 | + sorted_data <- matrix( |
| 87 | + row_means, |
| 88 | + nrow = nrow(sorted_data), |
| 89 | + ncol = ncol(sorted_data), |
| 90 | + byrow = TRUE |
| 91 | + ) |
| 92 | + |
| 93 | + # Step 4: Unsort the columns to their original order |
| 94 | + # Get rank index |
| 95 | + rank_indices <- apply(data_matrix, 2, order) |
| 96 | + |
| 97 | + # Get duplicated rank indices, get the complete data for rows that have a |
| 98 | + # duplicated rank |
| 99 | + duplicated_ranks <- rank_indices[check_duplicate_rows(rank_indices), ] |
| 100 | + |
| 101 | + # Get duplicated rank vector, get the row indices that have duplicated ranks |
| 102 | + duplicated_rank_vector <- which(check_duplicate_rows(rank_indices)) |
| 103 | + |
| 104 | + # Get duplicated rank data |
| 105 | + duplicated_rank_data <- data_matrix[duplicated_rank_vector, ] |
| 106 | + |
| 107 | + # Normalize the data |
| 108 | + normalized_data <- matrix(nrow = nrow(data_matrix), ncol = ncol(data_matrix)) |
| 109 | + for (i in 1:ncol(data_matrix)) { |
| 110 | + normalized_data[, i] <- sorted_data[rank_indices[, i], i] |
| 111 | + } |
| 112 | + |
| 113 | + # Add Column Names to all items |
| 114 | + colnames(normalized_data) <- col_nms |
| 115 | + colnames(sorted_data) <- col_nms |
| 116 | + |
| 117 | + # Should output be a tibble? |
| 118 | + if (.return_tibble) { |
| 119 | + normalized_data <- dplyr::as_tibble(normalized_data, .name_repair = "universal") |
| 120 | + row_means <- dplyr::as_tibble(row_means) |
| 121 | + #sorted_data <- dplyr::as_tibble(sorted_data) |
| 122 | + #rank_indices <- dplyr::as_tibble(rank_indices) |
| 123 | + duplicated_ranks <- dplyr::as_tibble(duplicated_ranks) |
| 124 | + duplicated_rank_data <- dplyr::as_tibble(duplicated_rank_data) |
| 125 | + duplicated_rank_vector <- dplyr::as_tibble(duplicated_rank_vector) |> |
| 126 | + dplyr::rename(row_index = value) |
| 127 | + } |
| 128 | + |
| 129 | + # Return ---- |
| 130 | + if (length(duplicated_rank_vector > 0)) { |
| 131 | + rlang::warn( |
| 132 | + message = "There are duplicated ranks the input data.", |
| 133 | + use_cli_format = TRUE |
| 134 | + ) |
| 135 | + } |
| 136 | + |
| 137 | + return( |
| 138 | + list( |
| 139 | + normalized_data = normalized_data, |
| 140 | + row_means = row_means, |
| 141 | + #sorted_data = sorted_data, |
| 142 | + #column_rank_indices = rank_indices, |
| 143 | + duplicated_ranks = duplicated_ranks, |
| 144 | + duplicated_rank_row_indices = duplicated_rank_vector, |
| 145 | + duplicated_rank_data = duplicated_rank_data |
| 146 | + ) |
| 147 | + ) |
| 148 | +} |
0 commit comments