From 9affd89e56dcdc4fda40a8646276b57b94f2253c Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 13 Jun 2023 12:05:38 -0700 Subject: [PATCH 01/17] Update motrpac_bic_functions.R Fix separation of Specie into protein_id and sequence columns when PTMs are denoted by '@' --- R/motrpac_bic_funtions.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R/motrpac_bic_funtions.R b/R/motrpac_bic_funtions.R index 88e731c..46b971f 100644 --- a/R/motrpac_bic_funtions.R +++ b/R/motrpac_bic_funtions.R @@ -42,7 +42,6 @@ #' @importFrom dplyr select inner_join left_join mutate %>% case_when rename #' group_by summarize arrange across #' @importFrom tibble rownames_to_column -#' @importFrom tidyr separate #' #' @name motrpac_bic_output #' @@ -172,8 +171,11 @@ make_rii_peptide_gl <- function(msnid, # Feature data feature_data <- crosstab %>% dplyr::select(Specie) %>% - tidyr::separate(col = Specie, into = c("protein_id", "sequence"), - sep = "@", remove = FALSE) %>% + ## Old code: does not work if PTMs are denoted by "@" + # tidyr::separate(col = Specie, into = c("protein_id", "sequence"), + # sep = "@", remove = FALSE) %>% + mutate(protein_id = sub("(^[^@]*)@(.*)", "\\1", Specie), + sequence = sub("(^[^@]*)@(.*)", "\\2", Specie)) %>% mutate(organism_name = org_name) if (annotation == "REFSEQ") { @@ -492,8 +494,11 @@ make_results_ratio_ph <- function(msnid, ## Create RII peptide table feature_data <- crosstab %>% select(Specie) %>% - tidyr::separate(Specie, into = c("protein_id", "ptm_id"), - sep = "@", remove = FALSE) %>% + ## Old code: does not work if PTMs are denoted by "@" + # tidyr::separate(Specie, into = c("protein_id", "ptm_id"), + # sep = "@", remove = FALSE) %>% + mutate(protein_id = sub("(^[^@]*)@(.*)", "\\1", Specie), + ptm_id = sub("(^[^@]*)@(.*)", "\\2", Specie)) %>% mutate(organism_name = org_name) if (annotation == "REFSEQ") { From 5dcf498a93af4b8815dd17713c776a866d89d335 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 13 Jun 2023 12:07:25 -0700 Subject: [PATCH 02/17] Update read_study_design.R Include more robust checks for study design tables --- R/read_study_design.R | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/R/read_study_design.R b/R/read_study_design.R index 294f3ca..c871e87 100644 --- a/R/read_study_design.R +++ b/R/read_study_design.R @@ -13,6 +13,7 @@ #' #' @importFrom readr read_tsv #' @importFrom dplyr filter select rename %>% +#' @importFrom data.table data.table #' #' @export read_study_design #' @@ -63,10 +64,14 @@ read_study_design <- function(path_to_study_design, required_cols_i <- required_cols[[name_i]] col_in_tbl <- required_cols_i %in% colnames(tbl) + if (!all(col_in_tbl)) { - message(sprintf("\nRequired column(s) not found in the '%s' file: ", pttrn), + message(sprintf("\nRequired column(s) not found in the '%s' file: ", + pttrn), paste(required_cols_i[!col_in_tbl], collapse = ", ")) - stop(sprintf("Incorrect column names or missing columns in the '%s' study design table.", name_i)) + stop(sprintf( + "Incorrect column names or missing columns in the '%s' study design table.", + name_i)) } return(tbl) @@ -74,19 +79,36 @@ read_study_design <- function(path_to_study_design, names(study_design) <- names(required_cols) - # Check for duplicates if (any(duplicated(study_design$fractions$Dataset))) { - stop(sprintf("Duplicate datasets in '%sfractions.txt'")) + stop(sprintf("Duplicate Dataset entries in '%sfractions.txt'", prefix)) } - if (any(duplicated(study_design$samples$MeasurementName[ - !is.na(study_design$samples$MeasurementName)]))) { - stop("Duplicate sample names in '%ssamples.txt'") + if (any(duplicated(study_design$samples$MeasurementName, + incomparables = NA))) { + stop(sprintf("Duplicate MeasurementName entries in '%ssamples.txt'", + prefix)) } if (!setequal(study_design$fractions$PlexID, study_design$samples$PlexID)) { - stop(sprintf("Plex IDs in '%sfractions.txt' and '%ssamples.txt' do not match.", - prefix, prefix)) + stop(sprintf( + "Plex IDs in '%sfractions.txt' and '%ssamples.txt' do not match.", + prefix, prefix)) + } + + # Check for valid TMT layout + # Set reporter_converter to prevent "no visible binding" note + reporter_converter <- PlexedPiper::reporter_converter + for (i in seq_along(reporter_converter)) { + if (setequal(study_design$samples$ReporterName, + reporter_converter[[i]]$ReporterIon)) { + converter <- data.table(reporter_converter[[i]]) + break + } + } + + if (missing(converter)) { + stop(paste("No reporter ion converter tables match reporter ions in", + "samples$ReporterName")) } return(study_design) From 355387c01f4b034ce9768809b70c0264d237d5e8 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 13 Jun 2023 12:09:43 -0700 Subject: [PATCH 03/17] Update run_plexedpiper.R Update how results are written to tables (set na to empty strings) and save RData files with compression --- R/run_plexedpiper.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/run_plexedpiper.R b/R/run_plexedpiper.R index de2e186..7278cac 100644 --- a/R/run_plexedpiper.R +++ b/R/run_plexedpiper.R @@ -298,7 +298,8 @@ run_plexedpiper <- function(msgf_output_folder, file = file.path(output_folder, filename), sep = "\t", row.names = FALSE, - quote = FALSE) + quote = FALSE, + na = "") if(verbose) { message("- RII file save to ", file.path(output_folder, filename)) } @@ -308,7 +309,8 @@ run_plexedpiper <- function(msgf_output_folder, file = file.path(output_folder, filename), sep = "\t", row.names = FALSE, - quote = FALSE) + quote = FALSE, + na = "") if(verbose) { message("- RATIO file save to ", file.path(output_folder, filename)) } @@ -316,7 +318,7 @@ run_plexedpiper <- function(msgf_output_folder, } if (save_env) { fileenv <- paste0(file_prefix, "-env.RData") - save.image(file = file.path(output_folder, fileenv)) + save.image(file = file.path(output_folder, fileenv), compress = TRUE) if(verbose) { message("- R environment saved to ", file.path(output_folder, fileenv)) } From 5ae6d38ccd17782c7f0b26f2d000e837a8040ada Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 13 Jun 2023 21:36:16 -0700 Subject: [PATCH 04/17] Minor update to run_plexedpiper documentation --- man/run_plexedpiper.Rd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/man/run_plexedpiper.Rd b/man/run_plexedpiper.Rd index 4e476c2..f8215cd 100644 --- a/man/run_plexedpiper.Rd +++ b/man/run_plexedpiper.Rd @@ -29,8 +29,8 @@ run_plexedpiper( \item{msgf_output_folder}{(character) Path to MS-GF+ results folder(s).} \item{fasta_file}{(character) Path to FASTA file. If multiple MS-GF+ results -folders are provided, all should be searched against the same -protein database.} +folders are provided, all should be searched against the same protein +database.} \item{masic_output_folder}{(character) MASIC results folder(s).} @@ -91,7 +91,7 @@ and Ratio Results tables used by the MoTrPAC Bioinformatics Center (BIC). } \examples{ \dontrun{ -# Example with pseudo-paths +# Phosphoproteomics example with pseudo-paths results <- run_plexedpiper(msgf_output_folder = "~/path/to/msgfplus/", fasta_file = "~/path/to/fasta/sequence.fasta", masic_output_folder = "~/path/to/masic-results/", @@ -108,4 +108,5 @@ results <- run_plexedpiper(msgf_output_folder = "~/path/to/msgfplus/", return_results = TRUE, verbose = TRUE) } + } From 99b0dfcd25004a5ee29a5cd0f4335cb0a091e08c Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 13 Jun 2023 21:43:21 -0700 Subject: [PATCH 05/17] Update run_plexedpiper Version change from 0.4.1 to 0.5.0. Update run_plexedpiper and its helper functions to allow for the processing of redox proteomics data. Also increase the minimum MSnID version number to prevent 'function not in namespace' errors caused by installation of the Bioconductor version. --- DESCRIPTION | 6 ++--- NAMESPACE | 1 - R/motrpac_bic_funtions.R | 44 +++++++++++++++++++------------ R/run_plexedpiper.R | 56 +++++++++++++++++++++++++++++----------- 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index baf763f..6d2c62c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: PlexedPiper Type: Package Title: Pipeline for isobaric quantification -Version: 0.4.1 -Date: 2023-02-07 +Version: 0.5.0 +Date: 2023-06-14 Author: Vladislav Petyuk vladislav.petyuk@pnnl.gov Maintainer: Vladislav Petyuk Description: Pipeline for isobaric quantification. @@ -11,7 +11,7 @@ Encoding: UTF-8 LazyData: true RoxygenNote: 7.2.3 Depends: - MSnID (>= 1.18.1) + MSnID (>= 1.25.2) Imports: Biostrings, data.table, diff --git a/NAMESPACE b/NAMESPACE index a6e85c1..887a116 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,7 +88,6 @@ importFrom(readr,read_tsv) importFrom(tibble,rownames_to_column) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) -importFrom(tidyr,separate) importFrom(tidyselect,where) importFrom(utils,read.delim) importFrom(utils,read.table) diff --git a/R/motrpac_bic_funtions.R b/R/motrpac_bic_funtions.R index 46b971f..64fbc45 100644 --- a/R/motrpac_bic_funtions.R +++ b/R/motrpac_bic_funtions.R @@ -409,13 +409,17 @@ make_rii_peptide_ph <- function(msnid, ## Additional info from MS/MS ids <- psms(msnid) %>% - select(protein_id = accession, - sequence = peptide, - ptm_id = SiteID, - flanking_sequence = sequenceWindow, - redundant_ids = redundantAccessions, - MSGFDB_SpecEValue, - maxAScore) %>% + dplyr::select(protein_id = accession, + sequence = peptide, + ptm_id = SiteID, + flanking_sequence = sequenceWindow, + redundant_ids = redundantAccessions, + MSGFDB_SpecEValue, + any_of(c("maxAScore"))) %>% + distinct() %>% + # Redox results do not use AScore, so we need to account for that + rowwise() %>% + mutate(maxAScore = ifelse("maxAScore" %in% names(.), maxAScore, NA)) %>% group_by(protein_id, sequence, ptm_id, flanking_sequence, redundant_ids) %>% summarize(peptide_score = min(MSGFDB_SpecEValue), @@ -517,15 +521,20 @@ make_results_ratio_ph <- function(msnid, ## Additional info from MS/MS ids <- psms(msnid) %>% - select(protein_id = accession, - sequence = peptide, - ptm_id = SiteID, - redundant_ids = noninferableProteins, - flanking_sequence = sequenceWindow, - MSGFDB_SpecEValue, - maxAScore) %>% + dplyr::select(protein_id = accession, + sequence = peptide, + ptm_id = SiteID, + redundant_ids = noninferableProteins, + flanking_sequence = sequenceWindow, + MSGFDB_SpecEValue, + any_of(c("maxAScore"))) %>% + distinct() %>% + # Redox results do not use AScore, so we need to account for that + rowwise() %>% + mutate(maxAScore = ifelse("maxAScore" %in% names(.), maxAScore, NA)) %>% # group at peptide level to calculate peptide score, confident score - group_by(protein_id, sequence, ptm_id, flanking_sequence, redundant_ids) %>% + group_by(protein_id, sequence, ptm_id, + flanking_sequence, redundant_ids) %>% summarize(peptide_score = min(MSGFDB_SpecEValue), confident_score = max(maxAScore)) %>% # regroup at siteID level and recalculate ptm score @@ -533,7 +542,8 @@ make_results_ratio_ph <- function(msnid, summarize(ptm_score = min(peptide_score), confident_score = max(confident_score)) %>% mutate(confident_site = case_when(confident_score >= 17 ~ TRUE, - confident_score < 17 ~ FALSE), + confident_score < 17 ~ FALSE, + TRUE ~ NA), is_contaminant = grepl("Contaminant", protein_id)) results_ratio <- feature_data %>% @@ -583,7 +593,7 @@ assess_noninferable_proteins <- function(msnid, collapse="|") { res <- res %>% group_by(peptideSignature) %>% summarize(noninferableProteins = paste(accession, collapse=collapse)) %>% - left_join(res, by="peptideSignature") %>% + left_join(res, by="peptideSignature", multiple = "all") %>% dplyr::select(-peptideSignature) psms(msnid) <- left_join(psms(msnid), res, by = "accession") diff --git a/R/run_plexedpiper.R b/R/run_plexedpiper.R index 7278cac..64108c4 100644 --- a/R/run_plexedpiper.R +++ b/R/run_plexedpiper.R @@ -6,8 +6,8 @@ #' #' @param msgf_output_folder (character) Path to MS-GF+ results folder(s). #' @param fasta_file (character) Path to FASTA file. If multiple MS-GF+ results -#' folders are provided, all should be searched against the same -#' protein database. +#' folders are provided, all should be searched against the same protein +#' database. #' @param masic_output_folder (character) MASIC results folder(s). #' @param ascore_output_folder (character) AScore results folder(s). #' @param proteomics (character) Either "pr" - proteomics, "ph" - @@ -44,7 +44,6 @@ #' @return (list) If `return_results` is `TRUE`, it returns list with ratio and #' RII data frames. #' -#' #' @md #' #' @importFrom Biostrings readAAStringSet @@ -58,7 +57,7 @@ #' @importFrom purrr reduce map #' #' @examples \dontrun{ -#' # Example with pseudo-paths +#' # Phosphoproteomics example with pseudo-paths #' results <- run_plexedpiper(msgf_output_folder = "~/path/to/msgfplus/", #' fasta_file = "~/path/to/fasta/sequence.fasta", #' masic_output_folder = "~/path/to/masic-results/", @@ -75,6 +74,7 @@ #' return_results = TRUE, #' verbose = TRUE) #' } +#' #' @export @@ -97,12 +97,25 @@ run_plexedpiper <- function(msgf_output_folder, return_results = FALSE, verbose = TRUE) { + on.exit(invisible(gc())) + + # Check input annotation <- toupper(annotation) annotation <- match.arg(annotation, choices = c("REFSEQ", "UNIPROT", "GENCODE")) + # global proteomics, phosphorylation, acetylation, ubiquitination, + # oxidation (redox) + # TODO: change pr to g or gl + proteomics <- match.arg(proteomics, + choices = c("pr", "ph", "ac", "ub", "ox")) + + require_ascore <- proteomics %in% c("ph", "ac", "ub") + if (require_ascore & missing(ascore_output_folder)) { + stop("ascore_output_folder not provided.") + } - if( is.null(write_results_to_file) & is.null(return_results) ){ - stop("\nProvide either or or both. Both cannot be FALSE.") + if (all(is.null(c(write_results_to_file, return_results)))) { + stop("\nAt least write_results_to_file or return_results must be TRUE.") } if (verbose) { @@ -112,7 +125,7 @@ run_plexedpiper <- function(msgf_output_folder, message("- Annotation: \"", annotation, "\"") } - if(is.null(file_prefix)){ + if (is.null(file_prefix)) { file_prefix <- paste0("MSGFPLUS_", toupper(proteomics)) if(!is.null(global_results)){ file_prefix <- paste0(file_prefix,"-ip") @@ -133,10 +146,7 @@ run_plexedpiper <- function(msgf_output_folder, }) %>% rbindlist(fill = TRUE) - if (!is.null(ascore_output_folder)) { - ascore <- map(ascore_output_folder, read_AScore_results) %>% - rbindlist(fill = TRUE) - } + invisible(gc()) if (verbose) {message("- Filtering MASIC results.")} masic_data <- map(masic_output_folder, function(masic_folder) { @@ -144,6 +154,8 @@ run_plexedpiper <- function(msgf_output_folder, filter_masic_data(0.5, 0) }) + invisible(gc()) + fst <- Biostrings::readAAStringSet(fasta_file) names(fst) <- sub(" .*", "", names(fst)) # extract first word @@ -153,18 +165,27 @@ run_plexedpiper <- function(msgf_output_folder, msnid <- correct_peak_selection(msnid) } - if (proteomics != "pr") { + if (require_ascore) { if (verbose) {message(" + Select best PTM location by AScore")} + ascore <- map(ascore_output_folder, read_AScore_results) %>% + rbindlist(fill = TRUE) msnid <- best_PTM_location_by_ascore(msnid, ascore) + } - if (verbose) message(" + Apply PTM filter") + if (proteomics != "pr") { + if (verbose) {message(" + Apply PTM filter")} if (proteomics == "ph") { reg.expr <- "grepl('\\\\*', peptide)" } else if (proteomics == "ac") { reg.expr <- "grepl('#', peptide)" } else if (proteomics == "ub") { + # "@" and "#" both mean ubiquitination msnid$peptide <- gsub("@", "#", msnid$peptide) reg.expr <- "grepl('#', peptide)" + } else if (proteomics == "ox") { + # "@" and "#" do not mean the same thing, so we cannot replace + # the former with the latter. + reg.expr <- "grepl('\\\\@', peptide)" } msnid <- apply_filter(msnid, reg.expr) } @@ -172,6 +193,8 @@ run_plexedpiper <- function(msgf_output_folder, if (verbose) {message(" + Peptide-level FDR filter")} msnid <- filter_msgf_data(msnid, level = "peptide", fdr.max = 0.01) + invisible(gc()) + if (proteomics == "pr") { if (verbose) {message(" + Protein-level FDR filter")} msnid <- compute_num_peptides_per_1000aa(msnid, path_to_FASTA = fasta_file) @@ -238,8 +261,8 @@ run_plexedpiper <- function(msgf_output_folder, mod_char <- "*" } else if (proteomics %in% c("ac", "ub")) { mod_char <- "#" - } else { - stop("Proteomics variable not supported.") + } else if (proteomics == "ox") { + mod_char <- "@" } msnid <- map_mod_sites(object = msnid, @@ -253,6 +276,8 @@ run_plexedpiper <- function(msgf_output_folder, msnid <- extract_sequence_window(msnid, fasta = fst) } + invisible(gc()) + args <- list(msnid = msnid, fractions = study_design$fractions, samples = study_design$samples, @@ -316,6 +341,7 @@ run_plexedpiper <- function(msgf_output_folder, } } } + if (save_env) { fileenv <- paste0(file_prefix, "-env.RData") save.image(file = file.path(output_folder, fileenv), compress = TRUE) From 696fc103551fbf516e4a822aa337a50e02eb4998 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:44:12 -0700 Subject: [PATCH 06/17] Update tests Update test-remap_accessions_refseq_to_gene_fasta to use specific conversion table to ensure reproducibility. --- tests/testthat.R | 1 + .../test-remap_accessions_refseq_to_gene_fasta.R | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/testthat.R b/tests/testthat.R index c6ab72b..07bf85f 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,5 @@ library(testthat) library(PlexedPiper) +library(PlexedPiperTestData) test_check("PlexedPiper") diff --git a/tests/testthat/test-remap_accessions_refseq_to_gene_fasta.R b/tests/testthat/test-remap_accessions_refseq_to_gene_fasta.R index df759d2..5c6fda5 100644 --- a/tests/testthat/test-remap_accessions_refseq_to_gene_fasta.R +++ b/tests/testthat/test-remap_accessions_refseq_to_gene_fasta.R @@ -1,5 +1,11 @@ test_that("remap_accessions_refseq_to_gene_fasta works", { library(Biostrings) + library(MSnID) + + conv_tbl <- fetch_conversion_table( + organism_name = "Rattus norvegicus", + from = "REFSEQ", to = "SYMBOL", + snapshot_date = "2023-04-24") path_to_FASTA <- system.file( "extdata/Rattus_norvegicus_NCBI_RefSeq_2018-04-10.fasta.gz", @@ -10,15 +16,16 @@ test_that("remap_accessions_refseq_to_gene_fasta works", { path_to_FASTA <- file.path(temp_dir, basename(path_to_FASTA)) suppressMessages( path_to_new_FASTA <- remap_accessions_refseq_to_gene_fasta( - path_to_FASTA, organism_name = "Rattus norvegicus" - ) + path_to_FASTA, organism_name = "Rattus norvegicus", + conversion_table = conv_tbl) ) fst <- readAAStringSet(path_to_new_FASTA) # gene symbols expect_equal(head(names(fst)), - c("A1bg", "A1cf", "A2m", "A3galt2", "A4galt", "A4gnt")) + c("1700013D24Rik", "A1bg", "A1cf", + "A2m", "A3galt2", "A4galt")) expect_equal(head(width(fst)), - c(513, 594, 1472, 339, 360, 351)) + c(120, 513, 594, 1472, 339, 360)) # Clean up temp_files <- list.files(temp_dir, full.names = TRUE) From 220269100751ea4564767d2dbfeb99fed5d1f1e1 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:45:33 -0700 Subject: [PATCH 07/17] Update vignettes Replace get_study_design_by_dataset_package with read_study_design_from_DMS. --- vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd | 2 +- vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd | 2 +- vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd index 2d43bc7..942eff9 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd @@ -149,7 +149,7 @@ masic_data <- filter_masic_data(masic_data, 0.5, 0) # Fetch study design tables ```{r study_design} -study_design_tables <- get_study_design_by_dataset_package(data_package_num) +study_design_tables <- read_study_design_from_DMS(data_package_num) fractions <- study_design_tables$fractions samples <- study_design_tables$samples references <- study_design_tables$references diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd index 99ffdd9..90dbc3e 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd @@ -142,7 +142,7 @@ masic_data <- filter_masic_data(masic_data, 0.5, 0) # Fetch study design tables ```{r study_design} -study_design_tables <- get_study_design_by_dataset_package(data_package_num) +study_design_tables <- read_study_design_from_DMS(data_package_num) fractions <- study_design_tables$fractions samples <- study_design_tables$samples references <- study_design_tables$references diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd index bbcc6bb..000fdb8 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd @@ -142,7 +142,7 @@ masic_data <- filter_masic_data(masic_data, 0.5, 0) # Fetch study design tables ```{r study_design} -study_design_tables <- get_study_design_by_dataset_package(data_package_num) +study_design_tables <- read_study_design_from_DMS(data_package_num) fractions <- study_design_tables$fractions samples <- study_design_tables$samples references <- study_design_tables$references From fabb3e30bbad25455717d56bfa4b0039775ba2a0 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:51:23 -0700 Subject: [PATCH 08/17] Update filter_msfragger_data documentation Capitalize MSFragger. Reflow text. Remove ... parameter, which was a carry-over from filter_msgf_data. Align code. --- R/filter_msfragger_data.R | 122 +++++++++++++++++++---------------- man/filter_msfragger_data.Rd | 27 ++++---- 2 files changed, 78 insertions(+), 71 deletions(-) diff --git a/R/filter_msfragger_data.R b/R/filter_msfragger_data.R index bf50a1b..1aefa86 100644 --- a/R/filter_msfragger_data.R +++ b/R/filter_msfragger_data.R @@ -1,46 +1,44 @@ -#' Filtering msfragger Data +#' Filtering MSFragger Data +#' +#' Filtering MSFragger data. In this implementation, the peptide-level filter +#' optimizes both ppm and one of Expectation or PeptideProphet Probability +#' thresholds to achieve maximum number of peptide identifications within a +#' given FDR constraint. #' -#' Filtering msfragger data. In this implementation, the peptide-level filter -#' optimizes both ppm and one of Expectation or PeptideProphet Probability thresholds -#' to achieve maximum number of peptide identifications within a given FDR constraint. #' The accession-level filter optimizes based on `peptides_per_1000aa`, so #' \code{\link{compute_num_peptides_per_1000aa}} must be used first. #' #' @md #' -#' @param msnid (MSnID object) collated msfragger output +#' @param msnid (MSnID object) collated MSFragger output #' @param fdr.max (numeric) Maximum acceptable FDR. Default is 0.01 (1%). #' @param level (character) Level at which to perform FDR filter. The name of a #' column in `psms(msnid)`. Currently, only `"peptide"` or `"accession"` are -#' supported. The added level `SiteID` makes sense only for PTM data and +#' supported. The added level `"SiteID"` makes sense only for PTM data and #' first requires mapping of the modification site using -#' `MSnID::map_mod_sites`. -#' @param filtering_criterion (character) One of "evalue" which is -#' expectation value or "pp_prob" - peptide prophet probability. Default is -#' "pp_prob". +#' `MSnID::map_mod_sites`. +#' @param filtering_criterion (character) One of `"evalue"` which is expectation +#' value or `"pp_prob"` - peptide prophet probability. Default is "pp_prob". #' @param n.iter.grid (numeric) number of grid-distributed evaluation points. #' @param n.iter.nm (numeric) number of iterations for Nelder-Mead optimization #' algorithm. -#' @param ... arguments passed to `filter_msfragger_data`. #' -#' @return (MSnID object) filtered msfragger output +#' @return (MSnID object) filtered MSFragger output #' -#' @seealso -#' \code{\link[MSnID]{MSnIDFilter}} -#' \code{\link[MSnID]{optimize_filter}} -#' \code{\link[MSnID]{apply_filter}} +#' @seealso \code{\link[MSnID]{MSnIDFilter}} +#' \code{\link[MSnID]{optimize_filter}} \code{\link[MSnID]{apply_filter}} #' -#' @importFrom MSnID MSnIDFilter optimize_filter -#' mass_measurement_error apply_filter +#' @importFrom MSnID MSnIDFilter optimize_filter mass_measurement_error +#' apply_filter #' @export filter_msfragger_data <- function(msnid, - level, - filtering_criterion = c("pp_prob","evalue"), - fdr.max=0.01, - n.iter.grid=500, - n.iter.nm=100){ + level, + filtering_criterion = c("pp_prob", "evalue"), + fdr.max=0.01, + n.iter.grid=500, + n.iter.nm=100){ # Clean up on exit on.exit(rm(list = ls())) @@ -49,18 +47,18 @@ filter_msfragger_data <- function(msnid, # Check input level <- match.arg(level, choices = c("peptide", "accession", "SiteID")) filtering_criterion <- match.arg(filtering_criterion) - - if(level == "SiteID" & !("SiteID" %in% names(msnid))) - stop("Column 'SiteID' is not in the MSnID object. Please map the PTMs first.") + + if (level == "SiteID" & !("SiteID" %in% names(msnid))) { + stop("Column 'SiteID' is not in the MSnID object. Please map the PTMs first.") + } keep_cols <- c(level, "isDecoy") # columns to calculate FDR # Create MSnID of minimum size suppressMessages(msnid_small <- MSnID()) - + # Setup if (level == "accession") { - # Add filter criteria column keep_cols <- c(keep_cols, "peptides_per_1000aa") msnid_small@psms <- unique(msnid@psms[, keep_cols, with = FALSE]) @@ -69,36 +67,41 @@ filter_msfragger_data <- function(msnid, filtObj <- MSnIDFilter(msnid_small) filtObj$peptides_per_1000aa <- list(comparison = ">", threshold = 1) method <- "SANN" + } else { - #Choose filter object probability value - if (filtering_criterion == "evalue") { - msnid$msmsScore <- -log10(msnid$Expectation) - } - if (filtering_criterion == "pp_prob") { - msnid$msmsScore <- msnid$`PeptideProphet Probability` - } - # Create columns for peptide filtering - # Can not use data.table syntax if the msnid has been modified at all, - # as it results in the "Invalid .internal.selfref" warning and - # columns not being created. - msnid$absParentMassErrorPPM <- abs(mass_measurement_error(msnid)) - - # Add filter criteria columns - keep_cols <- c(keep_cols, "msmsScore", "absParentMassErrorPPM") - msnid_small@psms <- unique(msnid@psms[, keep_cols, with = FALSE]) - - # Create filter object - filtObj <- MSnIDFilter(msnid_small) - filtObj$absParentMassErrorPPM <- list(comparison = "<", threshold = 10) - if (filtering_criterion == "evalue") { - filtObj$msmsScore <- list(comparison = ">", threshold = 2) - } - if (filtering_criterion == "pp_prob") { - filtObj$msmsScore <- list(comparison = ">", threshold = 0.99) - } - method <- "Nelder-Mead" + #Choose filter object probability value + if (filtering_criterion == "evalue") { + msnid$msmsScore <- -log10(msnid$Expectation) + } + + if (filtering_criterion == "pp_prob") { + msnid$msmsScore <- msnid$`PeptideProphet Probability` + } + + # Create columns for peptide filtering + # Can not use data.table syntax if the msnid has been modified at all, + # as it results in the "Invalid .internal.selfref" warning and + # columns not being created. + msnid$absParentMassErrorPPM <- abs(mass_measurement_error(msnid)) + + # Add filter criteria columns + keep_cols <- c(keep_cols, "msmsScore", "absParentMassErrorPPM") + msnid_small@psms <- unique(msnid@psms[, keep_cols, with = FALSE]) + + # Create filter object + filtObj <- MSnIDFilter(msnid_small) + filtObj$absParentMassErrorPPM <- list(comparison = "<", threshold = 10) + + if (filtering_criterion == "evalue") { + filtObj$msmsScore <- list(comparison = ">", threshold = 2) + } + + if (filtering_criterion == "pp_prob") { + filtObj$msmsScore <- list(comparison = ">", threshold = 0.99) + } + + method <- "Nelder-Mead" } - # step 1 filtObj.grid <- optimize_filter(filtObj, @@ -107,6 +110,7 @@ filter_msfragger_data <- function(msnid, method="Grid", level=level, n.iter=n.iter.grid) + # step 2 filtObj.nm <- optimize_filter(filtObj.grid, msnid_small, @@ -114,5 +118,9 @@ filter_msfragger_data <- function(msnid, method=method, level=level, n.iter=n.iter.nm) - return(apply_filter(msnid, filtObj.nm)) + + msnid <- apply_filter(msnid, filtObj.nm) + + return(msnid) } + diff --git a/man/filter_msfragger_data.Rd b/man/filter_msfragger_data.Rd index 1721b3d..ecd138f 100644 --- a/man/filter_msfragger_data.Rd +++ b/man/filter_msfragger_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/filter_msfragger_data.R \name{filter_msfragger_data} \alias{filter_msfragger_data} -\title{Filtering msfragger Data} +\title{Filtering MSFragger Data} \usage{ filter_msfragger_data( msnid, @@ -14,17 +14,16 @@ filter_msfragger_data( ) } \arguments{ -\item{msnid}{(MSnID object) collated msfragger output} +\item{msnid}{(MSnID object) collated MSFragger output} \item{level}{(character) Level at which to perform FDR filter. The name of a column in \code{psms(msnid)}. Currently, only \code{"peptide"} or \code{"accession"} are -supported. The added level \code{SiteID} makes sense only for PTM data and +supported. The added level \code{"SiteID"} makes sense only for PTM data and first requires mapping of the modification site using \code{MSnID::map_mod_sites}.} -\item{filtering_criterion}{(character) One of "evalue" which is -expectation value or "pp_prob" - peptide prophet probability. Default is -"pp_prob".} +\item{filtering_criterion}{(character) One of \code{"evalue"} which is expectation +value or \code{"pp_prob"} - peptide prophet probability. Default is "pp_prob".} \item{fdr.max}{(numeric) Maximum acceptable FDR. Default is 0.01 (1\%).} @@ -32,21 +31,21 @@ expectation value or "pp_prob" - peptide prophet probability. Default is \item{n.iter.nm}{(numeric) number of iterations for Nelder-Mead optimization algorithm.} - -\item{...}{arguments passed to \code{filter_msfragger_data}.} } \value{ -(MSnID object) filtered msfragger output +(MSnID object) filtered MSFragger output } \description{ -Filtering msfragger data. In this implementation, the peptide-level filter -optimizes both ppm and one of Expectation or PeptideProphet Probability thresholds -to achieve maximum number of peptide identifications within a given FDR constraint. +Filtering MSFragger data. In this implementation, the peptide-level filter +optimizes both ppm and one of Expectation or PeptideProphet Probability +thresholds to achieve maximum number of peptide identifications within a +given FDR constraint. +} +\details{ The accession-level filter optimizes based on \code{peptides_per_1000aa}, so \code{\link{compute_num_peptides_per_1000aa}} must be used first. } \seealso{ \code{\link[MSnID]{MSnIDFilter}} -\code{\link[MSnID]{optimize_filter}} -\code{\link[MSnID]{apply_filter}} +\code{\link[MSnID]{optimize_filter}} \code{\link[MSnID]{apply_filter}} } From 5834164c92350e565e9c8b3be828135cdfc37bbb Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:54:51 -0700 Subject: [PATCH 09/17] Update run_plexedpiper Add input checks. Improve readability with switch. Add global variables. --- R/run_plexedpiper.R | 168 ++++++++++++++++++++++++-------------------- 1 file changed, 93 insertions(+), 75 deletions(-) diff --git a/R/run_plexedpiper.R b/R/run_plexedpiper.R index 64108c4..6774575 100644 --- a/R/run_plexedpiper.R +++ b/R/run_plexedpiper.R @@ -51,8 +51,7 @@ #' @importFrom MSnID psms MSnID compute_accession_coverage #' correct_peak_selection extract_sequence_window #' infer_parsimonious_accessions map_mod_sites -#' @importFrom dplyr %>% full_join select mutate filter pull -#' @importFrom tidyselect where +#' @importFrom dplyr %>% full_join select mutate filter pull case_when where #' @importFrom data.table rbindlist #' @importFrom purrr reduce map #' @@ -104,8 +103,7 @@ run_plexedpiper <- function(msgf_output_folder, annotation <- match.arg(annotation, choices = c("REFSEQ", "UNIPROT", "GENCODE")) # global proteomics, phosphorylation, acetylation, ubiquitination, - # oxidation (redox) - # TODO: change pr to g or gl + # reduction/oxidation (redox) proteomics <- match.arg(proteomics, choices = c("pr", "ph", "ac", "ub", "ox")) @@ -114,6 +112,25 @@ run_plexedpiper <- function(msgf_output_folder, stop("ascore_output_folder not provided.") } + if (proteomics != "pr") { + if (is.null(global_results)) { + if (verbose) { + warning("Reference global proteomics dataset (global_results) not provided.") + } + } else { + if (!file.exists(global_results)) { + stop("global_results file not found.") + } + } + } + + if (is.null(file_prefix)) { + file_prefix <- paste0("MSGFPLUS_", toupper(proteomics)) + if(!is.null(global_results)){ + file_prefix <- paste0(file_prefix, "-ip") + } + } + if (all(is.null(c(write_results_to_file, return_results)))) { stop("\nAt least write_results_to_file or return_results must be TRUE.") } @@ -125,13 +142,6 @@ run_plexedpiper <- function(msgf_output_folder, message("- Annotation: \"", annotation, "\"") } - if (is.null(file_prefix)) { - file_prefix <- paste0("MSGFPLUS_", toupper(proteomics)) - if(!is.null(global_results)){ - file_prefix <- paste0(file_prefix,"-ip") - } - } - # Data loading message("- Fetch study design tables") @@ -151,14 +161,11 @@ run_plexedpiper <- function(msgf_output_folder, if (verbose) {message("- Filtering MASIC results.")} masic_data <- map(masic_output_folder, function(masic_folder) { read_masic_data(masic_folder, interference_score = TRUE) %>% - filter_masic_data(0.5, 0) + filter_masic_data(interference_score_threshold = 0.5, s2n_threshold = 0) }) invisible(gc()) - fst <- Biostrings::readAAStringSet(fasta_file) - names(fst) <- sub(" .*", "", names(fst)) # extract first word - if (verbose) {message("- Filtering MS-GF+ results.")} if (proteomics == "pr") { if (verbose) {message(" + Correct for isotope selection error")} @@ -174,19 +181,23 @@ run_plexedpiper <- function(msgf_output_folder, if (proteomics != "pr") { if (verbose) {message(" + Apply PTM filter")} - if (proteomics == "ph") { - reg.expr <- "grepl('\\\\*', peptide)" - } else if (proteomics == "ac") { - reg.expr <- "grepl('#', peptide)" - } else if (proteomics == "ub") { - # "@" and "#" both mean ubiquitination - msnid$peptide <- gsub("@", "#", msnid$peptide) - reg.expr <- "grepl('#', peptide)" - } else if (proteomics == "ox") { - # "@" and "#" do not mean the same thing, so we cannot replace - # the former with the latter. - reg.expr <- "grepl('\\\\@', peptide)" - } + switch(proteomics, + ph = { + reg.expr <- "grepl('\\\\*', peptide)" + }, + ac = { + reg.expr <- "grepl('#', peptide)" + }, + ub = { + # "@" and "#" both mean ubiquitination + msnid$peptide <- gsub("@", "#", msnid$peptide) + reg.expr <- "grepl('#', peptide)" + }, + ox = { + # "@" and "#" do not mean the same thing, so we cannot replace + # the former with the latter. + reg.expr <- "grepl('\\\\@', peptide)" + }) msnid <- apply_filter(msnid, reg.expr) } @@ -204,29 +215,39 @@ run_plexedpiper <- function(msgf_output_folder, if(verbose) {message(" + Remove decoy sequences")} msnid <- apply_filter(msnid, "!isDecoy") - if (annotation == "GENCODE") { - # Remove duplicate GENCODE protein IDs using FASTA file headers - pttrn <- "(ENSP[^\\|]+).*" - unique_ids <- data.frame(id_orig = names(fst)) %>% - mutate(id_new = sub(pttrn, "\\1", id_orig), - duped = duplicated(id_new)) %>% - filter(!duped) %>% - pull(id_orig) - msnid <- apply_filter(msnid, "accession %in% unique_ids") - msnid$accession <- sub(pttrn, "\\1", msnid$accession) - fst <- fst[names(fst) %in% unique_ids] - names(fst) <- sub(pttrn, "\\1", names(fst)) - - # Sanity check - if (anyDuplicated(names(fst)) != 0) { - stop("Duplicate FASTA entry names!") - } - } else if (annotation == "UNIPROT") { - pttrn <- "((sp|tr)\\|)?([^\\|]*)(.*)?" - # Extract middle portion of ID - msnid$accession <- sub(pttrn, "\\3", msnid$accession) - names(fst) <- sub(pttrn, "\\3", names(fst)) - } + # FASTA + fst <- Biostrings::readAAStringSet(fasta_file) + names(fst) <- sub(" .*", "", names(fst)) + msnid$accession <- sub(" .*", "", msnid$accession) + + switch(annotation, + GENCODE = { + # Remove duplicate GENCODE protein IDs using FASTA file headers + pttrn <- "\\|.*" + unique_ids <- data.frame(id_orig = names(fst)) %>% + arrange(id_orig) %>% + mutate(id_new = sub(pttrn, "", id_orig)) %>% + filter(!duplicated(id_new)) %>% + pull(id_orig) + msnid <- apply_filter(msnid, "accession %in% unique_ids") + msnid$accession <- sub(pttrn, "", msnid$accession) + fst <- fst[names(fst) %in% unique_ids] + names(fst) <- sub(pttrn, "", names(fst)) + + # Sanity check + if (anyDuplicated(names(fst)) != 0) { + stop("Duplicate FASTA entry names!") + } + }, + UNIPROT = { + pttrn <- "((sp|tr)\\|)?([^\\|]*)(.*)?" + # Extract middle portion of ID + msnid$accession <- sub(pttrn, "\\3", msnid$accession) + names(fst) <- sub(pttrn, "\\3", names(fst)) + }, + REFSEQ = { + # Extract first word (already done before switch) + }) if (verbose) {message(" + Concatenating redundant protein matches")} msnid <- assess_redundant_protein_matches(msnid, collapse = ",") @@ -235,15 +256,10 @@ run_plexedpiper <- function(msgf_output_folder, msnid <- assess_noninferable_proteins(msnid, collapse = ",") if (verbose) {message(" + Inference of parsimonius set")} - if (proteomics == "pr") { - prior <- character(0) - } else if (is.null(global_results)) { - if (verbose) { - message(" > Reference global proteomics dataset NOT provided") - } + if (proteomics == "pr" | is.null(global_results)) { prior <- character(0) } else { - global_ratios <- read.table(global_results, header=TRUE, sep="\t") + global_ratios <- read.table(global_results, header = TRUE, sep = "\t") prior <- unique(global_ratios$protein_id) } @@ -257,13 +273,10 @@ run_plexedpiper <- function(msgf_output_folder, msnid <- compute_accession_coverage(msnid, fst) } else { if(verbose) {message(" + Mapping sites to protein sequence")} - if (proteomics == "ph") { - mod_char <- "*" - } else if (proteomics %in% c("ac", "ub")) { - mod_char <- "#" - } else if (proteomics == "ox") { - mod_char <- "@" - } + + mod_char <- case_when(proteomics == "ph" ~ "*", + proteomics %in% c("ac", "ub") ~ "#", + proteomics == "ox" ~ "@") msnid <- map_mod_sites(object = msnid, fasta = fst, @@ -287,11 +300,11 @@ run_plexedpiper <- function(msgf_output_folder, fasta_file = fasta_file) if (verbose) {message("- Making results tables.")} - rii_fun <- switch(EXPR = proteomics, - pr = make_rii_peptide_gl, + rii_fun <- ifelse(proteomics == "pr", + make_rii_peptide_gl, make_rii_peptide_ph) - ratio_fun <- switch(EXPR = proteomics, - pr = make_results_ratio_gl, + ratio_fun <- ifelse(proteomics == "pr", + make_results_ratio_gl, make_results_ratio_ph) results_ratio <- rii_peptide <- list() @@ -301,11 +314,14 @@ run_plexedpiper <- function(msgf_output_folder, suppressMessages(results_ratio[[i]] <- do.call(ratio_fun, args)) } # Combine tables and remove columns with all missing values - rii_peptide <- purrr::reduce(rii_peptide, .f = full_join) %>% + rii_peptide <- reduce(rii_peptide, .f = full_join) %>% dplyr::select(where(~ !all(is.na(.x)))) - results_ratio <- purrr::reduce(results_ratio, .f = full_join) %>% + results_ratio <- reduce(results_ratio, .f = full_join) %>% dplyr::select(where(~ !all(is.na(.x)))) + names(rii_peptide) <- make.names(names(rii_peptide)) + names(results_ratio) <- make.names(names(results_ratio)) + if (verbose) {message("- Saving results.")} if (is.null(output_folder)) { @@ -323,8 +339,7 @@ run_plexedpiper <- function(msgf_output_folder, file = file.path(output_folder, filename), sep = "\t", row.names = FALSE, - quote = FALSE, - na = "") + quote = FALSE) if(verbose) { message("- RII file save to ", file.path(output_folder, filename)) } @@ -334,8 +349,7 @@ run_plexedpiper <- function(msgf_output_folder, file = file.path(output_folder, filename), sep = "\t", row.names = FALSE, - quote = FALSE, - na = "") + quote = FALSE) if(verbose) { message("- RATIO file save to ", file.path(output_folder, filename)) } @@ -359,3 +373,7 @@ run_plexedpiper <- function(msgf_output_folder, } } + +utils::globalVariables( + c("id_orig", "id_new") +) From a71519ad40ce97c3d64728834be4c8b5e73e5992 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:56:04 -0700 Subject: [PATCH 10/17] Update read_study_design Fix checks for valid TMT layout. --- R/read_study_design.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/read_study_design.R b/R/read_study_design.R index c871e87..9d1a105 100644 --- a/R/read_study_design.R +++ b/R/read_study_design.R @@ -100,13 +100,13 @@ read_study_design <- function(path_to_study_design, reporter_converter <- PlexedPiper::reporter_converter for (i in seq_along(reporter_converter)) { if (setequal(study_design$samples$ReporterName, - reporter_converter[[i]]$ReporterIon)) { + reporter_converter[[i]]$ReporterName)) { converter <- data.table(reporter_converter[[i]]) break } } - if (missing(converter)) { + if (!exists("converter")) { stop(paste("No reporter ion converter tables match reporter ions in", "samples$ReporterName")) } From 3446b70f2dd86c15856b12a40bdc3b50fa634357 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 08:57:21 -0700 Subject: [PATCH 11/17] Improve readability with switch function. Update global variables. --- R/motrpac_bic_funtions.R | 201 ++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 97 deletions(-) diff --git a/R/motrpac_bic_funtions.R b/R/motrpac_bic_funtions.R index 64fbc45..fdfe527 100644 --- a/R/motrpac_bic_funtions.R +++ b/R/motrpac_bic_funtions.R @@ -40,7 +40,7 @@ #' #' @importFrom MSnID fetch_conversion_table parse_FASTA_names #' @importFrom dplyr select inner_join left_join mutate %>% case_when rename -#' group_by summarize arrange across +#' group_by summarize arrange across any_of rowwise #' @importFrom tibble rownames_to_column #' #' @name motrpac_bic_output @@ -144,25 +144,28 @@ make_rii_peptide_gl <- function(msnid, from <- annotation to <- c("SYMBOL", "ENTREZID") - if (annotation == "REFSEQ") { - rgx <- "(^.*)\\.\\d+" - grp <- "\\1" - } else if (annotation == "UNIPROT") { - rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" - grp <- "\\3" - fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% - dplyr::rename(SYMBOL = gene, - protein_id = unique_id) - from <- "SYMBOL" - to <- "ENTREZID" - } else if (annotation == "GENCODE") { - rgx <- "(ENSP[^\\|]+).*" - grp <- "\\1" - fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% - dplyr::rename(SYMBOL = gene) - from <- "SYMBOL" - to <- "ENTREZID" - } + switch(annotation, + REFSEQ = { + rgx <- "(^.*)\\.\\d+" + grp <- "\\1" + }, + UNIPROT = { + rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" + grp <- "\\3" + fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% + dplyr::rename(SYMBOL = gene, + protein_id = unique_id) + from <- "SYMBOL" + to <- "ENTREZID" + }, + GENCODE = { + rgx <- "(ENSP[^\\|]+).*" + grp <- "\\1" + fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% + dplyr::rename(SYMBOL = gene) + from <- "SYMBOL" + to <- "ENTREZID" + }) conv <- suppressWarnings( fetch_conversion_table(org_name, from = from, to = to) @@ -171,9 +174,6 @@ make_rii_peptide_gl <- function(msnid, # Feature data feature_data <- crosstab %>% dplyr::select(Specie) %>% - ## Old code: does not work if PTMs are denoted by "@" - # tidyr::separate(col = Specie, into = c("protein_id", "sequence"), - # sep = "@", remove = FALSE) %>% mutate(protein_id = sub("(^[^@]*)@(.*)", "\\1", Specie), sequence = sub("(^[^@]*)@(.*)", "\\2", Specie)) %>% mutate(organism_name = org_name) @@ -217,7 +217,7 @@ utils::globalVariables( "protein_id", "ANNOTATION", "SYMBOL", "ENTREZID", "accession", "peptide", "noninferableProteins", "MSGFDB_SpecEValue", "redundant_ids", "gene", - "feature", "transcript_id") + "feature", "transcript_id", "unique_id") ) @@ -244,27 +244,30 @@ make_results_ratio_gl <- function(msnid, rownames_to_column("protein_id") ## Add feature annotations -------------------------------------------- - from <- annotation - to <- c("SYMBOL", "ENTREZID") - if (annotation == "REFSEQ") { - rgx <- "(^.*)\\.\\d+" - grp <- "\\1" - } else if (annotation == "UNIPROT") { - rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" - grp <- "\\3" - fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% - dplyr::rename(SYMBOL = gene, - protein_id = unique_id) - from <- "SYMBOL" - to <- "ENTREZID" - } else if (annotation == "GENCODE") { - rgx <- "(ENSP[^\\|]+).*" - grp <- "\\1" - fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% - dplyr::rename(SYMBOL = gene) - from <- "SYMBOL" - to <- "ENTREZID" - } + switch(annotation, + REFSEQ = { + rgx <- "(^.*)\\.\\d+" + grp <- "\\1" + from <- "REFSEQ" + to <- c("SYMBOL", "ENTREZID") + }, + UNIPROT = { + rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" + grp <- "\\3" + fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% + dplyr::rename(SYMBOL = gene, + protein_id = unique_id) + from <- "SYMBOL" + to <- "ENTREZID" + }, + GENCODE = { + rgx <- "(ENSP[^\\|]+).*" + grp <- "\\1" + fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% + dplyr::rename(SYMBOL = gene) + from <- "SYMBOL" + to <- "ENTREZID" + }) conv <- suppressWarnings( fetch_conversion_table(org_name, from = from, to = to) @@ -312,8 +315,10 @@ make_results_ratio_gl <- function(msnid, return(results_ratio) } -utils::globalVariables(c("noninferableProteins", "percentAACoverage", - "percent_coverage", "feature", "transcript_id")) +utils::globalVariables( + c("noninferableProteins", "percentAACoverage", "percent_coverage", + "feature", "transcript_id") +) #' @export @@ -341,8 +346,6 @@ make_rii_peptide_ph <- function(msnid, mutate(Reference = 1) ## Create Crosstab - annotation <- toupper(annotation) - aggregation_level <- c("accession", "peptide", "SiteID") crosstab <- create_crosstab(msnid, masic_data, @@ -354,27 +357,31 @@ make_rii_peptide_ph <- function(msnid, rownames_to_column("Specie") ## Fetch conversion table ----- - from <- annotation - to <- c("SYMBOL", "ENTREZID") - if (annotation == "REFSEQ") { - rgx <- "(^.*)\\.\\d+" - grp <- "\\1" - } else if (annotation == "UNIPROT") { - rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" - grp <- "\\3" - fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% - dplyr::rename(SYMBOL = gene, - protein_id = unique_id) - from <- "SYMBOL" - to <- "ENTREZID" - } else if (annotation == "GENCODE") { - rgx <- "(ENSP[^\\|]+).*" - grp <- "\\1" - fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% - dplyr::rename(SYMBOL = gene) - from <- "SYMBOL" - to <- "ENTREZID" - } + annotation <- toupper(annotation) + switch(annotation, + REFSEQ = { + rgx <- "(^.*)\\.\\d+" + grp <- "\\1" + from <- "REFSEQ" + to <- c("SYMBOL", "ENTREZID") + }, + UNIPROT = { + rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" + grp <- "\\3" + fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% + dplyr::rename(SYMBOL = gene, + protein_id = unique_id) + from <- "SYMBOL" + to <- "ENTREZID" + }, + GENCODE = { + rgx <- "(ENSP[^\\|]+).*" + grp <- "\\1" + fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% + dplyr::rename(SYMBOL = gene) + from <- "SYMBOL" + to <- "ENTREZID" + }) conv <- suppressWarnings( fetch_conversion_table(org_name, from = from, to = to) @@ -439,7 +446,7 @@ make_rii_peptide_ph <- function(msnid, } utils::globalVariables( - c("ptm_peptide", "MeasurementName", "ReporterAlias", + c(".", "ptm_peptide", "MeasurementName", "ReporterAlias", "PlexID", "Specie", "ptm_id", "protein_id", "ANNOTATION", "SYMBOL", "ENTREZID", "accession", "peptide", "SiteID", "sequenceWindow", @@ -468,28 +475,31 @@ make_results_ratio_ph <- function(msnid, ## Fetch conversation table annotation <- toupper(annotation) - from <- annotation - to <- c("SYMBOL", "ENTREZID") - if (annotation == "REFSEQ") { - rgx <- "(^.*)\\.\\d+" - grp <- "\\1" - } else if (annotation == "UNIPROT") { - rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" - grp <- "\\3" - fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% - dplyr::rename(SYMBOL = gene, - protein_id = unique_id) - from <- "SYMBOL" - to <- "ENTREZID" - } else if (annotation == "GENCODE") { - rgx <- "(ENSP[^\\|]+).*" - grp <- "\\1" - fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% - dplyr::rename(SYMBOL = gene) - from <- "SYMBOL" - to <- "ENTREZID" - } + switch(annotation, + REFSEQ = { + rgx <- "(^.*)\\.\\d+" + grp <- "\\1" + from <- "REFSEQ" + to <- c("SYMBOL", "ENTREZID") + }, + UNIPROT = { + rgx <- "((sp|tr)\\|)?([^\\|]*)(.*)?" + grp <- "\\3" + fasta_names <- parse_FASTA_names(fasta_file, "uniprot") %>% + dplyr::rename(SYMBOL = gene, + protein_id = unique_id) + from <- "SYMBOL" + to <- "ENTREZID" + }, + GENCODE = { + rgx <- "(ENSP[^\\|]+).*" + grp <- "\\1" + fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>% + dplyr::rename(SYMBOL = gene) + from <- "SYMBOL" + to <- "ENTREZID" + }) conv <- suppressWarnings( fetch_conversion_table(org_name, from = from, to = to) @@ -498,9 +508,6 @@ make_results_ratio_ph <- function(msnid, ## Create RII peptide table feature_data <- crosstab %>% select(Specie) %>% - ## Old code: does not work if PTMs are denoted by "@" - # tidyr::separate(Specie, into = c("protein_id", "ptm_id"), - # sep = "@", remove = FALSE) %>% mutate(protein_id = sub("(^[^@]*)@(.*)", "\\1", Specie), ptm_id = sub("(^[^@]*)@(.*)", "\\2", Specie)) %>% mutate(organism_name = org_name) @@ -556,9 +563,9 @@ make_results_ratio_ph <- function(msnid, } utils::globalVariables( - c("flanking_sequence", "redundant_ids", "peptide_score", + c(".", "flanking_sequence", "redundant_ids", "peptide_score", "confident_score", "noninferableProteins", "gene", - "feature", "transcript_id") + "feature", "transcript_id", "unique_id") ) From 70d6f5ca1fe24e09c31887f3d8b3ac2982ebd965 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 09:03:14 -0700 Subject: [PATCH 12/17] Update DESCRIPTION and NAMESPACE Remove tidyselect import: the where function is now imported from dplyr. --- DESCRIPTION | 5 ++--- NAMESPACE | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6d2c62c..c7bc448 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: PlexedPiper Type: Package Title: Pipeline for isobaric quantification -Version: 0.5.0 -Date: 2023-06-14 +Version: 0.4.2 +Date: 2023-07-06 Author: Vladislav Petyuk vladislav.petyuk@pnnl.gov Maintainer: Vladislav Petyuk Description: Pipeline for isobaric quantification. @@ -21,7 +21,6 @@ Imports: purrr, tibble, tidyr, - tidyselect, readr, utils Suggests: diff --git a/NAMESPACE b/NAMESPACE index 887a116..a2629d8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ importFrom(data.table,setnames) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) +importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) @@ -76,11 +77,13 @@ importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,pull) importFrom(dplyr,rename) +importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,starts_with) importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) +importFrom(dplyr,where) importFrom(plyr,llply) importFrom(purrr,map) importFrom(purrr,reduce) @@ -88,7 +91,6 @@ importFrom(readr,read_tsv) importFrom(tibble,rownames_to_column) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) -importFrom(tidyselect,where) importFrom(utils,read.delim) importFrom(utils,read.table) importFrom(utils,write.table) From 8fc8dad9521b9beec5e482f4961d1932da3071e4 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 09:04:18 -0700 Subject: [PATCH 13/17] Update NEWS.md to v0.4.2 --- NEWS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index efe6a7f..ff57e2d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# PlexedPiper 0.4.2 (2023-07-06) + +- Update `run_plexedpiper` to process redox proteomics data. +- Add more robust checks for `read_study_design` output. +- Set minimum MSnID version to 1.25.2, though this does not prevent installation of an incorrect version of MSnID from Bioconductor, since the version number on Bioconductor is higher. + # PlexedPiper 0.4.1 (2023-02-07) - Removed duplicate GENCODE protein IDs from `run_plexedpiper` output. GENCODE IDs are currently only unique when combining the protein (ENSP) and transcript (ENST) IDs. Since there are so few duplicates, we will remove them rather than concatenating these IDs in the "protein_id" column of the output of `make_results_ratio_*` and `make_rii_peptide_*` functions. From a0a562ecbc404e5face86ba360f2b3586a9eb6b1 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Thu, 6 Jul 2023 09:25:19 -0700 Subject: [PATCH 14/17] Update GitHub R-CMD-check workflow v1 --> v2 --- .github/workflows/R-CMD-check.yaml | 64 ++++++++++-------------------- README.md | 1 + 2 files changed, 21 insertions(+), 44 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 3d00814..a67ec0f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,12 +1,12 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help - on: push: - branches: - master + branches: [main, master] pull_request: - branches: master + branches: [main, master] workflow_dispatch: + branches: [main, master] name: R-CMD-check @@ -20,56 +20,32 @@ jobs: fail-fast: false matrix: config: + - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: macOS-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes steps: - - name: Set up Git Repository - uses: actions/checkout@v2 + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 - - name: Set up R - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true - - name: Set up Pandoc - uses: r-lib/actions/setup-pandoc@v1 - - - name: Query dependencies - run: | - install.packages("devtools") - saveRDS(devtools::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Restore R package cache - uses: actions/cache@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - - - name: Install dependencies - run: | - devtools::install_deps(dependencies = TRUE) - shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: | - options(crayon.enabled = TRUE) - devtools::check(error_on = "error", vignettes = FALSE) - shell: Rscript {0} + extra-packages: any::rcmdcheck + needs: check - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main + - uses: r-lib/actions/check-r-package@v2 with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check + upload-snapshots: true diff --git a/README.md b/README.md index ef394af..19e686d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![R-CMD-check](https://github.com/PNNL-Comp-Mass-Spec/PlexedPiper/workflows/R-CMD-check/badge.svg)](https://github.com/PNNL-Comp-Mass-Spec/PlexedPiper/actions) +[![R-CMD-check](https://github.com/PNNL-Comp-Mass-Spec/PlexedPiper/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/PNNL-Comp-Mass-Spec/PlexedPiper/actions/workflows/R-CMD-check.yaml) From c84ae1a5184a29062700529b712afb16e77aba21 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 11 Jul 2023 11:22:58 -0700 Subject: [PATCH 15/17] Update vignettes Replace get_AScore_results with read_AScore_results_from_DMS. --- vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd | 2 +- vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd | 2 +- vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd index 942eff9..ac26dc8 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos.Rmd @@ -74,7 +74,7 @@ newly suggested locations. Importantly it contains `AScore` column that signifies the confidence of PTM assingment. `AScore > 17` is considered confident. ```{r ascore} -ascore <- get_AScore_results(data_package_num) +ascore <- read_AScore_results_from_DMS(data_package_num) msnid <- best_PTM_location_by_ascore(msnid, ascore) ``` diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd index 90dbc3e..95e6441 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_MoTrPAC.Rmd @@ -78,7 +78,7 @@ newly suggested locations. Importantly it contains `AScore` column that signifies the confidence of PTM assingment. `AScore > 17` is considered confident. ```{r ascore} -ascore <- get_AScore_results(data_package_num) +ascore <- read_AScore_results_from_DMS(data_package_num) msnid <- best_PTM_location_by_ascore(msnid, ascore) ``` diff --git a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd index 000fdb8..52e4166 100644 --- a/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd +++ b/vignettes/tmt_pipeline_for_PNNL_DMS_phos_pep_MoTrPAC.Rmd @@ -78,7 +78,7 @@ newly suggested locations. Importantly it contains `AScore` column that signifies the confidence of PTM assingment. `AScore > 17` is considered confident. ```{r ascore} -ascore <- get_AScore_results(data_package_num) +ascore <- read_AScore_results_from_DMS(data_package_num) msnid <- best_PTM_location_by_ascore(msnid, ascore) ``` From bb8a4c9c84d87512a3c5653fbc982361d803e7f5 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Tue, 11 Jul 2023 11:26:24 -0700 Subject: [PATCH 16/17] Update pkgdown GitHub workflow Only run when a new release is published or upon workflow dispatch. --- .github/workflows/pkgdown.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 194c991..de5cd14 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,10 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: [main, master] - pull_request: - branches: [main, master] release: types: [published] workflow_dispatch: From 38c98f45fa76ebb112a7c1cd190a8e041d814e03 Mon Sep 17 00:00:00 2001 From: TylerSagendorf Date: Wed, 12 Jul 2023 19:45:25 -0700 Subject: [PATCH 17/17] Update R-CMD-check workflow Only keep current release machines. --- .github/workflows/R-CMD-check.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index a67ec0f..3d0aa13 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,9 +22,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}