Skip to content

Commit

Permalink
update metadata functions
Browse files Browse the repository at this point in the history
  • Loading branch information
lstanish committed Nov 4, 2020
1 parent a61d7f1 commit 1a354f5
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 3 deletions.
115 changes: 115 additions & 0 deletions R/qcMetadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#' QC Raw Sequence Metadata
#'
#' Performs basic QC checks and reports potential issues with downstream processing.
#'
#' @param metadata The output of downloadSequenceMetadata(). Must be provided as either the data.frame returned by downloadSequenceMetadata() or as a filepath to the csv file produced by downloadSequenceMetadata() when dir is specified.
#' @param outdir_sequence Default file.path(PRESET_OUTDIR, PRESET_OUTDIR_SEQUENCE). Directory where raw sequence files can be found before reorganizing.
#'
#' @return Report file of potential quality issues with metadata records
qcMetadata <- function(metadata, outdir = file.path(PRESET_OUTDIR, PRESET_OUTDIR_SEQMETA)) {

metadata_load_err <- FALSE

if(class(metadata) == "data.frame") {
metadata <- metadata
} else if(class(metadata) == "character") {
if(file.exists(metadata)) {
metadata <- read.csv(metadata)
} else {
metadata_load_err <- TRUE
}
} else {
metadata_load_err <- TRUE
}

if(metadata_load_err) {
stop("'metadata' must be the data.frame output from downloadSequenceMetadata() or a filepath to the local copy of the output from downloadSequenceMetadata()")
}

# QC check #1: completeness


# Using metadata, look up the sequencing run ID and target gene for each file
match_fn_to_meta <- match(basename(fn), metadata$rawDataFileName)
runIDs <- metadata$sequencerRunID[match_fn_to_meta]
targetGenes <- as.character(metadata$targetGene[match_fn_to_meta])
targetGenes[grep("16S", targetGenes, ignore.case=TRUE)] <- "16S"
targetGenes[grep("ITS", targetGenes, ignore.case=TRUE)] <- "ITS"

if(all(is.na(runIDs))) stop("All sequencer run ID values are NA. Cannot proceed with reorganizing this download batch.
Try a different `fn` or `metadata`.")

files_organized <- c()
for(i in 1:length(fn)) {
# get run ID associated with the zip file
runID <- runIDs[i]
if(is.na(runID)) {
warning("No recorded sequencer run ID in metadata for file: ", fn[i], ". Skipping this file.")
next
}

# get target gene associated with the zip file
targetGene <- targetGenes[i]
if(is.na(runID)) {
warning("No recorded target gene in metadata for file: ", fn[i], ". Skipping this file.")
next
}
if(!(targetGene %in% c("16S", "ITS"))) {
warning("Recorded target gene in metadata for file does not clearly indicate '16S' or 'ITS': ", fn[i],
". Skipping this file.")
next
}

# If the file is a tar file (unusual case)
if(grepl("tar.gz", fn[i])) {
tar_filenames <- untar(fn[i], list=TRUE)
untarred_dir <- file.path(outdir_sequence, "untarred")
untar_dir <- if(!dir.exists(untarred_dir)) dir.create(untarred_dir)
untar(fn[i], list=FALSE, exdir = untarred_dir)
file.remove(fn[i])
untarred <- file.path(untarred_dir, tar_filenames)

# rename untarred files by appending sequencer run ID and moving to target gene-specific subdirectory
untarred_rename_to <- file.path(outdir_sequence, targetGene, "0_raw", paste0("run", runID, "_", basename(untarred)))
if(length(untarred) > 0) {
untarred_rename_to <- file.path(outdir_sequence, targetGene, "0_raw", paste0("run", runID, "_", basename(untarred)))
renamed <- file.rename(untarred, untarred_rename_to)
if(!all(renamed)) {
warning(sum(!renamed), " untarred file(s) in sequencer run ", runID, "were not successfully reorganized")
}
files_organized <- c(files_organized, untarred_rename_to[renamed])
}
message("Extracted files from ", basename(fn[i]), " and reorganized its contents.")

# If the file is not tarred (typical case)
} else {

# rename file by appending sequencer run ID and moving to target gene-specific subdirectory
rename_to <- file.path(outdir_sequence, targetGene, "0_raw", paste0("run", runID, "_", basename(fn[i])))
renamed <- file.rename(fn[i], rename_to)

if(!renamed) {
warning(fn[i], " was not successfully reorganized")
} else {
files_organized <- c(files_organized, rename_to)
}
message("Reorganized ", basename(fn[i]))
}
}

# Gzip all non-gzipped fastq files
for(i in 1:length(files_organized)) {
if(!grepl(".gz$", files_organized[i])) {
gzip(files_organized[i])
}
}

if(length(files_organized) > 0) {
message(length(files_organized), " out of ", length(fn), " files in sequencer run ", runID, " were successfully reorganized.")
message("Reorganized files can be found in ", file.path(outdir_sequence, "ITS", "0_raw"), " and ", file.path(outdir_sequence, "16S", "0_raw"), ".")
return(files_organized)
} else {
warning("No files were successfully reorganized")
return(NULL)
}
}
8 changes: 5 additions & 3 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -554,15 +554,15 @@ downloadSequenceMetadata <- function(sites='all', startYrMo=NA, endYrMo=NA, targ
if(targetGene=="all") {
joinedTarget <- left_join(raw, seq, by=c('dnaSampleID', 'sequencerRunID', 'internalLabID'))
out <- joinedTarget[!is.na(joinedTarget$uid.y), ]
message(paste0(length(grep("16S", joinedTarget$rawDataFileName)), " 16S records and ", length(grep("ITS", joinedTarget$rawDataFileName)), " ITS records found."))
message(paste0(length(grep("16S", joinedTarget$rawDataFileName)), " 16S raw sequence records and ", length(grep("ITS", joinedTarget$rawDataFileName)), " ITS records found."))
}

# clean up redundant column names
names(out) <- gsub("\\.x", ".rawFiles", names(out))
names(out) <- gsub("\\.y", ".seq", names(out))

# join with DNA extraction metadata
outDNA <- left_join(out, dna, by=c('plotID', 'dnaSampleID', 'internalLabID'))
outDNA <- left_join(out, dna, by=c('plotID', 'dnaSampleID'))
# clean up redundant column names
names(outDNA) <- gsub("\\.x", ".seq", names(outDNA))
names(outDNA) <- gsub("\\.y", ".dna", names(outDNA))
Expand All @@ -575,10 +575,12 @@ downloadSequenceMetadata <- function(sites='all', startYrMo=NA, endYrMo=NA, targ
names(outDNA)[names(outDNA)=="dnaProcessedBy"] <- 'processedBy.dna'

# join with PCR amplification metadata
outPCR <- left_join(outDNA, pcr, by=c('plotID', 'dnaSampleID', 'internalLabID'))
outPCR <- left_join(outDNA, pcr, by=c('plotID', 'dnaSampleID'))
# clean up redundant column names
names(outPCR)[names(outPCR)=="uid"] <- "uid.pcr"
names(outPCR)[names(outPCR)=="processedDate"] <- "processedDate.pcr"
names(outPCR)[names(outPCR)=="testProtocolVersion"] <- "testProtocolVersion.pcr"
names(outPCR)[names(outPCR)=="internalLabID"] <- "internalLabID.pcr"
names(outPCR)[names(outPCR)=="qaqcStatus"] <- "qaqcStatus.pcr"
names(outPCR)[names(outPCR)=="processedBy"] <- "processedBy.pcr"
names(outPCR)[names(outPCR)=="remarks"] <- "remarks.pcr"
Expand Down

0 comments on commit 1a354f5

Please sign in to comment.