v0.1.11 - Added visualization of detected number of signature genes v…

…s number of mapped reads (signature_genes/read-count_detected-genes.pdf)
Russel88 · Jan 3, 2023 · 4e6a3f7 · 4e6a3f7
1 parent 347ccb5
commit 4e6a3f7
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 1 deletion.
diff --git a/maginator/workflow/scripts/gene_refinement_plots.R b/maginator/workflow/scripts/gene_refinement_plots.R
@@ -0,0 +1,97 @@
+# Loading relevant data
+library(ggplot2)
+
+GeneLengths <- readRDS(snakemake@input[["gene_lengths"]])
+Clusterlist <- readRDS(snakemake@input[["clusters_sorted"]])
+sg_files <- snakemake@input[["screened_clusters"]]
+screened_clusters <- do.call("rbind", lapply(sg_files, readRDS))
+taxmat <- read.csv(snakemake@input[["tax_matrix"]], sep="\t", header=FALSE) 
+
+# Initializing relevant parameters
+n_signature_genes_expected <- '95'
+minimum_sampels <- 3
+n.genes <- 100
+
+#loading colors
+cmcol1 = c('#0571B0', '#92C5DE', '#999999', '#E0E0E0', '#CA0020',
+           '#FF8784', '#FFD900', '#FFE967', '#349F2C', '#BAE4B3',
+           '#6B3D9A', '#CAB2D6')
+
+# The function for visualizing the reads and the expected distribution along with it's MSE and taxonomy
+pois.two.plot <- function(i.Genes, i.Reads, i.mse, r.Genes, r.Reads, r.mse, id){ 
+  # Plotting the reads and mapped genes for each sample
+  #
+  # Args:
+  #   Genes: A named vector containing the number of genes mapped in the samples
+  #      i.Genes: initial gene names
+  #      r.Genes: Refined gene names
+  #   Reads: Vector containing the number of reads mapping to the Genes. Genes and Reads must 
+  #           have the same length, greater than one, with no missing values
+  #   mse:  The MSE of the MGS when compared to the expected distribution
+  #   id is the ID of the species in the HGMGS object
+  #
+  # Returns:
+  #   A plot of all samples with their corresponding genes and mapped reads (log)
+
+
+  df_plot <- do.call(rbind, Map(data.frame, Reads = i.Reads, Genes = i.Genes, name = names(i.Genes)))
+  df_r_plot <- do.call(rbind, Map(data.frame, r.Reads = r.Reads, r.Genes = r.Genes, name = names(r.Genes)))
+
+
+  p = ggplot(log = "x", xlim = c(0.1, 1e5), cex = 0.8, pch = 25, ylim = c(0, n.genes), 
+             data = df_plot, aes(text = name, y = Genes, x = Reads)) + 
+    geom_point(size = 0.7, aes(color = "Initial")) +
+    stat_function(fun =  function(Genes) (1 - ((n.genes - 1) / n.genes)^Genes) * n.genes, aes(Genes, color="Expected"), size=0.25, alpha=0.5) + #t he expected distribution
+    xlab('Reads mapped (log)') +
+    ylab('Signature genes detected') +
+    ggtitle(id) +
+    geom_point(data=df_r_plot, aes(x=r.Reads, y=r.Genes,color="Refined"), size=0.7)
+
+  plotly.data <- (p + scale_x_log10()+ scale_colour_manual(values = c("Initial" = cmcol1[2], "Refined"=cmcol1[5], "Expected"="black")))
+  plotly.data <- plotly.data + annotate("text",x=min(df_plot$Reads),y=max(df_plot$Genes),hjust=.2, label = paste("initial MSE:", round(i.mse, 2), "\n refined MSE:", round(r.mse, 2))) 
+  plotly.data <- plotly.data  + theme_minimal() + theme(legend.position="bottom")
+  return(plotly.data)  
+}
+
+
+Cluster_gene_names <- c()
+for(Cluster in names(Clusterlist)){
+  Cluster_gene_names <- c(Cluster_gene_names, rownames(Clusterlist[[Cluster]]))
+}
+
+present_genes <- GeneLengths[names(GeneLengths) %in% Cluster_gene_names]
+
+
+pdf(snakemake@output[["plot_pdf"]])
+
+for(Cluster in taxmat$V1){
+
+if (length(screened_clusters[screened_clusters[,'id']==Cluster,]) > 1){
+
+tax=taxmat[taxmat$V1==Cluster,7]
+if (is.na(taxmat[taxmat$V1==Cluster,7])){
+tax=taxmat[taxmat$V1==Cluster,6]}
+
+colsum <- colSums(Clusterlist[[Cluster]][1:n.genes, ])
+mapped <- colsum[colsum >= minimum_sampels]
+
+
+refined_reads <- round(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$best, names(mapped)] / 
+                 +                   (present_genes[rownames(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$best, ])] * 10^-3))
+
+refined_countreads <- colSums(refined_reads)
+refined_genes <- colSums(refined_reads > 0) 
+
+reads <- round(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$init,names(mapped)] / 
+                 +                   (present_genes[rownames(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$init, ])] * 10^-3))
+
+countreads <- colSums(reads)
+genes <- colSums(reads > 0) 
+
+
+p <- pois.two.plot(genes, countreads, as.numeric(screened_clusters[screened_clusters[,'id']==Cluster,]$mse$init), refined_genes, refined_countreads, as.numeric(screened_clusters[screened_clusters[,'id']==Cluster,]$MSE$best), paste(Cluster, tax))
+
+print(p)
+
+}}
+dev.off()
diff --git a/maginator/workflow/signature_genes.Snakefile b/maginator/workflow/signature_genes.Snakefile
@@ -15,7 +15,8 @@ CLUSTERS = {x for x in CLUSTERS if x.isdigit()}
 
 rule all:
     input:
-        os.path.join(WD, 'abundance', 'abundance_phyloseq.RData')
+        os.path.join(WD, 'abundance', 'abundance_phyloseq.RData'),
+        os.path.join(WD, 'signature_genes', 'read-count_detected-genes.pdf')
 
 
 # Identifying the refined sets of signature genes using the gene count matrix and gene lengths
@@ -76,3 +77,23 @@ rule abundance_profile:
         runtime = '12:00:00'
     script:
         "scripts/abundance_profiles.R"
+
+
+# Create figure for each MGS with expected readcounts and number of detected Signature Genes
+rule gene_refinement_plots:
+    input:
+        gene_lengths = os.path.join(WD, 'signature_genes', 'gene_lengths.RDS'),
+        screened_clusters = expand(os.path.join(WD, 'signature_genes', 'screened', 'cluster_{cluster}_screened.RDS'), cluster=CLUSTERS),
+        clusters_sorted = os.path.join(WD, 'signature_genes', 'clusters_sorted.RDS'),
+        annotation = os.path.join(WD, 'tabs', 'metagenomicspecies.tab'),
+        tax_matrix = os.path.join(WD, 'tabs', 'tax_matrix.tsv')
+    output:
+        plot_pdf = os.path.join(WD, 'signature_genes', 'read-count_detected-genes.pdf')
+    conda:
+        "envs/signature_genes.yaml"
+    resources:
+        cores = 1,
+        memory = 80,
+        runtime = '12:00:00'
+    script:
+        "scripts/gene_refinement_plots.R"