Skip to content

Commit

Permalink
v0.1.11 - Added visualization of detected number of signature genes v…
Browse files Browse the repository at this point in the history
…s number of mapped reads (signature_genes/read-count_detected-genes.pdf)
  • Loading branch information
trinezac committed Jan 3, 2023
1 parent 347ccb5 commit 4e6a3f7
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 1 deletion.
97 changes: 97 additions & 0 deletions maginator/workflow/scripts/gene_refinement_plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Loading relevant data
library(ggplot2)

GeneLengths <- readRDS(snakemake@input[["gene_lengths"]])
Clusterlist <- readRDS(snakemake@input[["clusters_sorted"]])
sg_files <- snakemake@input[["screened_clusters"]]
screened_clusters <- do.call("rbind", lapply(sg_files, readRDS))
taxmat <- read.csv(snakemake@input[["tax_matrix"]], sep="\t", header=FALSE)

# Initializing relevant parameters
n_signature_genes_expected <- '95'
minimum_sampels <- 3
n.genes <- 100

#loading colors
cmcol1 = c('#0571B0', '#92C5DE', '#999999', '#E0E0E0', '#CA0020',
'#FF8784', '#FFD900', '#FFE967', '#349F2C', '#BAE4B3',
'#6B3D9A', '#CAB2D6')

# The function for visualizing the reads and the expected distribution along with it's MSE and taxonomy
pois.two.plot <- function(i.Genes, i.Reads, i.mse, r.Genes, r.Reads, r.mse, id){
# Plotting the reads and mapped genes for each sample
#
# Args:
# Genes: A named vector containing the number of genes mapped in the samples
# i.Genes: initial gene names
# r.Genes: Refined gene names
# Reads: Vector containing the number of reads mapping to the Genes. Genes and Reads must
# have the same length, greater than one, with no missing values
# mse: The MSE of the MGS when compared to the expected distribution
# id is the ID of the species in the HGMGS object
#
# Returns:
# A plot of all samples with their corresponding genes and mapped reads (log)


df_plot <- do.call(rbind, Map(data.frame, Reads = i.Reads, Genes = i.Genes, name = names(i.Genes)))
df_r_plot <- do.call(rbind, Map(data.frame, r.Reads = r.Reads, r.Genes = r.Genes, name = names(r.Genes)))


p = ggplot(log = "x", xlim = c(0.1, 1e5), cex = 0.8, pch = 25, ylim = c(0, n.genes),
data = df_plot, aes(text = name, y = Genes, x = Reads)) +
geom_point(size = 0.7, aes(color = "Initial")) +
stat_function(fun = function(Genes) (1 - ((n.genes - 1) / n.genes)^Genes) * n.genes, aes(Genes, color="Expected"), size=0.25, alpha=0.5) + #t he expected distribution
xlab('Reads mapped (log)') +
ylab('Signature genes detected') +
ggtitle(id) +
geom_point(data=df_r_plot, aes(x=r.Reads, y=r.Genes,color="Refined"), size=0.7)

plotly.data <- (p + scale_x_log10()+ scale_colour_manual(values = c("Initial" = cmcol1[2], "Refined"=cmcol1[5], "Expected"="black")))
plotly.data <- plotly.data + annotate("text",x=min(df_plot$Reads),y=max(df_plot$Genes),hjust=.2, label = paste("initial MSE:", round(i.mse, 2), "\n refined MSE:", round(r.mse, 2)))
plotly.data <- plotly.data + theme_minimal() + theme(legend.position="bottom")
return(plotly.data)
}


Cluster_gene_names <- c()
for(Cluster in names(Clusterlist)){
Cluster_gene_names <- c(Cluster_gene_names, rownames(Clusterlist[[Cluster]]))
}

present_genes <- GeneLengths[names(GeneLengths) %in% Cluster_gene_names]


pdf(snakemake@output[["plot_pdf"]])

for(Cluster in taxmat$V1){

if (length(screened_clusters[screened_clusters[,'id']==Cluster,]) > 1){

tax=taxmat[taxmat$V1==Cluster,7]
if (is.na(taxmat[taxmat$V1==Cluster,7])){
tax=taxmat[taxmat$V1==Cluster,6]}

colsum <- colSums(Clusterlist[[Cluster]][1:n.genes, ])
mapped <- colsum[colsum >= minimum_sampels]


refined_reads <- round(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$best, names(mapped)] /
+ (present_genes[rownames(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$best, ])] * 10^-3))

refined_countreads <- colSums(refined_reads)
refined_genes <- colSums(refined_reads > 0)

reads <- round(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$init,names(mapped)] /
+ (present_genes[rownames(Clusterlist[[Cluster]][screened_clusters[screened_clusters[,'id']==Cluster,]$genes$init, ])] * 10^-3))

countreads <- colSums(reads)
genes <- colSums(reads > 0)


p <- pois.two.plot(genes, countreads, as.numeric(screened_clusters[screened_clusters[,'id']==Cluster,]$mse$init), refined_genes, refined_countreads, as.numeric(screened_clusters[screened_clusters[,'id']==Cluster,]$MSE$best), paste(Cluster, tax))

print(p)

}}
dev.off()
23 changes: 22 additions & 1 deletion maginator/workflow/signature_genes.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ CLUSTERS = {x for x in CLUSTERS if x.isdigit()}

rule all:
input:
os.path.join(WD, 'abundance', 'abundance_phyloseq.RData')
os.path.join(WD, 'abundance', 'abundance_phyloseq.RData'),
os.path.join(WD, 'signature_genes', 'read-count_detected-genes.pdf')


# Identifying the refined sets of signature genes using the gene count matrix and gene lengths
Expand Down Expand Up @@ -76,3 +77,23 @@ rule abundance_profile:
runtime = '12:00:00'
script:
"scripts/abundance_profiles.R"


# Create figure for each MGS with expected readcounts and number of detected Signature Genes
rule gene_refinement_plots:
input:
gene_lengths = os.path.join(WD, 'signature_genes', 'gene_lengths.RDS'),
screened_clusters = expand(os.path.join(WD, 'signature_genes', 'screened', 'cluster_{cluster}_screened.RDS'), cluster=CLUSTERS),
clusters_sorted = os.path.join(WD, 'signature_genes', 'clusters_sorted.RDS'),
annotation = os.path.join(WD, 'tabs', 'metagenomicspecies.tab'),
tax_matrix = os.path.join(WD, 'tabs', 'tax_matrix.tsv')
output:
plot_pdf = os.path.join(WD, 'signature_genes', 'read-count_detected-genes.pdf')
conda:
"envs/signature_genes.yaml"
resources:
cores = 1,
memory = 80,
runtime = '12:00:00'
script:
"scripts/gene_refinement_plots.R"

0 comments on commit 4e6a3f7

Please sign in to comment.