Final version of msamtools branch

Russel88 · Jul 7, 2024 · d8460ac · d8460ac
1 parent ec78eb1
commit d8460ac
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 8 deletions.
diff --git a/maginator/main.py b/maginator/main.py
@@ -19,6 +19,7 @@
 WORKFLOW_GTDBTK = os.path.join(_ROOT, 'workflow', 'gtdbtk.Snakefile')
 WORKFLOW_PARSE_GTDBTK = os.path.join(_ROOT, 'workflow', 'parse_gtdbtk.Snakefile')
 WORKFLOW_FILTER_GENE_CLUSTERS = os.path.join(_ROOT, 'workflow', 'filter_geneclusters.Snakefile')
+WORKFLOW_SIGNATURE_READS = os.path.join(_ROOT, 'workflow', 'signature_reads.Snakefile')
 WORKFLOW_GENE_COUNT_MAT = os.path.join(_ROOT, 'workflow', 'gene_count_mat.Snakefile')
 WORKFLOW_PRESCREENING_GENES = os.path.join(_ROOT, 'workflow', 'prescreening_genes.Snakefile')
 WORKFLOW_SIGNATURE_GENES = os.path.join(_ROOT, 'workflow', 'signature_genes.Snakefile')
@@ -29,7 +30,6 @@
 WORKFLOW_PHYLO = os.path.join(_ROOT, 'workflow', 'phylo.Snakefile')
 WORKFLOW_GENE_TAX = os.path.join(_ROOT, 'workflow', 'gene_tax.Snakefile')
 WORKFLOW_BENCHMARK = os.path.join(_ROOT, 'workflow', 'benchmark.Snakefile')
-WORKFLOW_SIGNATURE_READS = os.path.join(_ROOT, 'workflow', 'signature_reads.Snakefile')
 
 def cli():
 
@@ -74,6 +74,7 @@ def cli():
     app.add_argument('--min_length',help='Minimum number of aligned basepairs of a read to be included [%(default)s]', default=80, type=int)
     app.add_argument('--min_identity',help='Minimum percentage of identity for a read to be included [%(default)s]', default=95, type=int)	
     app.add_argument('--min_map', help='Minimum percentage of mapped bases for a read to be included [%(default)s]', default=80, type=int)
+    app.add_argument('--multi',help='Method used by msamtools to treat multihit inserts [%(default)s]',default='proportional',type=str,choices=['proportional','ignore','all','equal'])
     app.add_argument('--abundance_calculation', help='Method employed to calculate the absolute abundances [%(default)s]', default='ot_trun', type=str, choices=['sum', 'ot_trun','tt_trun'])
     app.add_argument('--tail_percentage', help='Percentage range for the tail of the truncated mean or low_avg method [%(default)s]', default=10, type=float)
     app.add_argument('--min_gtdb_markers', help='Minimum GTDBtk marker genes shared between MGS and outgroup for rooting trees [%(default)s]', default=10, type=int)
@@ -86,15 +87,14 @@ def cli():
     app.add_argument('--min_nonN', help='Minimum fraction of non-N characters of a sample to be included in a phylogeny [%(default)s]', default=0.5, type=float)
     app.add_argument('--min_marker_genes', help='Minimum marker genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=2, type=int)
     app.add_argument('--min_signature_genes', help='Minimum signature genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=50, type=int)
+    app.add_argument('--af_cutoff', help='Cutoff for average median allele frequency of SG alignment for determining mixed/pure population of a MAG in a sample [%(default)s]', default=0, type=float)
     app.add_argument('--phylo', help='Software for phylogeny inference. Either fast (fasttree) or slow and more accurate (iqtree) [%(default)s]', default='fasttree', type=str, choices=['fasttree', 'iqtree'])
     app.add_argument('--tax_scope_threshold', help='Threshold for assigning the taxonomic scope of a gene cluster [%(default)s]', default=0.9, type=float)
     app.add_argument('--synteny_adj_cutoff', help='Minimum number of times gene clusters should be adjacent to be included in synteny graph [%(default)s]', default=1, type=int)
     app.add_argument('--synteny_mcl_inflation', help='Inflation parameter for mcl clustering of synteny graph. Usually between 1.2 and 5. Higher values produce smaller clusters [%(default)s]', default=5, type=float)
 
     ## Benchmarking
     app.add_argument('--benchmark',help='Run MAGinator in benchmarking mode',action='store_true')
-    app.add_argument('--signature_reads',help='Run the signature reads profiling only',action='store_true')
-    app.add_argument('--multi',help='Method used by msamtools to treat multihit inserts [%(default)s]',default='proportional',type=str,choices=['proportional','ignore','all','equal'])
     ########## Workflow ##########
     master = Controller(ap)
 
@@ -124,10 +124,8 @@ def cli():
         # Signature genes
         logging.info('Identifying signature genes')
         logging.debug('Creating a gene count matrix of the readmappings')
-        if master.signature_reads:
-            wf.run(snakefile=WORKFLOW_SIGNATURE_READS)
-        else:
-            wf.run(snakefile=WORKFLOW_GENE_COUNT_MAT)
+        wf.run(snakefile=WORKFLOW_SIGNATURE_READS)
+        #wf.run(snakefile=WORKFLOW_GENE_COUNT_MAT)
         logging.debug('Sorting the matrix to only contain genes, that do not cluster across the Metagenomic Species')
         wf.run(snakefile=WORKFLOW_PRESCREENING_GENES)
         logging.debug('Identifying the signature genes')

diff --git a/maginator/workflow/pileup_parse.Snakefile b/maginator/workflow/pileup_parse.Snakefile
@@ -42,3 +42,22 @@ rule parse:
     script:
         "scripts/mpileup.py"
 
+rule allele_frequencies:
+    input:
+        stat=os.path.join(WD, 'phylo', 'stats.tab'),
+        gene=os.path.join(WD, 'phylo', 'stats_genes.tab')
+    output:
+        af_matrix=os.path.join(WD,'tabs','allele_frequencies.tab')
+    params:
+        af_cutoff = param_dict['af_cutoff'],
+        min_signature_genes = param_dict['min_signature_genes'],
+        min_nonN = param_dict['min_nonN'],
+        min_marker_genes = param_dict['min_marker_genes']
+    conda:
+        "envs/signature_genes.yaml"
+    resources:
+        cores=2,
+        mem_gb=190,
+        runtime='172800'
+    script:
+        "scripts/allele_frequency_mat.R"
diff --git a/maginator/workflow/scripts/allele_frequency_mat.R b/maginator/workflow/scripts/allele_frequency_mat.R
@@ -0,0 +1,59 @@
+library(dplyr)
+gene_stats_file <- snakemake@input[["gene"]]
+gene_stats <- read.table(file=gene_stats_file, header=TRUE)
+
+stats_file <- snakemake@input[["stat"]]
+stats <- read.table(file=stats_file, header=TRUE)
+
+af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]])
+min_signature_genes <- as.integer(snakemake@params[["min_signature_genes"]])
+min_nonN <- as.numeric(snakemake@params[["min_nonN"]])
+min_marker_genes <- as.integer(snakemake@params[["min_marker_genes"]])
+
+# Identifying the samples in each tree fulfill the default tree-criteria
+#(<50 detected SG, min 2 marker genes, min 50% non-n coverage)
+filtered_data <- stats %>%
+  filter(Marker_N >= min_marker_genes, Signature_N>=min_signature_genes, NonN > min_nonN)
+
+# Count the number of different samples for each cluster
+sample_counts <- filtered_data %>%
+  group_by(Cluster) %>%
+  summarize(Count = n_distinct(Sample))
+
+# Get unique clusters
+unique_clusters <- unique(gene_stats$Cluster)
+unique_samples <- unique(gene_stats$Sample)
+
+# Create an empty matrix
+empty_matrix <- matrix(NA, nrow = length(unique_samples), ncol = length(unique_clusters),
+                       dimnames = list(unique_samples, unique_clusters))
+
+# Loop through each cluster
+for (cluster_id in unique_clusters) {
+  if ((dim(gene_stats[gene_stats$Cluster == cluster_id,]))[1]>af_cutoff){
+    # Subset data for the current cluster
+    cluster_data <- gene_stats[gene_stats$Cluster == cluster_id, ]
+    samples <- (filtered_data[filtered_data$Cluster==cluster_id, "Sample"])
+    absent_samples <- unique_samples[!unique_samples %in% samples]
+
+    # Calculate the median Signature_AF per sample for the current cluster
+    median_signature_af_per_sample <- cluster_data %>%
+      group_by(Sample) %>%
+      summarise(median_signature_af = median(Signature_AF, na.rm = TRUE))
+
+    # Convert median_signature_af_per_sample to a data frame
+    median_signature_af_per_sample_df <- as.data.frame(median_signature_af_per_sample)
+    # Replace values based on conditions
+    median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af > af_cutoff] <- 2
+    median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af == af_cutoff] <- 1
+
+    # Filling in the numbers in the matrix
+    empty_matrix[,cluster_id] = median_signature_af_per_sample_df$median_signature_af
+    empty_matrix[absent_samples,cluster_id] = 0 # if the sample is not in the tree - set NA as AF
+  }
+}
+
+AF_matrix <- empty_matrix
+
+# Save the data frame to a CSV file
+write.csv(AF_matrix, file = snakemake@output[["af_matrix"]], row.names = TRUE)
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="maginator", 
-    version="0.1.19",
+    version="0.1.20",
     author="Jakob Russel & Trine Zachariasen",
     author_email="russel2620@gmail.com,trine_zachariasen@hotmail.com",
     description="MAGinator: Abundance, strain, and functional profiling of MAGs",