Skip to content

Commit

Permalink
Final version of msamtools branch
Browse files Browse the repository at this point in the history
  • Loading branch information
pabloati committed Jul 7, 2024
1 parent ec78eb1 commit d8460ac
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 8 deletions.
12 changes: 5 additions & 7 deletions maginator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
WORKFLOW_GTDBTK = os.path.join(_ROOT, 'workflow', 'gtdbtk.Snakefile')
WORKFLOW_PARSE_GTDBTK = os.path.join(_ROOT, 'workflow', 'parse_gtdbtk.Snakefile')
WORKFLOW_FILTER_GENE_CLUSTERS = os.path.join(_ROOT, 'workflow', 'filter_geneclusters.Snakefile')
WORKFLOW_SIGNATURE_READS = os.path.join(_ROOT, 'workflow', 'signature_reads.Snakefile')
WORKFLOW_GENE_COUNT_MAT = os.path.join(_ROOT, 'workflow', 'gene_count_mat.Snakefile')
WORKFLOW_PRESCREENING_GENES = os.path.join(_ROOT, 'workflow', 'prescreening_genes.Snakefile')
WORKFLOW_SIGNATURE_GENES = os.path.join(_ROOT, 'workflow', 'signature_genes.Snakefile')
Expand All @@ -29,7 +30,6 @@
WORKFLOW_PHYLO = os.path.join(_ROOT, 'workflow', 'phylo.Snakefile')
WORKFLOW_GENE_TAX = os.path.join(_ROOT, 'workflow', 'gene_tax.Snakefile')
WORKFLOW_BENCHMARK = os.path.join(_ROOT, 'workflow', 'benchmark.Snakefile')
WORKFLOW_SIGNATURE_READS = os.path.join(_ROOT, 'workflow', 'signature_reads.Snakefile')

def cli():

Expand Down Expand Up @@ -74,6 +74,7 @@ def cli():
app.add_argument('--min_length',help='Minimum number of aligned basepairs of a read to be included [%(default)s]', default=80, type=int)
app.add_argument('--min_identity',help='Minimum percentage of identity for a read to be included [%(default)s]', default=95, type=int)
app.add_argument('--min_map', help='Minimum percentage of mapped bases for a read to be included [%(default)s]', default=80, type=int)
app.add_argument('--multi',help='Method used by msamtools to treat multihit inserts [%(default)s]',default='proportional',type=str,choices=['proportional','ignore','all','equal'])
app.add_argument('--abundance_calculation', help='Method employed to calculate the absolute abundances [%(default)s]', default='ot_trun', type=str, choices=['sum', 'ot_trun','tt_trun'])
app.add_argument('--tail_percentage', help='Percentage range for the tail of the truncated mean or low_avg method [%(default)s]', default=10, type=float)
app.add_argument('--min_gtdb_markers', help='Minimum GTDBtk marker genes shared between MGS and outgroup for rooting trees [%(default)s]', default=10, type=int)
Expand All @@ -86,15 +87,14 @@ def cli():
app.add_argument('--min_nonN', help='Minimum fraction of non-N characters of a sample to be included in a phylogeny [%(default)s]', default=0.5, type=float)
app.add_argument('--min_marker_genes', help='Minimum marker genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=2, type=int)
app.add_argument('--min_signature_genes', help='Minimum signature genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=50, type=int)
app.add_argument('--af_cutoff', help='Cutoff for average median allele frequency of SG alignment for determining mixed/pure population of a MAG in a sample [%(default)s]', default=0, type=float)
app.add_argument('--phylo', help='Software for phylogeny inference. Either fast (fasttree) or slow and more accurate (iqtree) [%(default)s]', default='fasttree', type=str, choices=['fasttree', 'iqtree'])
app.add_argument('--tax_scope_threshold', help='Threshold for assigning the taxonomic scope of a gene cluster [%(default)s]', default=0.9, type=float)
app.add_argument('--synteny_adj_cutoff', help='Minimum number of times gene clusters should be adjacent to be included in synteny graph [%(default)s]', default=1, type=int)
app.add_argument('--synteny_mcl_inflation', help='Inflation parameter for mcl clustering of synteny graph. Usually between 1.2 and 5. Higher values produce smaller clusters [%(default)s]', default=5, type=float)

## Benchmarking
app.add_argument('--benchmark',help='Run MAGinator in benchmarking mode',action='store_true')
app.add_argument('--signature_reads',help='Run the signature reads profiling only',action='store_true')
app.add_argument('--multi',help='Method used by msamtools to treat multihit inserts [%(default)s]',default='proportional',type=str,choices=['proportional','ignore','all','equal'])
########## Workflow ##########
master = Controller(ap)

Expand Down Expand Up @@ -124,10 +124,8 @@ def cli():
# Signature genes
logging.info('Identifying signature genes')
logging.debug('Creating a gene count matrix of the readmappings')
if master.signature_reads:
wf.run(snakefile=WORKFLOW_SIGNATURE_READS)
else:
wf.run(snakefile=WORKFLOW_GENE_COUNT_MAT)
wf.run(snakefile=WORKFLOW_SIGNATURE_READS)
#wf.run(snakefile=WORKFLOW_GENE_COUNT_MAT)
logging.debug('Sorting the matrix to only contain genes, that do not cluster across the Metagenomic Species')
wf.run(snakefile=WORKFLOW_PRESCREENING_GENES)
logging.debug('Identifying the signature genes')
Expand Down
19 changes: 19 additions & 0 deletions maginator/workflow/pileup_parse.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,22 @@ rule parse:
script:
"scripts/mpileup.py"

rule allele_frequencies:
input:
stat=os.path.join(WD, 'phylo', 'stats.tab'),
gene=os.path.join(WD, 'phylo', 'stats_genes.tab')
output:
af_matrix=os.path.join(WD,'tabs','allele_frequencies.tab')
params:
af_cutoff = param_dict['af_cutoff'],
min_signature_genes = param_dict['min_signature_genes'],
min_nonN = param_dict['min_nonN'],
min_marker_genes = param_dict['min_marker_genes']
conda:
"envs/signature_genes.yaml"
resources:
cores=2,
mem_gb=190,
runtime='172800'
script:
"scripts/allele_frequency_mat.R"
59 changes: 59 additions & 0 deletions maginator/workflow/scripts/allele_frequency_mat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
library(dplyr)
gene_stats_file <- snakemake@input[["gene"]]
gene_stats <- read.table(file=gene_stats_file, header=TRUE)

stats_file <- snakemake@input[["stat"]]
stats <- read.table(file=stats_file, header=TRUE)

af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]])
min_signature_genes <- as.integer(snakemake@params[["min_signature_genes"]])
min_nonN <- as.numeric(snakemake@params[["min_nonN"]])
min_marker_genes <- as.integer(snakemake@params[["min_marker_genes"]])

# Identifying the samples in each tree fulfill the default tree-criteria
#(<50 detected SG, min 2 marker genes, min 50% non-n coverage)
filtered_data <- stats %>%
filter(Marker_N >= min_marker_genes, Signature_N>=min_signature_genes, NonN > min_nonN)

# Count the number of different samples for each cluster
sample_counts <- filtered_data %>%
group_by(Cluster) %>%
summarize(Count = n_distinct(Sample))

# Get unique clusters
unique_clusters <- unique(gene_stats$Cluster)
unique_samples <- unique(gene_stats$Sample)

# Create an empty matrix
empty_matrix <- matrix(NA, nrow = length(unique_samples), ncol = length(unique_clusters),
dimnames = list(unique_samples, unique_clusters))

# Loop through each cluster
for (cluster_id in unique_clusters) {
if ((dim(gene_stats[gene_stats$Cluster == cluster_id,]))[1]>af_cutoff){
# Subset data for the current cluster
cluster_data <- gene_stats[gene_stats$Cluster == cluster_id, ]
samples <- (filtered_data[filtered_data$Cluster==cluster_id, "Sample"])
absent_samples <- unique_samples[!unique_samples %in% samples]

# Calculate the median Signature_AF per sample for the current cluster
median_signature_af_per_sample <- cluster_data %>%
group_by(Sample) %>%
summarise(median_signature_af = median(Signature_AF, na.rm = TRUE))

# Convert median_signature_af_per_sample to a data frame
median_signature_af_per_sample_df <- as.data.frame(median_signature_af_per_sample)
# Replace values based on conditions
median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af > af_cutoff] <- 2
median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af == af_cutoff] <- 1

# Filling in the numbers in the matrix
empty_matrix[,cluster_id] = median_signature_af_per_sample_df$median_signature_af
empty_matrix[absent_samples,cluster_id] = 0 # if the sample is not in the tree - set NA as AF
}
}

AF_matrix <- empty_matrix

# Save the data frame to a CSV file
write.csv(AF_matrix, file = snakemake@output[["af_matrix"]], row.names = TRUE)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="maginator",
version="0.1.19",
version="0.1.20",
author="Jakob Russel & Trine Zachariasen",
author_email="russel2620@gmail.com,trine_zachariasen@hotmail.com",
description="MAGinator: Abundance, strain, and functional profiling of MAGs",
Expand Down

0 comments on commit d8460ac

Please sign in to comment.