Merge pull request #5 from sophie22/v0.0.2

v0.0.2
sophie22 · May 10, 2022 · a68d494 · a68d494
2 parents 14194d0 + f0c06e6
commit a68d494
Show file tree

Hide file tree

Showing 4 changed files with 107 additions and 49 deletions.
diff --git a/NGS148_34_139558_CB_CMCMD_S33_R1_001_suboptimal_genes_30x.tsv b/NGS148_34_139558_CB_CMCMD_S33_R1_001_suboptimal_genes_30x.tsv
@@ -0,0 +1,6 @@
+In sample NGS148, the following genes of panel CMCMD are not fully covered at 30x: 
+chromosome	startPos	endPos	GeneSymbol	Accession	genePercentage
+1	26126711	26142028	SELENON	NM_020451.2	94.93
+1	155580029	155583839	MSTO1	NM_018116.3	98.65
+2	152342263	152589624	NEB	NM_001271208.1	90.22
+X	64137652	64196194	ZC4H2	NM_018684.3	96.39
diff --git a/README.md b/README.md
@@ -15,14 +15,22 @@ Generate a report that lists any genes that have less than 100% coverage at 30x.
 - Ideally using python, write a script that takes the sambamba output and generates a report listing any genes that have less than 100% coverage at 30x
 - This script should be able to be applied to any gene panel
 
-# Version 0.0.1
-## What is required to run this script?
+### What is required to run this script?
 A new virtual environment was created using Python 3.8.0 and packages installed from the `requirements.txt`.
 The sambamba output file was converted to a tsv by replacing whitespaces with tabs by the following command: 
 
 `sed -e 's/ /\t/g' NGS148_34_139558_CB_CMCMD_S33_R1_001.sambamba_output.txt > NGS148_34_139558_CB_CMCMD_S33_R1_001.sambamba_output.tsv`
 
-## What does this script do?
+## Version 0.0.1
+### What does this script do?
 Using the `percentage30` column, regions with less than 100% coverage at 30x are identified and a list of unique gene symbols is saved to an output file.
 
 Command used to generate the example output: `python genes_coverage.py NGS148_34_139558_CB_CMCMD_S33_R1_001.sambamba_output.tsv`
+
+# Version 0.0.2
+## New in this version
+Calculate a combined percentage coverage value for each gene and identify genes with less than 100% coverage at 30x. Write information about genes with suboptimal coverage to an output file.
+Input file is now converted to tsv in the script, no need for separate command. Required depth is an input, defaults to 30 and script checks whether a corresponding column is present. Changed output filename to follow input filename.
+
+Command used to generate the example output: `python genes_coverage.py NGS148_34_139558_CB_CMCMD_S33_R1_001.sambamba_output.txt 30`
+
diff --git a/genes_coverage.py b/genes_coverage.py
@@ -1,31 +1,23 @@
+#!/usr/bin/python3
 # Python 3.8
+"""This script is used to identify poorly covered genes
+
+    Inputs:
+        * sambamba output file in tsv format
+        * optionally a coverage threshold value, which must have a
+        corresponding column name in the format of 'percentageX' where X is the
+        coverage threshold
+
+    Output:
+        * a file listing genes with less than 100% coverage
+"""
 
 # Import libraries/packages for use in the code
 import sys
+import subprocess
+from pathlib import Path
 import pandas as pd # v1.3.4
 
-### Read inputs from  the command line
-# sambamba output file (tsv)
-sambamba_file = sys.argv[1]
-# coverage threshold (default to 30)
-if len(sys.argv) > 2:
-    coverage_threshold = sys.argv[2]
-else:
-    coverage_threshold = "30"
-coverage_column = "percentage" + coverage_threshold
-
-### Load sambamba output file contents into a DataFrame
-sambamba_df = pd.read_csv(sambamba_file, sep='\t')
-# Calculate exon length by end-start position
-sambamba_df["ExonLength"] = sambamba_df["EndPosition"] - sambamba_df["StartPosition"]
-# Calculate number of bases above 30x coverage
-sambamba_df["AboveThreshold"] = sambamba_df[coverage_column] / 100 * sambamba_df["ExonLength"]
-
-# Identify unique genes
-panel_genes = sambamba_df["GeneSymbol;Accession"].unique().tolist()
-# Split 'GeneSymbol;Accession' into separate columns
-sambamba_df[["GeneSymbol", "Accession"]] = sambamba_df[
-    "GeneSymbol;Accession"].str.split(';', 1, expand=True)
 
 def genePCTcovered(df):
     """
@@ -43,33 +35,84 @@ def genePCTcovered(df):
     GeneSymbol = df["GeneSymbol"].to_list()[0]
     Accession = df["Accession"].to_list()[0]
 
-    # Calculate percentage coverage above threshold aacross all gene bases
+    # Calculate percentage coverage above threshold across all gene bases
     total_bases = df["ExonLength"].sum()
     bases_above_threshold = df["AboveThreshold"].sum()
     genePercentage = bases_above_threshold / total_bases * 100
 
     return([chromosome, startPos, endPos, GeneSymbol, Accession, genePercentage])
 
-### Calculate combined coverage of genes
-gene_coverage_dict = {}
-for i, gene in enumerate(panel_genes):
-    gene_df = sambamba_df.loc[(sambamba_df["GeneSymbol;Accession"] == gene)]
-    gene_coverage_dict[i] = genePCTcovered(gene_df)
-
-gene_coverage_df = pd.DataFrame.from_dict(gene_coverage_dict,
-                    orient='index', columns=["chromosome", "startPos",
-                    "endPos", "GeneSymbol", "Accession", "genePercentage"])
-# Round percentage values to 2 dp
-gene_coverage_df["genePercentage"] = gene_coverage_df["genePercentage"].round(2)
-
-### Identify unique genes with at least one exon with suboptimal coverage
-below_threshold_genes = gene_coverage_df[
-    gene_coverage_df["genePercentage"] < 100.00
-    ]
-
-### Write gene symbols with suboptimal coverage to file
-outfile = f"genes_suboptimal_coverage{coverage_threshold}x.txt"
-with open(outfile, 'w') as fh:
-    fh.write(f"The following genes had regions not fully covered at 30x: \n")
-    for gene in below_threshold_genes:
-        fh.write(gene + "\n")
+
+
+def main():
+    ### Read inputs from  the command line
+    # sambamba output file (txt)
+    sambamba_txt = sys.argv[1]
+    sambamba_filename = Path(sambamba_txt).stem
+    sambamba_tsv = sambamba_filename + ".tsv"
+    command = f"sed -e 's/ /\t/g' {sambamba_txt} > {sambamba_tsv}"
+    process = subprocess.call(command, shell=True)
+
+    # Parse sample and panel name from the input filename
+    sample_name = sambamba_filename.split("_")[0]
+    panel_name = sambamba_filename.split(".")[0].split("_")[4]
+
+    ### Load sambamba output file contents into a DataFrame
+    sambamba_df = pd.read_csv(sambamba_tsv, sep='\t')
+
+    # coverage threshold (default to 30)
+    if len(sys.argv) > 2:
+        coverage_threshold = sys.argv[2]
+    else:
+        coverage_threshold = "30"
+    coverage_column = "percentage" + coverage_threshold
+
+    try:
+        assert coverage_column in sambamba_df.columns, (
+            f"File has no {coverage_column} column, exiting!"
+        )
+        print("Sambamba file loaded correctly")
+    except AssertionError:
+        print(f"File has no {coverage_column} column, exiting!")
+        sys.exit(-1)
+
+    # Calculate exon length by end-start position
+    sambamba_df["ExonLength"] = sambamba_df["EndPosition"] - sambamba_df["StartPosition"]
+    # Calculate number of bases above 30x coverage
+    sambamba_df["AboveThreshold"] = sambamba_df[coverage_column] / 100 * sambamba_df["ExonLength"]
+
+    # Identify unique genes
+    panel_genes = sambamba_df["GeneSymbol;Accession"].unique().tolist()
+    # Split 'GeneSymbol;Accession' into separate columns
+    sambamba_df[["GeneSymbol", "Accession"]] = sambamba_df[
+        "GeneSymbol;Accession"].str.split(';', 1, expand=True)
+
+    ### Calculate combined coverage of genes
+    gene_coverage_dict = {}
+    for i, gene in enumerate(panel_genes):
+        gene_df = sambamba_df.loc[(sambamba_df["GeneSymbol;Accession"] == gene)]
+        gene_coverage_dict[i] = genePCTcovered(gene_df)
+
+    gene_coverage_df = pd.DataFrame.from_dict(gene_coverage_dict,
+                        orient='index', columns=["chromosome", "startPos",
+                        "endPos", "GeneSymbol", "Accession", "genePercentage"])
+    # Round percentage values to 2 dp
+    gene_coverage_df["genePercentage"] = gene_coverage_df["genePercentage"].round(2)
+
+    ### Identify unique genes with at least one exon with suboptimal coverage
+    below_threshold_genes = gene_coverage_df[
+        gene_coverage_df["genePercentage"] < 100.00
+        ]
+
+    ### Write gene symbols with suboptimal coverage to file
+    out_filename = sambamba_filename.rstrip(".sambamba_output")
+    outfile = out_filename + f"_suboptimal_genes_{coverage_threshold}x.tsv"
+    with open(outfile, 'w') as fh:
+        fh.write(f"In sample {sample_name}, the following genes of panel {panel_name} \
+are not fully covered at {coverage_threshold}x: \n")
+    below_threshold_genes.to_csv(outfile, sep="\t", index=False, mode='a')
+    count = len(below_threshold_genes)
+    print(f"Report file created listing {count} genes with suboptimal coverage")
+
+if __name__ == "__main__":
+    main()
diff --git a/genes_w_suboptimal_coverage30x.txt b/genes_w_suboptimal_coverage30x.txt
@@ -1,3 +1,4 @@
+In sample NGS148, the following genes  of panel CMCMD are not fully covered at 30x: 
 chromosome	startPos	endPos	GeneSymbol	Accession	genePercentage
 1	26126711	26142028	SELENON	NM_020451.2	94.93
 1	155580029	155583839	MSTO1	NM_018116.3	98.65