Merge branch 'v0.0.2' into main

sophie22 · May 7, 2022 · 14194d0 · 14194d0
2 parents 400e1a6 + b877241
commit 14194d0
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,6 @@ Generate a report that lists any genes that have less than 100% coverage at 30x.
 - Ideally using python, write a script that takes the sambamba output and generates a report listing any genes that have less than 100% coverage at 30x
 - This script should be able to be applied to any gene panel
 
-
 # Version 0.0.1
 ## What is required to run this script?
 A new virtual environment was created using Python 3.8.0 and packages installed from the `requirements.txt`.

diff --git a/genes_coverage.py b/genes_coverage.py
@@ -16,15 +16,56 @@
 
 ### Load sambamba output file contents into a DataFrame
 sambamba_df = pd.read_csv(sambamba_file, sep='\t')
+# Calculate exon length by end-start position
+sambamba_df["ExonLength"] = sambamba_df["EndPosition"] - sambamba_df["StartPosition"]
+# Calculate number of bases above 30x coverage
+sambamba_df["AboveThreshold"] = sambamba_df[coverage_column] / 100 * sambamba_df["ExonLength"]
+
+# Identify unique genes
+panel_genes = sambamba_df["GeneSymbol;Accession"].unique().tolist()
 # Split 'GeneSymbol;Accession' into separate columns
 sambamba_df[["GeneSymbol", "Accession"]] = sambamba_df[
     "GeneSymbol;Accession"].str.split(';', 1, expand=True)
 
-### Identify exons with less than 100% coverage at 30x
-below_threshold_exons_df = sambamba_df[sambamba_df[coverage_column] < 100.0]
+def genePCTcovered(df):
+    """
+        Args: DataFrame with at least the following columns
+            - 'ExonLength': which is the number of bases in the exon
+            - 'AboveThreshold': which is the number of bases with reads
+                above the threshold
+
+        Returns: list of chromosome, startPos, endPos, GeneSymbol, Accession
+            and the calculated genePercentage
+    """
+    chromosome = df["#chromosome"].to_list()[0]
+    startPos = df["StartPosition"].to_list()[0]
+    endPos = df["StartPosition"].to_list()[-1]
+    GeneSymbol = df["GeneSymbol"].to_list()[0]
+    Accession = df["Accession"].to_list()[0]
+
+    # Calculate percentage coverage above threshold aacross all gene bases
+    total_bases = df["ExonLength"].sum()
+    bases_above_threshold = df["AboveThreshold"].sum()
+    genePercentage = bases_above_threshold / total_bases * 100
+
+    return([chromosome, startPos, endPos, GeneSymbol, Accession, genePercentage])
+
+### Calculate combined coverage of genes
+gene_coverage_dict = {}
+for i, gene in enumerate(panel_genes):
+    gene_df = sambamba_df.loc[(sambamba_df["GeneSymbol;Accession"] == gene)]
+    gene_coverage_dict[i] = genePCTcovered(gene_df)
+
+gene_coverage_df = pd.DataFrame.from_dict(gene_coverage_dict,
+                    orient='index', columns=["chromosome", "startPos",
+                    "endPos", "GeneSymbol", "Accession", "genePercentage"])
+# Round percentage values to 2 dp
+gene_coverage_df["genePercentage"] = gene_coverage_df["genePercentage"].round(2)
 
 ### Identify unique genes with at least one exon with suboptimal coverage
-below_threshold_genes = below_threshold_exons_df["GeneSymbol"].unique().tolist()
+below_threshold_genes = gene_coverage_df[
+    gene_coverage_df["genePercentage"] < 100.00
+    ]
 
 ### Write gene symbols with suboptimal coverage to file
 outfile = f"genes_suboptimal_coverage{coverage_threshold}x.txt"

diff --git a/genes_w_suboptimal_coverage30x.txt b/genes_w_suboptimal_coverage30x.txt
@@ -0,0 +1,5 @@
+chromosome	startPos	endPos	GeneSymbol	Accession	genePercentage
+1	26126711	26142028	SELENON	NM_020451.2	94.93
+1	155580029	155583839	MSTO1	NM_018116.3	98.65
+2	152342263	152589624	NEB	NM_001271208.1	90.22
+X	64137652	64196194	ZC4H2	NM_018684.3	96.39