Skip to content

Commit

Permalink
Merge branch 'v0.0.2' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
sophie22 authored May 7, 2022
2 parents 400e1a6 + b877241 commit 14194d0
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 4 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ Generate a report that lists any genes that have less than 100% coverage at 30x.
- Ideally using python, write a script that takes the sambamba output and generates a report listing any genes that have less than 100% coverage at 30x
- This script should be able to be applied to any gene panel


# Version 0.0.1
## What is required to run this script?
A new virtual environment was created using Python 3.8.0 and packages installed from the `requirements.txt`.
Expand Down
47 changes: 44 additions & 3 deletions genes_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,56 @@

### Load sambamba output file contents into a DataFrame
sambamba_df = pd.read_csv(sambamba_file, sep='\t')
# Calculate exon length by end-start position
sambamba_df["ExonLength"] = sambamba_df["EndPosition"] - sambamba_df["StartPosition"]
# Calculate number of bases above 30x coverage
sambamba_df["AboveThreshold"] = sambamba_df[coverage_column] / 100 * sambamba_df["ExonLength"]

# Identify unique genes
panel_genes = sambamba_df["GeneSymbol;Accession"].unique().tolist()
# Split 'GeneSymbol;Accession' into separate columns
sambamba_df[["GeneSymbol", "Accession"]] = sambamba_df[
"GeneSymbol;Accession"].str.split(';', 1, expand=True)

### Identify exons with less than 100% coverage at 30x
below_threshold_exons_df = sambamba_df[sambamba_df[coverage_column] < 100.0]
def genePCTcovered(df):
"""
Args: DataFrame with at least the following columns
- 'ExonLength': which is the number of bases in the exon
- 'AboveThreshold': which is the number of bases with reads
above the threshold
Returns: list of chromosome, startPos, endPos, GeneSymbol, Accession
and the calculated genePercentage
"""
chromosome = df["#chromosome"].to_list()[0]
startPos = df["StartPosition"].to_list()[0]
endPos = df["StartPosition"].to_list()[-1]
GeneSymbol = df["GeneSymbol"].to_list()[0]
Accession = df["Accession"].to_list()[0]

# Calculate percentage coverage above threshold aacross all gene bases
total_bases = df["ExonLength"].sum()
bases_above_threshold = df["AboveThreshold"].sum()
genePercentage = bases_above_threshold / total_bases * 100

return([chromosome, startPos, endPos, GeneSymbol, Accession, genePercentage])

### Calculate combined coverage of genes
gene_coverage_dict = {}
for i, gene in enumerate(panel_genes):
gene_df = sambamba_df.loc[(sambamba_df["GeneSymbol;Accession"] == gene)]
gene_coverage_dict[i] = genePCTcovered(gene_df)

gene_coverage_df = pd.DataFrame.from_dict(gene_coverage_dict,
orient='index', columns=["chromosome", "startPos",
"endPos", "GeneSymbol", "Accession", "genePercentage"])
# Round percentage values to 2 dp
gene_coverage_df["genePercentage"] = gene_coverage_df["genePercentage"].round(2)

### Identify unique genes with at least one exon with suboptimal coverage
below_threshold_genes = below_threshold_exons_df["GeneSymbol"].unique().tolist()
below_threshold_genes = gene_coverage_df[
gene_coverage_df["genePercentage"] < 100.00
]

### Write gene symbols with suboptimal coverage to file
outfile = f"genes_suboptimal_coverage{coverage_threshold}x.txt"
Expand Down
5 changes: 5 additions & 0 deletions genes_w_suboptimal_coverage30x.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
chromosome startPos endPos GeneSymbol Accession genePercentage
1 26126711 26142028 SELENON NM_020451.2 94.93
1 155580029 155583839 MSTO1 NM_018116.3 98.65
2 152342263 152589624 NEB NM_001271208.1 90.22
X 64137652 64196194 ZC4H2 NM_018684.3 96.39

0 comments on commit 14194d0

Please sign in to comment.