Skip to content

Commit

Permalink
Merge pull request #2 from sophie22/v0.0.1
Browse files Browse the repository at this point in the history
v0.0.1 - first working version
  • Loading branch information
sophie22 authored May 7, 2022
2 parents 8aa983c + 2ecc800 commit 591b28e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
34 changes: 34 additions & 0 deletions genes_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Python 3.8

# Import libraries/packages for use in the code
import sys
from numpy import outer
import pandas as pd # v1.3.4

### Read inputs from the command line
def parse_args(argv):
    """Extract the input path and coverage threshold from *argv*.

    Args:
        argv: Argument list laid out like ``sys.argv``:
            ``[script, sambamba_file, optional coverage_threshold]``.

    Returns:
        A ``(sambamba_file, coverage_threshold)`` tuple of strings. The
        threshold defaults to ``"30"`` when it is not supplied.

    Raises:
        SystemExit: With a usage message when no input file is given.
    """
    if len(argv) < 2:
        script = argv[0] if argv else "genes_coverage.py"
        sys.exit(f"Usage: python {script} <sambamba_output.tsv> "
                 "[coverage_threshold]")
    # sambamba output file (tsv)
    sambamba_file = argv[1]
    # coverage threshold (default to 30); kept as a string because it is
    # only ever used to build a column name and the output file name
    coverage_threshold = argv[2] if len(argv) > 2 else "30"
    return sambamba_file, coverage_threshold


### Identify unique genes with at least one exon with suboptimal coverage
def genes_below_threshold(sambamba_df, coverage_column):
    """Return gene symbols having a row below 100% in *coverage_column*.

    Args:
        sambamba_df: DataFrame holding a ``"GeneSymbol;Accession"`` column
            and the coverage column. It is modified in place: separate
            ``"GeneSymbol"`` and ``"Accession"`` columns are added.
        coverage_column: Name of the percentage column to test, e.g.
            ``"percentage30"``.

    Returns:
        List of unique gene symbols, in order of first appearance, for
        which at least one row has a value below 100.0.

    Raises:
        KeyError: If *coverage_column* is not a column of *sambamba_df*.
    """
    if coverage_column not in sambamba_df.columns:
        raise KeyError(
            f"Column '{coverage_column}' not found in the input; check that "
            "the coverage threshold matches the one used to generate it"
        )

    # Split 'GeneSymbol;Accession' into separate columns. `n` must be passed
    # by keyword: the positional form is rejected by pandas >= 2.0.
    sambamba_df[["GeneSymbol", "Accession"]] = sambamba_df[
        "GeneSymbol;Accession"].str.split(';', n=1, expand=True)

    ### Identify exons with less than 100% coverage at the chosen threshold
    below_threshold_exons_df = sambamba_df[sambamba_df[coverage_column] < 100.0]

    return below_threshold_exons_df["GeneSymbol"].unique().tolist()


def main(argv=None):
    """Run the report: read the sambamba TSV and write the gene list.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.

    Writes ``genes_suboptimal_coverage<threshold>x.txt`` in the current
    working directory, one gene symbol per line.
    """
    if argv is None:
        argv = sys.argv
    sambamba_file, coverage_threshold = parse_args(argv)
    coverage_column = "percentage" + coverage_threshold

    ### Load sambamba output file contents into a DataFrame
    sambamba_df = pd.read_csv(sambamba_file, sep='\t')

    below_threshold_genes = genes_below_threshold(sambamba_df, coverage_column)

    ### Write gene symbols with suboptimal coverage to file
    outfile = f"genes_suboptimal_coverage{coverage_threshold}x.txt"
    with open(outfile, 'w') as fh:
        fh.writelines(gene + "\n" for gene in below_threshold_genes)


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions genes_suboptimal_coverage30x.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
SELENON
MSTO1
NEB
ZC4H2

0 comments on commit 591b28e

Please sign in to comment.