Skip to content
This repository was archived by the owner on Jul 9, 2019. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ alembic==0.8.2
psycopg2==2.6.1
PyVCF==0.6.8
pysam==0.9.0
numpy==1.11.2
pyfasta==0.5.2
57 changes: 21 additions & 36 deletions utils/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import os
import sys
from datetime import datetime
from pyfasta import Fasta
try:
from collections import OrderedDict
except ImportError:
Expand Down Expand Up @@ -62,43 +63,27 @@ def __init__(self):
self["GT"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": "P"}
self["PS"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": 1}

def verifyFasta(REFERENCE_FASTA):
"""
Checks if the path exists and opens it to make sure that it is a valid reference file
"""
if not os.path.exists(REFERENCE_FASTA):
raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA))
# Check if the library can open the file correctly
f = Fasta(REFERENCE_FASTA)

def getReference(assembly, chromosome, start, end):
"""
Gets the Reference string from rest.ensemble.org
"""
# Ensemble takes MT and not M
if(chromosome == "M"):
chromosome = "MT"
server = "http://rest.ensembl.org"
request = "/sequence/region/human/" + chromosome + ":" + \
str(start) + ".." + str(end) + ":1?coord_system_version=" + assembly

nRetires = NUM_RETRIES
r = None
while(nRetires):
bFail = False
try:
r = requests.get(server + request,
headers={"Content-Type": "text/plain"})
except Exception as e:
bFail = True

if r is None or not r.ok or bFail:
nRetires -= 1
bFail = False

# Use a sleep timer if we are failing
import time
time.sleep(nRetires)
continue
else:
break
if r is None or not r.ok or bFail:
print server + request
r.raise_for_status()

return r.text
def getReference(assembly, chromosome, start, end, REFERENCE_FASTA):
"""
Gets the Reference string from the reference file that was provided
"""
if not os.path.exists(REFERENCE_FASTA):
raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA))
f = Fasta(REFERENCE_FASTA)
if not chromosome.startswith('chr'):
chromosome = 'chr' + chromosome
if(chromosome == 'chrMT'):
chromosome = 'chrM'
return f[chromosome][start-1:end].upper()


def getFileName(inFile, splitStr=None):
Expand Down
8 changes: 6 additions & 2 deletions utils/maf2tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,11 @@
required=False,
type=str,
help="Loader JSON to load data into Tile DB.")
parser.add_argument('-r', '--reference_fasta', required=True, type=str,
help='reference fasta file to get the reference Allele information')

args = parser.parse_args()
helper.verifyFasta(args.reference_fasta)

if args.spark:
# call spark from within import script
Expand All @@ -101,7 +104,8 @@
"maf_pyspark.py",
"-c", args.config,
"-d", args.outputdir,
"-o", args.output,
"-o", args.output,
"-r", args.reference_fasta,
"-i"]

spark_cmd.extend(args.inputs)
Expand All @@ -118,7 +122,7 @@
raise Exception("Error running converter\n\nERROR: \n{}".format(error))

else:

multiprocess_import.REFERENCE_FASTA = args.reference_fasta
multiprocess_import.parallelGen(
args.config,
args.inputs,
Expand Down
6 changes: 3 additions & 3 deletions utils/maf_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from utils.configuration import ConfigReader

CSVLine = csvline.CSVLine

REFERENCE_FASTA = None

class MAF(File2Tile):

Expand Down Expand Up @@ -264,7 +264,7 @@ def writeCSVLine(self):
end = start

ref = helper.getReference(assembly, chromosome, start,
start)
start, REFERENCE_FASTA)
self.prev_TileDBValues['REF'] = ref
index = 0
for value in self.prev_TileDBValues['ALT']:
Expand All @@ -288,7 +288,7 @@ def writeCSVLine(self):
end = self.prev_ChromosomePosition[IDX.CHR_END]

ref = helper.getReference(assembly, chromosome, start,
start)
start, REFERENCE_FASTA)
self.prev_TileDBValues['REF'] = ref \
+ self.prev_TileDBValues['REF']
index = 0
Expand Down
9 changes: 7 additions & 2 deletions utils/maf_pyspark.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class CONST(IDX):
INDEX = 1
PLOIDY = 0
END_IDX = 0

REFERENCE_FASTA = None


class MAF_Spark:
Expand Down Expand Up @@ -587,7 +589,7 @@ def updateRefAltPos(iter):
end = start

newRef = helper.getReference(assembly, chromosome,
start, start)
start, start, CONST.REFERENCE_FASTA)
ref = newRef

index = 0
Expand All @@ -604,7 +606,7 @@ def updateRefAltPos(iter):
if bFlag:
start = start - 1
newRef = helper.getReference(assembly, chromosome,
start, start)
start, start, CONST.REFERENCE_FASTA)
ref = newRef + ref
index = 0
for value in alt:
Expand Down Expand Up @@ -749,8 +751,11 @@ def parallelGen(

parser.add_argument('-l', '--loader', required=False, type=str,
help='Loader JSON to load data into Tile DB.')
parser.add_argument('-r', '--reference_fasta', required=True, type=str,
help='reference fasta file to get the reference Allele information')

args = parser.parse_args()
CONST.REFERENCE_FASTA = args.reference_fasta

parallelGen(
args.config,
Expand Down
49 changes: 33 additions & 16 deletions utils/test/test_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,43 @@

import pytest
import gzip
from mock import patch
from unittest import TestCase
import utils.helper as helper

class TestReferenceFasta(TestCase):

def test_getReference():
assert helper.getReference("GRCh37", "1", 100, 101) != ""
assert helper.getReference("GRCh37", "M", 1, 2) == "GA"


def raiseException():
raise Exception("Test")


@patch('requests.get', side_effect=raiseException)
@patch('utils.helper.NUM_RETRIES', 2)
def test_getReference_neg(patched_fn):
with pytest.raises(Exception) as exec_info:
helper.getReference("GRCh37", "1", 100, 101)
assert patched_fn.call_count == 2
@classmethod
def setUpClass(self):
self.fasta = """>chr1
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA
CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
>chrM
GANN
"""

@pytest.fixture(autouse=True)
def set_tmpdir(self, tmpdir):
self.tmpdir = tmpdir

def test_getReference(self):
fastafile = self.tmpdir.join('test.fasta')
fastafile.write(self.fasta)
assert helper.verifyFasta(str(fastafile)) is None
assert helper.getReference("GRCh37", "1", 100, 101, str(fastafile)) != ""
assert helper.getReference("GRCh37", "M", 1, 2, str(fastafile)) == "GA"
assert helper.getReference("GRCh37", "MT", 1, 2, str(fastafile)) == "GA"

def test_getReference_neg(self):
with pytest.raises(Exception) as exec_info:
helper.getReference("GRCh37", "1", 100, 101, '/tmp/missing.fasta')
assert 'is invalid' in str(exec_info.value)

def test_verifyFasta_neg(self):
fastafile = self.tmpdir.join('test_neg.fasta')
with pytest.raises(Exception) as exec_info:
helper.verifyFasta(str(fastafile))
assert 'is invalid' in str(exec_info.value)

def test_printers():
helper.log("test")
Expand Down
12 changes: 12 additions & 0 deletions utils/test/test_maf2tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ class TestMAF(unittest.TestCase):

@classmethod
def setUpClass(self):
self.fasta = """>chr1
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA
CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
>chrM
GANN
"""
self.TESTDB_URI = "postgresql+psycopg2://@:5432/mafend2end"
if not database_exists(self.TESTDB_URI):
create_database(self.TESTDB_URI)
Expand Down Expand Up @@ -230,6 +237,11 @@ def test_end2end(self):
fp.write(json.dumps(config_json))

test_output_dir = self.tmpdir.mkdir("output")

ref_fasta = self.tmpdir.join('test_ref.fasta')
ref_fasta.write(self.fasta)

imp.multiprocess_import.REFERENCE_FASTA = str(ref_fasta)

imp.multiprocess_import.poolGenerateCSV((str(test_config), str(
input_file), str(test_output_dir) + "/" + "out.txt", False))
Expand Down