Intel-HLS · cpjagan · Nov 7, 2016 · Nov 7, 2016 · Nov 8, 2016 · Nov 8, 2016
diff --git a/requirements.txt b/requirements.txt
@@ -26,3 +26,5 @@ alembic==0.8.2
 psycopg2==2.6.1
 PyVCF==0.6.8
 pysam==0.9.0
+numpy==1.11.2
+pyfasta==0.5.2
diff --git a/utils/helper.py b/utils/helper.py
@@ -26,6 +26,7 @@
 import os
 import sys
 from datetime import datetime
+from pyfasta import Fasta
 try:
     from collections import OrderedDict
 except ImportError:
@@ -62,43 +63,27 @@ def __init__(self):
         self["GT"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": "P"}
         self["PS"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": 1}
 
+def verifyFasta(REFERENCE_FASTA):
+    """
+    Checks if the path exists and opens it to make sure that it is a valid reference file
+    """
+    if not os.path.exists(REFERENCE_FASTA):
+        raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA))
+    # Check if the library can open the file correctly
+    f = Fasta(REFERENCE_FASTA)
 
-def getReference(assembly, chromosome, start, end):
-    """
-    Gets the Reference string from rest.ensemble.org
-    """
-    # Ensemble takes MT and not M
-    if(chromosome == "M"):
-        chromosome = "MT"
-    server = "http://rest.ensembl.org"
-    request = "/sequence/region/human/" + chromosome + ":" + \
-        str(start) + ".." + str(end) + ":1?coord_system_version=" + assembly
-
-    nRetires = NUM_RETRIES
-    r = None
-    while(nRetires):
-        bFail = False
-        try:
-            r = requests.get(server + request,
-                             headers={"Content-Type": "text/plain"})
-        except Exception as e:
-            bFail = True
-
-        if r is None or not r.ok or bFail:
-            nRetires -= 1
-            bFail = False
-
-            # Use a sleep timer if we are failing
-            import time
-            time.sleep(nRetires)
-            continue
-        else:
-            break
-    if r is None or not r.ok or bFail:
-        print server + request
-        r.raise_for_status()
-
-    return r.text
+def getReference(assembly, chromosome, start, end, REFERENCE_FASTA):
+    """
+    Gets the Reference string from the reference file that was provided
+    """
+    if not os.path.exists(REFERENCE_FASTA):
+        raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA))
+    f = Fasta(REFERENCE_FASTA)
+    if not chromosome.startswith('chr'):
+        chromosome = 'chr' + chromosome
+    if(chromosome == 'chrMT'):
+        chromosome = 'chrM'
+    return f[chromosome][start-1:end].upper()
 
 
 def getFileName(inFile, splitStr=None):

diff --git a/utils/maf2tile.py b/utils/maf2tile.py
@@ -88,8 +88,11 @@
         required=False,
         type=str,
         help="Loader JSON to load data into Tile DB.")
+    parser.add_argument('-r', '--reference_fasta', required=True, type=str,
+                        help='reference fasta file to get the reference Allele information')
 
     args = parser.parse_args()
+    helper.verifyFasta(args.reference_fasta)
 
     if args.spark:
         # call spark from within import script
@@ -101,7 +104,8 @@
             "maf_pyspark.py", 
             "-c", args.config, 
             "-d", args.outputdir, 
-            "-o", args.output, 
+            "-o", args.output,
+            "-r", args.reference_fasta, 
             "-i"]
 
         spark_cmd.extend(args.inputs)
@@ -118,7 +122,7 @@
             raise Exception("Error running converter\n\nERROR: \n{}".format(error))
 
     else:
-
+        multiprocess_import.REFERENCE_FASTA = args.reference_fasta
         multiprocess_import.parallelGen(
             args.config,
             args.inputs,

diff --git a/utils/maf_importer.py b/utils/maf_importer.py
@@ -40,7 +40,7 @@
 from utils.configuration import ConfigReader
 
 CSVLine = csvline.CSVLine
-
+REFERENCE_FASTA = None
 
 class MAF(File2Tile):
 
@@ -264,7 +264,7 @@ def writeCSVLine(self):
             end = start
 
             ref = helper.getReference(assembly, chromosome, start,
-                    start)
+                    start, REFERENCE_FASTA)
             self.prev_TileDBValues['REF'] = ref
             index = 0
             for value in self.prev_TileDBValues['ALT']:
@@ -288,7 +288,7 @@ def writeCSVLine(self):
                 end = self.prev_ChromosomePosition[IDX.CHR_END]
 
                 ref = helper.getReference(assembly, chromosome, start,
-                        start)
+                        start, REFERENCE_FASTA)
                 self.prev_TileDBValues['REF'] = ref \
                     + self.prev_TileDBValues['REF']
                 index = 0

diff --git a/utils/maf_pyspark.py b/utils/maf_pyspark.py
@@ -56,6 +56,8 @@ class CONST(IDX):
     INDEX = 1
     PLOIDY = 0
     END_IDX = 0
+
+    REFERENCE_FASTA = None
 
 
 class MAF_Spark:
@@ -587,7 +589,7 @@ def updateRefAltPos(iter):
                 end = start
 
                 newRef = helper.getReference(assembly, chromosome,
-                        start, start)
+                        start, start, CONST.REFERENCE_FASTA)
                 ref = newRef
 
                 index = 0
@@ -604,7 +606,7 @@ def updateRefAltPos(iter):
                 if bFlag:
                     start = start - 1
                     newRef = helper.getReference(assembly, chromosome,
-                            start, start)
+                            start, start, CONST.REFERENCE_FASTA)
                     ref = newRef + ref
                     index = 0
                     for value in alt:
@@ -749,8 +751,11 @@ def parallelGen(
 
     parser.add_argument('-l', '--loader', required=False, type=str,
                         help='Loader JSON to load data into Tile DB.')
+    parser.add_argument('-r', '--reference_fasta', required=True, type=str,
+                        help='reference fasta file to get the reference Allele information')
 
     args = parser.parse_args()
+    CONST.REFERENCE_FASTA = args.reference_fasta
 
     parallelGen(
         args.config,

diff --git a/utils/test/test_helper.py b/utils/test/test_helper.py
@@ -22,26 +22,43 @@
 
 import pytest
 import gzip
-from mock import patch
+from unittest import TestCase
 import utils.helper as helper
 
+class TestReferenceFasta(TestCase):
 
-def test_getReference():
-    assert helper.getReference("GRCh37", "1", 100, 101) != ""
-    assert helper.getReference("GRCh37", "M", 1, 2) == "GA"
-
-
-def raiseException():
-    raise Exception("Test")
-
-
-@patch('requests.get', side_effect=raiseException)
-@patch('utils.helper.NUM_RETRIES', 2)
-def test_getReference_neg(patched_fn):
-    with pytest.raises(Exception) as exec_info:
-        helper.getReference("GRCh37", "1", 100, 101)
-    assert patched_fn.call_count == 2
+    @classmethod
+    def setUpClass(self):
+        self.fasta = """>chr1
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA
+CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>chrM
+GANN
+"""
 
+    @pytest.fixture(autouse=True)
+    def set_tmpdir(self, tmpdir):
+        self.tmpdir = tmpdir
+
+    def test_getReference(self):
+        fastafile = self.tmpdir.join('test.fasta')
+        fastafile.write(self.fasta)
+        assert helper.verifyFasta(str(fastafile)) is None
+        assert helper.getReference("GRCh37", "1", 100, 101, str(fastafile)) != ""
+        assert helper.getReference("GRCh37", "M", 1, 2, str(fastafile)) == "GA"
+        assert helper.getReference("GRCh37", "MT", 1, 2, str(fastafile)) == "GA"
+
+    def test_getReference_neg(self):
+        with pytest.raises(Exception) as exec_info:
+            helper.getReference("GRCh37", "1", 100, 101, '/tmp/missing.fasta')
+        assert 'is invalid' in str(exec_info.value)
+
+    def test_verifyFasta_neg(self):
+        fastafile = self.tmpdir.join('test_neg.fasta')
+        with pytest.raises(Exception) as exec_info:
+            helper.verifyFasta(str(fastafile))
+        assert 'is invalid' in str(exec_info.value)
 
 def test_printers():
     helper.log("test")

diff --git a/utils/test/test_maf2tile.py b/utils/test/test_maf2tile.py
@@ -75,6 +75,13 @@ class TestMAF(unittest.TestCase):
 
     @classmethod
     def setUpClass(self):
+        self.fasta = """>chr1
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA
+CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>chrM
+GANN
+"""
         self.TESTDB_URI = "postgresql+psycopg2://@:5432/mafend2end"
         if not database_exists(self.TESTDB_URI):
             create_database(self.TESTDB_URI)
@@ -230,6 +237,11 @@ def test_end2end(self):
             fp.write(json.dumps(config_json))
 
         test_output_dir = self.tmpdir.mkdir("output")
+
+        ref_fasta = self.tmpdir.join('test_ref.fasta')
+        ref_fasta.write(self.fasta)
+
+        imp.multiprocess_import.REFERENCE_FASTA = str(ref_fasta)
 
         imp.multiprocess_import.poolGenerateCSV((str(test_config), str(
             input_file), str(test_output_dir) + "/" + "out.txt", False))