feat(instrument): attempt to recover when read names not in Illumina …

…format (#14) * feat: attempt to recover machine name when read names malformed * fix(instrument): swap fcid where it should've been iid * feat: with malformed read names, check header for PU and PM * chore: remove unused variable * fix: apply non-illumina format "low confidence" uniformly * refactor(encoding): rename var to make use clearer 'max/min_phred_score' renamed to 'highest/lowest_ascii'
stjudecloud · Jun 24, 2021 · 9ab0951 · 9ab0951
1 parent 7a408e6
commit 9ab0951
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 23 deletions.
diff --git a/ngsderive/commands/encoding.py b/ngsderive/commands/encoding.py
@@ -47,37 +47,31 @@ def main(ngsfiles, outfile=sys.stdout, n_samples=1000000):
         for read in itertools.islice(ngsfile, n_samples):
             score_set.update(read["quality"])
 
-        max_phred_score = str(max(score_set) + 33)
-        min_phred_score = str(min(score_set) + 33)
+        highest_ascii = str(max(score_set) + 33)
+        lowest_ascii = str(min(score_set) + 33)
         if score_set <= ILLUMINA_1_3_SET:
             result = {
                 "File": ngsfilepath,
-                "Evidence": "ASCII range: {}-{}".format(
-                    min_phred_score, max_phred_score
-                ),
+                "Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
                 "ProbableEncoding": "Illumina 1.3",
             }
         elif score_set <= ILLUMINA_1_0_SET:
             result = {
                 "File": ngsfilepath,
-                "Evidence": "ASCII range: {}-{}".format(
-                    min_phred_score, max_phred_score
-                ),
+                "Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
                 "ProbableEncoding": "Solexa/Illumina 1.0",
             }
         elif score_set <= SANGER_SET:
             result = {
                 "File": ngsfilepath,
-                "Evidence": "ASCII range: {}-{}".format(
-                    min_phred_score, max_phred_score
-                ),
+                "Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
                 "ProbableEncoding": "Sanger/Illumina 1.8",
             }
         else:
             result = {
                 "File": ngsfilepath,
                 "Evidence": "ASCII values outside known PHRED encoding ranges: {}-{}".format(
-                    min_phred_score, max_phred_score
+                    lowest_ascii, highest_ascii
                 ),
                 "ProbableEncoding": "Unknown",
             }

diff --git a/ngsderive/commands/instrument.py b/ngsderive/commands/instrument.py
@@ -76,7 +76,7 @@
     "^A[A-Z0-9]{4}$": ["MiSeq"],  # MiSeq flow cell
     "^B[A-Z0-9]{4}$": ["MiSeq"],  # MiSeq flow cell
     "^D[A-Z0-9]{4}$": ["MiSeq"],  # MiSeq nano flow cell
-    # "^D[A-Z0-9]{4}$" : ["HiSeq 2000", "HiSeq 2500"],                                 # Unknown HiSeq flow cell examined in SJ data
+    # "^D[A-Z0-9]{4}$" : ["HiSeq 2000", "HiSeq 2500"],  # Unknown HiSeq flow cell examined in SJ data
     "^G[A-Z0-9]{4}$": ["MiSeq"],  # MiSeq micro flow cell
 }
 
@@ -109,7 +109,6 @@ def predict_instrument_from_iids(iids):
 
 def derive_instrument_from_fcid(fcid):
     matching_instruments = set()
-    detected_at_least_one_instrument = False
 
     for pattern in flowcell_ids.keys():
         if re.search(pattern, fcid):
@@ -155,16 +154,28 @@ def resolve_instrument(
             return set(["unknown"]), "no confidence", "no match"
 
     if len(possible_instruments_by_iid) == 0:
-        confidence = "medium confidence"
-        if len(possible_instruments_by_fcid) > 1:
+        if not malformed_read_names_detected:
+            confidence = "medium confidence"
+            if len(possible_instruments_by_fcid) > 1:
+                confidence = "low confidence"
+            return possible_instruments_by_fcid, confidence, "flowcell id"
+        else:
             confidence = "low confidence"
-        return possible_instruments_by_fcid, confidence, "flowcell id"
+            return possible_instruments_by_fcid, confidence, "flowcell id"
 
     if len(possible_instruments_by_fcid) == 0:
-        confidence = "medium confidence"
-        if len(possible_instruments_by_fcid) > 1:
+        if not malformed_read_names_detected:
+            confidence = "medium confidence"
+            if len(possible_instruments_by_iid) > 1:
+                confidence = "low confidence"
+            return possible_instruments_by_iid, confidence, "instrument id"
+        else:
             confidence = "low confidence"
-        return possible_instruments_by_iid, confidence, "instrument id"
+            return (
+                possible_instruments_by_iid,
+                confidence,
+                "instrument id",
+            )
 
     overlapping_instruments = possible_instruments_by_iid & possible_instruments_by_fcid
     if len(overlapping_instruments) == 0:
@@ -220,11 +231,22 @@ def main(ngsfiles, outfile=sys.stdout, n_samples=10000):
             parts = read["query_name"].split(":")
             if len(parts) != 7:  # not Illumina format
                 malformed_read_names = True
+                iid = parts[0]  # attempt to recover machine name
+                instruments.add(iid)
+                for rg in ngsfile.handle.header.to_dict()["RG"]:
+                    if rg["ID"] == read["read_group"]:
+                        if "PU" in rg:
+                            flowcells.add(rg["PU"])
+                        if "PM" in rg:
+                            instruments.add(rg["PM"])
                 continue
             iid, fcid = parts[0], parts[2]
             instruments.add(iid)
             flowcells.add(fcid)
-
+        if malformed_read_names:
+            logger.warning(
+                "Encountered read names not in Illumina format. Recovery attempted."
+            )
         (
             possible_instruments_by_iid,
             detected_instrument_by_iid,

diff --git a/ngsderive/utils.py b/ngsderive/utils.py
@@ -71,6 +71,7 @@ def __iter__(self):
     def __next__(self):
         query_name = None
         query = None
+        read_group = None
         quality = None
         if self.filetype == NGSFileType.FASTQ:
             query_name = self.handle.readline().strip()
@@ -101,15 +102,31 @@ def __next__(self):
             read = next(self.handle)
             query_name = read.query_name
             query = read.query_alignment_sequence
+            read_group = read.get_tag("RG")
             if self.store_qualities:
                 quality = read.query_alignment_qualities
 
         self.read_num += 1
 
         if self.store_qualities:
-            return {"query_name": query_name, "query": query, "quality": quality}
+            if read_group:
+                return {
+                    "query_name": query_name,
+                    "query": query,
+                    "read_group": read_group,
+                    "quality": quality,
+                }
+            else:
+                return {"query_name": query_name, "query": query, "quality": quality}
         else:
-            return {"query_name": query_name, "query": query}
+            if read_group:
+                return {
+                    "query_name": query_name,
+                    "query": query,
+                    "read_group": read_group,
+                }
+            else:
+                return {"query_name": query_name, "query": query}
 
 
 def sort_gff(filename):