Skip to content

Commit

Permalink
feat(instrument): attempt to recover when read names not in Illumina …
Browse files Browse the repository at this point in the history
…format (#14)

* feat: attempt to recover machine name when read names malformed

* fix(instrument): swap fcid where it should've been iid

* feat: with malformed read names, check header for PU and PM

* chore: remove unused variable

* fix: apply non-illumina format "low confidence" uniformly

* refactor(encoding): rename var to make use clearer

'max/min_phred_score' renamed to 'highest/lowest_ascii'
  • Loading branch information
a-frantz authored Jun 24, 2021
1 parent 7a408e6 commit 9ab0951
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 23 deletions.
18 changes: 6 additions & 12 deletions ngsderive/commands/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,37 +47,31 @@ def main(ngsfiles, outfile=sys.stdout, n_samples=1000000):
for read in itertools.islice(ngsfile, n_samples):
score_set.update(read["quality"])

max_phred_score = str(max(score_set) + 33)
min_phred_score = str(min(score_set) + 33)
highest_ascii = str(max(score_set) + 33)
lowest_ascii = str(min(score_set) + 33)
if score_set <= ILLUMINA_1_3_SET:
result = {
"File": ngsfilepath,
"Evidence": "ASCII range: {}-{}".format(
min_phred_score, max_phred_score
),
"Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
"ProbableEncoding": "Illumina 1.3",
}
elif score_set <= ILLUMINA_1_0_SET:
result = {
"File": ngsfilepath,
"Evidence": "ASCII range: {}-{}".format(
min_phred_score, max_phred_score
),
"Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
"ProbableEncoding": "Solexa/Illumina 1.0",
}
elif score_set <= SANGER_SET:
result = {
"File": ngsfilepath,
"Evidence": "ASCII range: {}-{}".format(
min_phred_score, max_phred_score
),
"Evidence": "ASCII range: {}-{}".format(lowest_ascii, highest_ascii),
"ProbableEncoding": "Sanger/Illumina 1.8",
}
else:
result = {
"File": ngsfilepath,
"Evidence": "ASCII values outside known PHRED encoding ranges: {}-{}".format(
min_phred_score, max_phred_score
lowest_ascii, highest_ascii
),
"ProbableEncoding": "Unknown",
}
Expand Down
40 changes: 31 additions & 9 deletions ngsderive/commands/instrument.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"^A[A-Z0-9]{4}$": ["MiSeq"], # MiSeq flow cell
"^B[A-Z0-9]{4}$": ["MiSeq"], # MiSeq flow cell
"^D[A-Z0-9]{4}$": ["MiSeq"], # MiSeq nano flow cell
# "^D[A-Z0-9]{4}$" : ["HiSeq 2000", "HiSeq 2500"], # Unknown HiSeq flow cell examined in SJ data
# "^D[A-Z0-9]{4}$" : ["HiSeq 2000", "HiSeq 2500"], # Unknown HiSeq flow cell examined in SJ data
"^G[A-Z0-9]{4}$": ["MiSeq"], # MiSeq micro flow cell
}

Expand Down Expand Up @@ -109,7 +109,6 @@ def predict_instrument_from_iids(iids):

def derive_instrument_from_fcid(fcid):
matching_instruments = set()
detected_at_least_one_instrument = False

for pattern in flowcell_ids.keys():
if re.search(pattern, fcid):
Expand Down Expand Up @@ -155,16 +154,28 @@ def resolve_instrument(
return set(["unknown"]), "no confidence", "no match"

if len(possible_instruments_by_iid) == 0:
confidence = "medium confidence"
if len(possible_instruments_by_fcid) > 1:
if not malformed_read_names_detected:
confidence = "medium confidence"
if len(possible_instruments_by_fcid) > 1:
confidence = "low confidence"
return possible_instruments_by_fcid, confidence, "flowcell id"
else:
confidence = "low confidence"
return possible_instruments_by_fcid, confidence, "flowcell id"
return possible_instruments_by_fcid, confidence, "flowcell id"

if len(possible_instruments_by_fcid) == 0:
confidence = "medium confidence"
if len(possible_instruments_by_fcid) > 1:
if not malformed_read_names_detected:
confidence = "medium confidence"
if len(possible_instruments_by_iid) > 1:
confidence = "low confidence"
return possible_instruments_by_iid, confidence, "instrument id"
else:
confidence = "low confidence"
return possible_instruments_by_iid, confidence, "instrument id"
return (
possible_instruments_by_iid,
confidence,
"instrument id",
)

overlapping_instruments = possible_instruments_by_iid & possible_instruments_by_fcid
if len(overlapping_instruments) == 0:
Expand Down Expand Up @@ -220,11 +231,22 @@ def main(ngsfiles, outfile=sys.stdout, n_samples=10000):
parts = read["query_name"].split(":")
if len(parts) != 7: # not Illumina format
malformed_read_names = True
iid = parts[0] # attempt to recover machine name
instruments.add(iid)
for rg in ngsfile.handle.header.to_dict()["RG"]:
if rg["ID"] == read["read_group"]:
if "PU" in rg:
flowcells.add(rg["PU"])
if "PM" in rg:
instruments.add(rg["PM"])
continue
iid, fcid = parts[0], parts[2]
instruments.add(iid)
flowcells.add(fcid)

if malformed_read_names:
logger.warning(
"Encountered read names not in Illumina format. Recovery attempted."
)
(
possible_instruments_by_iid,
detected_instrument_by_iid,
Expand Down
21 changes: 19 additions & 2 deletions ngsderive/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def __iter__(self):
def __next__(self):
query_name = None
query = None
read_group = None
quality = None
if self.filetype == NGSFileType.FASTQ:
query_name = self.handle.readline().strip()
Expand Down Expand Up @@ -101,15 +102,31 @@ def __next__(self):
read = next(self.handle)
query_name = read.query_name
query = read.query_alignment_sequence
read_group = read.get_tag("RG")
if self.store_qualities:
quality = read.query_alignment_qualities

self.read_num += 1

if self.store_qualities:
return {"query_name": query_name, "query": query, "quality": quality}
if read_group:
return {
"query_name": query_name,
"query": query,
"read_group": read_group,
"quality": quality,
}
else:
return {"query_name": query_name, "query": query, "quality": quality}
else:
return {"query_name": query_name, "query": query}
if read_group:
return {
"query_name": query_name,
"query": query,
"read_group": read_group,
}
else:
return {"query_name": query_name, "query": query}


def sort_gff(filename):
Expand Down

0 comments on commit 9ab0951

Please sign in to comment.