Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
Merge branch 'issue86' into 'master'
Browse files Browse the repository at this point in the history
Only consider best alignment for each read

Closes #86

See merge request algorithm/taiyaki!378
  • Loading branch information
marcus1487 committed Nov 12, 2020
2 parents 9ec09fc + fb5188c commit cba4b4a
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bin/_bin_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def get_train_flipflop_parser():
'--warmup_batches', type=int, default=200,
help='Over first n batches, increase learning rate like cosine.')
trn_grp.add_argument(
'--lr_warmup', metavar='rate', type=Positive(float),
'--lr_warmup', metavar='rate', type=Positive(float),
help='Start learning rate for warmup. Defaults to lr_min.')
trn_grp.add_argument(
'--min_momentum', type=Positive(float),
Expand Down
36 changes: 19 additions & 17 deletions misc/assess_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def samacc(align_fn, min_coverage=0.6):
:returns: list of ACC_METRICS namedtuples containing accuracy metrics for
each valid alignment.
"""
res = []
res = {}
with pysam.AlignmentFile(align_fn, 'r') as sf:
for read in sf.fetch(until_eof=True):
if read.flag != 0 and read.flag != 16:
Expand All @@ -111,27 +111,29 @@ def samacc(align_fn, min_coverage=0.6):
readlen = bins[0] + bins[1]
perr = min(0.75, float(mismatch) / readlen)
pmatch = 1.0 - perr
accuracy = float(correct) / alnlen

entropy = pmatch * np.log2(pmatch)
if mismatch > 0:
entropy += perr * np.log2(perr / 3.0)

res.append(ACC_METRICS(
reference=read.reference_name,
query=read.query_name,
strand='-' if read.is_reverse else '+',
reference_start=read.reference_start,
reference_end=read.reference_end,
match=bins[0],
mismatch=mismatch,
insertion=bins[1],
deletion=bins[2],
coverage=coverage,
id=float(correct) / float(bins[0]),
accuracy=float(correct) / alnlen,
information=bins[0] * (2.0 + entropy)))

return res
if read.query not in res or res[read.query].accuracy < accuracy:
res[read.query] = ACC_METRICS(
reference=read.reference_name,
query=read.query_name,
strand='-' if read.is_reverse else '+',
reference_start=read.reference_start,
reference_end=read.reference_end,
match=bins[0],
mismatch=mismatch,
insertion=bins[1],
deletion=bins[2],
coverage=coverage,
id=float(correct) / float(bins[0]),
accuracy=accuracy,
information=bins[0] * (2.0 + entropy))

return list(res.values())


def acc_plot(acc, mode, median, title, fill=PLOT_DO_FILL):
Expand Down
12 changes: 6 additions & 6 deletions misc/calibrate_qscores_byread.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,8 @@ def get_alignment_data(alignment_file):
# Try to read the file as a Guppy alignment summary file
read_ids = t['read_id']
accuracies = t['alignment_accuracy']
alignment_lens = (t['alignment_strand_end']
- t['alignment_strand_start'])
alignment_lens = (t['alignment_strand_end'] -
t['alignment_strand_start'])
print("Interpreted alignment file as Guppy output")
accuracies[accuracies < 0] = np.nan
return read_ids, accuracies, alignment_lens
Expand All @@ -157,10 +157,10 @@ def get_alignment_data(alignment_file):
read_ids = t['query']
accuracies = t['accuracy']
# Query length in alignment not available directly in taiyaki summary
alignment_lens = (t['reference_end']
- t['reference_start']
+ t['insertion']
- t['deletion'])
alignment_lens = (t['reference_end'] -
t['reference_start'] +
t['insertion'] -
t['deletion'])
print("Interpreted alignment file as Taiyaki output")
return read_ids, accuracies, alignment_lens
except ValueError:
Expand Down
4 changes: 2 additions & 2 deletions misc/plot_mapped_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_parser():
help='Max number of reads to read from each file. Not used if ' +
'read_ids are given')
parser.add_argument(
'--read_ids', nargs='+', default=[],
'--read_ids', nargs='+', default=[],
help='One or more read_ids. If not present, plots the first ' +
'[--nreads] in each file')
parser.add_argument(
Expand All @@ -58,7 +58,7 @@ def get_parser():
help='Do not display status messages.')

parser.add_argument(
'mapped_signal_files', nargs='+',
'mapped_signal_files', nargs='+',
help='Inputs: one or more mapped signal files')

return parser
Expand Down
2 changes: 1 addition & 1 deletion misc/plot_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def get_parser():
parser.add_argument(
'output', help='Output png file')
parser.add_argument(
'input_directories', nargs='+',
'input_directories', nargs='+',
help='One or more directories containing {} and {} files'.format(
BATCH_LOG_FILENAME, VAL_LOG_FILENAME))

Expand Down
4 changes: 2 additions & 2 deletions taiyaki/basecall_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def stitch_chunks(out, chunk_starts, chunk_ends, stride, path_stitching=False):
# middle chunks
for i in range(1, nchunks - 1):
start = (chunk_ends[i - 1] - chunk_starts[i]) // (2 * stride)
end = (chunk_ends[i] + chunk_starts[i + 1]
- 2 * chunk_starts[i]) // (2 * stride)
end = (chunk_ends[i] + chunk_starts[i + 1] -
2 * chunk_starts[i]) // (2 * stride)
if path_stitching:
start += 1
end += 1
Expand Down
2 changes: 1 addition & 1 deletion taiyaki/common_cmdargs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def add_common_command_args(parser, arglist):
help='Integer specifying which GPU to use, or "cpu" to use CPU ' +
'only. Other accepted formats: "cuda" (use default GPU), "cuda:2" '
'or "cuda2" (use GPU 2).')),
('eps', lambda: parser.add_argument(
('eps', lambda: parser.add_argument(
'--eps', default=1e-6, metavar='adjustment',
type=Positive(float), help='Small value to stabilise optimiser')),
('filter_max_dwell', lambda: parser.add_argument(
Expand Down
4 changes: 2 additions & 2 deletions taiyaki/flipflop_remap.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ def flipflop_remap(transition_scores, sequence, alphabet=DEFAULT_ALPHABET,
bases = np.array([alphabet.find(b) for b in sequence])
flops = flipflopfings.flopmask(bases)

stay_index = np.where(flops, bases + (2 * nbase + 1)
* nbase, bases + 2 * nbase * bases)
stay_index = np.where(flops, bases + (2 * nbase + 1) *
nbase, bases + 2 * nbase * bases)
from_base = (bases + flops * nbase)[:-1]
to_base = np.maximum(bases, nbase * flops)[1:]
step_index = from_base + 2 * nbase * to_base
Expand Down
6 changes: 3 additions & 3 deletions test/unit/test_ctc_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def setUpClass(self):
paths = {}
weights = {}

paths['015'] = [0, 0, 1, 5, 5]
weights['015'] = [1.0, 1.0, 0.5, 1.0]
paths['015'] = [0, 0, 1, 5, 5]
weights['015'] = [1.0, 1.0, 0.5, 1.0]

paths['237'] = [2, 2, 3, 7, 7]
paths['237'] = [2, 2, 3, 7, 7]
weights['237'] = [1.0, 0.5, 1.0, 1.0]

weights['510'] = [0.0] # No weight for this sequence/path
Expand Down

0 comments on commit cba4b4a

Please sign in to comment.