Merge branch 'issue86' into 'master'

Only consider best alignment for each read Closes #86 See merge request algorithm/taiyaki!378
nanoporetech · Nov 12, 2020 · cba4b4a · cba4b4a
2 parents 9ec09fc + fb5188c
commit cba4b4a
Show file tree

Hide file tree

Showing 9 changed files with 37 additions and 35 deletions.
diff --git a/bin/_bin_argparse.py b/bin/_bin_argparse.py
@@ -64,7 +64,7 @@ def get_train_flipflop_parser():
         '--warmup_batches', type=int, default=200,
         help='Over first n batches, increase learning rate like cosine.')
     trn_grp.add_argument(
-        '--lr_warmup',  metavar='rate', type=Positive(float),
+        '--lr_warmup', metavar='rate', type=Positive(float),
         help='Start learning rate for warmup. Defaults to lr_min.')
     trn_grp.add_argument(
         '--min_momentum', type=Positive(float),

diff --git a/misc/assess_alignment.py b/misc/assess_alignment.py
@@ -89,7 +89,7 @@ def samacc(align_fn, min_coverage=0.6):
     :returns: list of ACC_METRICS namedtuples containing accuracy metrics for
         each valid alignment.
     """
-    res = []
+    res = {}
     with pysam.AlignmentFile(align_fn, 'r') as sf:
         for read in sf.fetch(until_eof=True):
             if read.flag != 0 and read.flag != 16:
@@ -111,27 +111,29 @@ def samacc(align_fn, min_coverage=0.6):
             readlen = bins[0] + bins[1]
             perr = min(0.75, float(mismatch) / readlen)
             pmatch = 1.0 - perr
+            accuracy = float(correct) / alnlen
 
             entropy = pmatch * np.log2(pmatch)
             if mismatch > 0:
                 entropy += perr * np.log2(perr / 3.0)
 
-            res.append(ACC_METRICS(
-                reference=read.reference_name,
-                query=read.query_name,
-                strand='-' if read.is_reverse else '+',
-                reference_start=read.reference_start,
-                reference_end=read.reference_end,
-                match=bins[0],
-                mismatch=mismatch,
-                insertion=bins[1],
-                deletion=bins[2],
-                coverage=coverage,
-                id=float(correct) / float(bins[0]),
-                accuracy=float(correct) / alnlen,
-                information=bins[0] * (2.0 + entropy)))
-
-    return res
+            if read.query not in res or res[read.query].accuracy < accuracy:
+                res[read.query] = ACC_METRICS(
+                    reference=read.reference_name,
+                    query=read.query_name,
+                    strand='-' if read.is_reverse else '+',
+                    reference_start=read.reference_start,
+                    reference_end=read.reference_end,
+                    match=bins[0],
+                    mismatch=mismatch,
+                    insertion=bins[1],
+                    deletion=bins[2],
+                    coverage=coverage,
+                    id=float(correct) / float(bins[0]),
+                    accuracy=accuracy,
+                    information=bins[0] * (2.0 + entropy))
+
+    return list(res.values())
 
 
 def acc_plot(acc, mode, median, title, fill=PLOT_DO_FILL):

diff --git a/misc/calibrate_qscores_byread.py b/misc/calibrate_qscores_byread.py
@@ -143,8 +143,8 @@ def get_alignment_data(alignment_file):
         # Try to read the file as a Guppy alignment summary file
         read_ids = t['read_id']
         accuracies = t['alignment_accuracy']
-        alignment_lens = (t['alignment_strand_end']
-                          - t['alignment_strand_start'])
+        alignment_lens = (t['alignment_strand_end'] -
+                          t['alignment_strand_start'])
         print("Interpreted alignment file as Guppy output")
         accuracies[accuracies < 0] = np.nan
         return read_ids, accuracies, alignment_lens
@@ -157,10 +157,10 @@ def get_alignment_data(alignment_file):
         read_ids = t['query']
         accuracies = t['accuracy']
         # Query length in alignment not available directly in taiyaki summary
-        alignment_lens = (t['reference_end']
-                          - t['reference_start']
-                          + t['insertion']
-                          - t['deletion'])
+        alignment_lens = (t['reference_end'] -
+                          t['reference_start'] +
+                          t['insertion'] -
+                          t['deletion'])
         print("Interpreted alignment file as Taiyaki output")
         return read_ids, accuracies, alignment_lens
     except ValueError:

diff --git a/misc/plot_mapped_signals.py b/misc/plot_mapped_signals.py
@@ -31,7 +31,7 @@ def get_parser():
         help='Max number of reads to read from each file. Not used if ' +
         'read_ids are given')
     parser.add_argument(
-        '--read_ids',  nargs='+', default=[],
+        '--read_ids', nargs='+', default=[],
         help='One or more read_ids. If not present, plots the first ' +
         '[--nreads] in each file')
     parser.add_argument(
@@ -58,7 +58,7 @@ def get_parser():
         help='Do not display status messages.')
 
     parser.add_argument(
-        'mapped_signal_files',  nargs='+',
+        'mapped_signal_files', nargs='+',
         help='Inputs: one or more mapped signal files')
 
     return parser

diff --git a/misc/plot_training.py b/misc/plot_training.py
@@ -45,7 +45,7 @@ def get_parser():
     parser.add_argument(
         'output', help='Output png file')
     parser.add_argument(
-        'input_directories',  nargs='+',
+        'input_directories', nargs='+',
         help='One or more directories containing {} and {} files'.format(
             BATCH_LOG_FILENAME, VAL_LOG_FILENAME))
 

diff --git a/taiyaki/basecall_helpers.py b/taiyaki/basecall_helpers.py
@@ -76,8 +76,8 @@ def stitch_chunks(out, chunk_starts, chunk_ends, stride, path_stitching=False):
         # middle chunks
         for i in range(1, nchunks - 1):
             start = (chunk_ends[i - 1] - chunk_starts[i]) // (2 * stride)
-            end = (chunk_ends[i] + chunk_starts[i + 1]
-                   - 2 * chunk_starts[i]) // (2 * stride)
+            end = (chunk_ends[i] + chunk_starts[i + 1] -
+                   2 * chunk_starts[i]) // (2 * stride)
             if path_stitching:
                 start += 1
                 end += 1

diff --git a/taiyaki/common_cmdargs.py b/taiyaki/common_cmdargs.py
@@ -38,7 +38,7 @@ def add_common_command_args(parser, arglist):
             help='Integer specifying which GPU to use, or "cpu" to use CPU ' +
             'only. Other accepted formats: "cuda" (use default GPU), "cuda:2" '
             'or "cuda2" (use GPU 2).')),
-        ('eps', lambda:  parser.add_argument(
+        ('eps', lambda: parser.add_argument(
             '--eps', default=1e-6, metavar='adjustment',
             type=Positive(float), help='Small value to stabilise optimiser')),
         ('filter_max_dwell', lambda: parser.add_argument(

diff --git a/taiyaki/flipflop_remap.py b/taiyaki/flipflop_remap.py
@@ -132,8 +132,8 @@ def flipflop_remap(transition_scores, sequence, alphabet=DEFAULT_ALPHABET,
     bases = np.array([alphabet.find(b) for b in sequence])
     flops = flipflopfings.flopmask(bases)
 
-    stay_index = np.where(flops, bases + (2 * nbase + 1)
-                          * nbase, bases + 2 * nbase * bases)
+    stay_index = np.where(flops, bases + (2 * nbase + 1) *
+                          nbase, bases + 2 * nbase * bases)
     from_base = (bases + flops * nbase)[:-1]
     to_base = np.maximum(bases, nbase * flops)[1:]
     step_index = from_base + 2 * nbase * to_base

diff --git a/test/unit/test_ctc_loss.py b/test/unit/test_ctc_loss.py
@@ -48,10 +48,10 @@ def setUpClass(self):
         paths = {}
         weights = {}
 
-        paths['015'] = [0,    0,    1,    5,    5]
-        weights['015'] = [1.0,   1.0,  0.5,  1.0]
+        paths['015'] = [0, 0, 1, 5, 5]
+        weights['015'] = [1.0, 1.0, 0.5, 1.0]
 
-        paths['237'] = [2,    2,    3,    7,    7]
+        paths['237'] = [2, 2, 3, 7, 7]
         weights['237'] = [1.0, 0.5, 1.0, 1.0]
 
         weights['510'] = [0.0]  # No weight for this sequence/path