Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
Merge branch 'DaveMendMaster' into 'master'
Browse files Browse the repository at this point in the history
Dave mend master

See merge request algorithm/taiyaki!380
  • Loading branch information
tmassingham-ont committed Nov 12, 2020
2 parents cba4b4a + 712e46b commit d04b340
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 83 deletions.
6 changes: 3 additions & 3 deletions misc/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ def call_bwa_mem(fin, fout, genome, clargs=''):
genome (str): path to reference to align against
clargs (str): optional cmd line arguments to pass to bwa as a string
Returns:
Returns:
str: stdout of bwa command
Raises:
:subprocess:`CalledProcessError`: subprocess error message from bwa call
Raises:
:subprocess:`CalledProcessError`: subprocess err. message from bwa call
"""
command_line = "bwa mem {} {} {} > {}".format(clargs, genome, fin, fout)
try:
Expand Down
5 changes: 2 additions & 3 deletions misc/plot_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,15 @@ def get_parser():


def moving_average(a, n=3):
""" Generate moving average.
""" Generate moving average.
Args:
a (:np:`ndarray`) : 1D input array
n (int, optional) : square window length
Returns:
:np:`ndarray` : 1D output array
Note: If length of a is less than n, and for elements earlier than the nth,
Note: If length of a is less than n, and for elements earlier than the nth,
average as many points as available.
"""
x = np.cumsum(a, dtype=float)
Expand Down
38 changes: 19 additions & 19 deletions taiyaki/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@
def flipflop_viterbi(scores, _never_use_cupy=False):
""" Find highest scoring flipflop paths for a batch of score matrices.
Args:
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and S
is the number of distinct flipflop transitions. For 4 bases S = 40,
and in general S = 2 * nbase * (nbase + 1). Note that the input
scores should be on a log scale, i.e. the score of a path is
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and
S is the number of distinct flipflop transitions. For 4 bases
S = 40, and in general S = 2 * nbase * (nbase + 1). Note that the
input scores should be on a log scale, i.e. the score of a path is
determined by summing the scores of the individual transitions.
_never_use_cupy (bool): this method delegates to cupy implementation if
possible, unless _never_use_cupy=True, defaults to False
Returns:
tuple(:torch:`Tensor`, :torch:`Tensor`, :torch:`Tensor`):
tuple(:torch:`Tensor`, :torch:`Tensor`, :torch:`Tensor`):
fwd scores tensor, traceback tensor, flipflop path tensor
"""
use_cupy = all([
Expand All @@ -43,18 +43,18 @@ def flipflop_make_trans(scores, _never_use_cupy=False):
""" Calculates posterior probabilities (not logs!) from raw model output.
Args:
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and S
is the number of distinct flipflop transitions. For 4 bases S = 40,
and in general S = 2 * nbase * (nbase + 1). This should consist of
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and
S is the number of distinct flipflop transitions. For 4 bases
S = 40, and in general S=2*nbase*(nbase+1). This should consist of
globally normalised transition scores for a flipflop CRF.
_never_use_cupy (bool): this method delegates to cupy implementation if
possible, unless _never_use_cupy=True, defaults to False
Returns:
Returns:
:torch:`Tensor`: floats of shape (T x batch size x S) containing
posterior transition probabilities (not logs!) It can be verified
that this is equivalent to the derivative of the log-partition
posterior transition probabilities (not logs!) It can be verified
that this is equivalent to the derivative of the log-partition
function with respect to the raw scores.
"""
use_cupy = all([
Expand All @@ -74,17 +74,17 @@ def flipflop_make_trans(scores, _never_use_cupy=False):

@torch.no_grad()
def _flipflop_viterbi(scores):
""" Find highest scoring flipflop paths for a batch of score matrices. This
""" Find highest scoring flipflop paths for a batch of score matrices. This
is an idiomatic pytorch implementation.
Args:
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and S
is the number of distinct flipflop transitions. For 4 bases S = 40,
and in general S = 2 * nbase * (nbase + 1).
scores (:torch:`Tensor`): batch of score matrices with dimensions
[T, batch size, S] where T is the number of blocks (time axis) and
S is the number of distinct flipflop transitions. For 4 bases
S = 40, and in general S = 2 * nbase * (nbase + 1).
Returns:
tuple(:torch:`Tensor`, :torch:`Tensor`, :torch:`Tensor`):
tuple(:torch:`Tensor`, :torch:`Tensor`, :torch:`Tensor`):
fwd scores tensor, traceback tensor, flipflop path tensor
"""
T, N, S = scores.shape
Expand Down
42 changes: 21 additions & 21 deletions taiyaki/fast5utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ def iterate_file_read_pairs(filepaths, read_ids, limit=None, verbose=0):


def iterate_files_reads_unpaired(filepaths, read_ids, limit=None, verbose=0):
""" Iterate over unpaired lists of filepaths and read_ids, looking in all
the files given and returning only those read_ids in the read_ids list. If
""" Iterate over unpaired lists of filepaths and read_ids, looking in all
the files given and returning only those read_ids in the read_ids list. If
read_ids is None then get all the reads in the files.
Args:
filepaths (list of str): list of filepaths
read_ids (list of str or None): list of read ids or None for all reads
read_ids (list of str or None): list of read ids or None for all reads
in each file
limit (int, optional): Maximum number of tuples to produce
verbose (int, optional): Output level of debug verbosity
Expand Down Expand Up @@ -90,40 +90,40 @@ def iterate_files_reads_unpaired(filepaths, read_ids, limit=None, verbose=0):

def iterate_fast5_reads(
path, strand_list=None, limit=None, verbose=0, recursive=False):
""" Iterate over reads in a directory of fast5 files or a single fast5 file.
Files may be single or multi-read fast5s.
""" Iterate over reads in a directory of fast5 files or a single fast5
file. Files may be single or multi-read fast5s.
Args:
path (str): Directory (or filename for a single file)
strand_list (str or None, optional): Path to file containing list of
files and/or read ids to iterate over (as described in notes) or
strand_list (str or None, optional): Path to file containing list of
files and/or read ids to iterate over (as described in notes) or
None for all files and reads
limit (int or None, optional): Maximum number of reads to consider or
limit (int or None, optional): Maximum number of reads to consider or
None for all
verbose (int, optional): 0 prints no messages, 1 prints a message for
verbose (int, optional): 0 prints no messages, 1 prints a message for
every file read, 2 prints the list of files before starting as well
recursive (bool, optional): Search path recursively for fast5 files
Yields:
(tuple(str, str)): filepath and read_id for each read. You may say, "why
not yield an ont_fast_api object instead of a nasty tuple?" I would
say: "yes, I did try that, but it led to unfathomable nastiness when
I fed these objects in as arguments to multiple processes."
(tuple(str, str)): filepath and read_id for each read. You may say,
"why not yield an ont_fast_api object instead of a nasty tuple?" I
would say: "yes, I tried that, but it led to unfathomable nastiness
when I fed these objects in as arguments to multiple processes."
Notes:
If strand_list is given, then only return the reads specified, according
If strand_list is given, then only return the reads specified, according
to the following rules:
(A) If the strand list file has a column 'read_id' and no column
'filename' or 'filename_fast5' then look through all fast5 files in
'filename' or 'filename_fast5' then look through all fast5 files in
the path and return all reads with read_ids in that column.
(B) If the strand list file has a column 'filename' or 'filename_fast5'
and no column 'read_id' then look through all filenames specified
and no column 'read_id' then look through all filenames specified
and return all reads in them.
(C) If the strand list has a column 'filename' or 'filename_fast5' _and_
a column 'read_id' then loop through the rows in the strand list,
returning the appropriate tuple for each row. We check that each
file exists and contains the read_id.
(C) If the strand list has a column 'filename' or 'filename_fast5'
_and_ a column 'read_id' then loop through the rows in the strand
list, returning the appropriate tuple for each row. We check that
each file exists and contains the read_id.
Example:
read_iterator = iterate_fast5_reads('directory')
Expand Down Expand Up @@ -212,7 +212,7 @@ def get_filename(read):


def get_channel_info(read):
"""Get channel info from read object.
"""Get channel info from read object.
Args:
read (:ont_fast5_api:`Fast5Read`): the read object
Expand Down
64 changes: 32 additions & 32 deletions taiyaki/qscores.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ def qchar_from_qscore(score, zerochar=33):
"""Return ASCII character(s) encoding q score from score.
Args:
score (float or list or :np:`ndarray`) : float or list or 1D input array
of error prob = 10^(-score/10)
zerochar (int, optional) : ASCII code character encoding probability 1
score (float or list or :np:`ndarray`) : float or list or 1D input
array of error prob = 10^(-score/10)
zerochar (int, optional) : ASCII code character encoding probability 1
(score 0)
Returns:
Expand All @@ -33,8 +33,8 @@ def qscore_from_errprob(errprob):
Args:
errprob (scalar or :np:`ndarray`): probability of error
Returns:
scalar or :np:`ndarray`: -10 log_10(errprob)
Returns:
scalar or :np:`ndarray`: -10 log_10(errprob)
"""
return -10.0 * np.log10(errprob)

Expand All @@ -44,7 +44,7 @@ def qchar_from_errprob(errprob, qscore_scale, qscore_offset):
Args:
errprob (scalar or :np:`ndarray`) : probability of error
qscore_scale (scalar): qscore <-- qscore * qscore_scale + qscore_offset,
qscore_scale (scalar): qscore <-- qscore*qscore_scale + qscore_offset,
before encoding it as a character
qscore_offset (scalar): see qscore_scale above
Expand All @@ -56,7 +56,7 @@ def qchar_from_errprob(errprob, qscore_scale, qscore_offset):


def transitions_into_base(b, nbases, device):
"""Return all transition-matrix indices for all transitions into base
"""Return all transition-matrix indices for all transitions into base
(flip or flop).
Args:
Expand All @@ -66,11 +66,11 @@ def transitions_into_base(b, nbases, device):
E.g. 'cpu', 1, 'cuda:1'
Returns:
:torch:`Tensor` : 1D tensor of longs representing indices
:torch:`Tensor` : 1D tensor of longs representing indices
(in range 0 to 39) for ACGT.
Note:
All transitions, including those where no base is emitted, are included.
All transitions, inc. those where no base is emitted, are included.
"""
# Transition A to b_flip
colstart = nbases * 2 * b
Expand All @@ -97,22 +97,22 @@ def errprobs_from_trans(trans, path):
errorprob = 1-p
Args:
trans (:torch:`Tensor`): Tensor of floats with shape
(nblocks x batchsize x nstates) where nstates = 40 for 4-base models
containing posterior transition weights (not logs!)
path (:torch:`Tensor`): Tensor of longs with shape
((nblocks+1) x batchsize) containing flip-flop states (integers 0-7
for 4-base models). The transition that goes with trans[n,bn,:] is
trans (:torch:`Tensor`): Tensor of floats with shape
(nblocks x batchsize x nstates) where nstates = 40 for 4-base
models containing posterior transition weights (not logs!)
path (:torch:`Tensor`): Tensor of longs with shape
((nblocks+1) x batchsize) containing flip-flop states (integers 0-7
for 4-base models). The transition that goes with trans[n,bn,:] is
the one from path[n,bn] to path[n+1,bn].
Returns:
:torch:`Tensor` : errorprob = tensor of floats with shape
:torch:`Tensor` : errorprob = tensor of floats with shape
((nblocks+1) x batchsize) containing errorprob for each element of
the path, and -1.0 in row 0. Note that this doesn't matter since
these probabilities are removed later on in the pipeline. The output
matrix must be the same shape as the path in order to be fed into
the stitching function.
the path, and -1.0 in row 0. Note that this doesn't matter since
these probabilities are removed later on in the pipeline. The
output matrix must be the same shape as the path in order to be fed
into the stitching function.
"""
nblocks, batchsize, flip_flop_transitions = trans.shape
nbases = flipflopfings.nbase_flipflop(flip_flop_transitions)
Expand Down Expand Up @@ -146,23 +146,23 @@ def path_errprobs_to_qstring(errprobs, path, qscore_scale, qscore_offset):
"""Make qscore string from error probs, ignoring stays.
Args:
errprobs (:torch:`Tensor` or :np:`ndarray`): 1D tensor of floats or 1D
errprobs (:torch:`Tensor` or :np:`ndarray`): 1D tensor of floats or 1D
input array containing error probabilities for each element of path
path (:torch:`Tensor` or :np:`ndarray`): 1D tensor of longs or 1D input
array of ints containing flip-flop states for each block, same
path (:torch:`Tensor` or :np:`ndarray`): 1D tensor of longs or 1D input
array of ints containing flip-flop states for each block, same
length as errprobs
qscore_scale (scalar): qscore <-- qscore * qscore_scale + qscore_offset,
qscore_scale (scalar): qscore <-- qscore*qscore_scale + qscore_offset,
before encoding it as a character
qscore_offset (scalar): see qscore_scale above
Returns:
str : representing quality scores encoded as ASCII characters
Returns:
str : representing quality scores encoded as ASCII characters
Note:
Elements of the path where no base is emitted are not included in the
qstring, and the source base for the first transition is also not
included. So the qstring is the same length as the basecall (provided we
don't include the source base for the first transition in the basecall)
Note:
Elements of the path where no base is emitted are not included in the
qstring, and the source base for the first transition is also not
included. So the qstring is the same length as the basecall (provided
we don't inc. the source base for the first transition in the basecall)
"""
filtered_probs = errprobs[1:][path[1:] != path[:-1]]
if type(filtered_probs) == torch.Tensor:
Expand Down
4 changes: 2 additions & 2 deletions test/acceptance/test_merge_mappedsignalfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def count_reads(self, mapped_signal_file, print_readlist=True):
return len(read_ids)

def test_merge(self):
"""Test that merging two 'normal' mapped signal files produces the
"""Test that merging two 'normal' mapped signal files produces the
output expected."""
test_work_dir = self.work_dir("test_merge")
merged_mapped_signal_file = os.path.join(
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_merge(self):
self.assertTrue(numreads_in > 2)

def test_merge_batch(self):
"""Test that merging two 'batch' mapped signal files produces the
"""Test that merging two 'batch' mapped signal files produces the
output expected."""
test_work_dir = self.work_dir("test_merge_batch")
merged_mapped_signal_file = os.path.join(
Expand Down
5 changes: 3 additions & 2 deletions test/unit/test_cmdargs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def test_nonnegative_valid_float_values(self):
self.assertAlmostEqual(x, f(x))

def test_nonnegative_invalid_float_values(self):
"""Test that invalid floats don't get through non-negative arg check."""
"""Test that invalid floats don't get through non-negative arg
check."""
f = cmdargs.NonNegative(float)
for x in [-1.0, -self.EPS, -1e-5]:
with self.assertRaises(argparse.ArgumentTypeError):
Expand All @@ -71,7 +72,7 @@ def test_proportion_valid_float_values(self):
self.assertAlmostEqual(x, f(x))

def test_proportion_invalid_float_values(self):
"""Test that invalid floats don't get through proportion (0-1) arg
"""Test that invalid floats don't get through proportion (0-1) arg
check."""
f = cmdargs.proportion
for x in [-1e-30, -self.EPS, -1e-5, 1.0 + 1e-5, 1.0 + self.EPS]:
Expand Down
2 changes: 1 addition & 1 deletion test/unit/test_maths.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def setUpClass(self):
np.random.seed(0xdeadbeef)

def test_004_med_mad(self):
"""Test to see if med_mad works without setting axis (so flattening)."""
"""Test to see if med_mad works with axis not set (so flattening)."""
x = np.array(
[[0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 1.0, 1.0], [0.0, 0.5, 0.5, 1.0]])
factor = 1
Expand Down

0 comments on commit d04b340

Please sign in to comment.