refactor groupby and sort and make them more powerful + docs

rnajena · Oct 23, 2024 · 82408a1 · 82408a1
1 parent a0e1be5
commit 82408a1
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 30 deletions.
diff --git a/sugar/core/cane.py b/sugar/core/cane.py
@@ -11,10 +11,29 @@
 
 
 class BioMatch(object):
+    """
+    The BioMatch object is returned by `~cane.match()` and the different match methods.
+
+    It is designed to behave as the original `python:re.Match` object.
+    See there for available methods.
+    Additionally, it has the `BioMatch.rf` attribute which holds the
+    reading frame (between -3 and 2, inclusive) of the match.
+
+    .. rubric:: Example:
+
+    >>> match = read()[0].match('AT.')
+    >>> match
+    <sugar.BioMatch object; seqid=AB047639; rf=2; span=(11, 14); match=ATA>
+    >>> print(match.group(), match.rf)
+    ATA 2
+    """
     def __init__(self, match, rf=None, lenseq=None, seqid=None):
+        #: original Match object
         self._match = match
+        #: reading frame (-3 to 2)
         self.rf = rf
         self.lenseq = lenseq
+        #: sequence id
         self.seqid = seqid
     def __getattr__(self, attr):
         return getattr(self._match, attr)
@@ -28,7 +47,22 @@ def span(self):
 
 
 class BioMatchList(collections.UserList):
+    """
+    List of `BioMatch` objects
+    """
     def groupby(self, attr='rf'):
+        """
+        Group matches
+
+        TODO: Re-use a generic groupby implementation
+
+        :param str attr: Which attribute to group by.
+            Allowed values: rf, seqid, both,
+            default: rf
+        :return: dict with the group-by attributes as keys and BioMatchLists as values.
+            For ``'both'`` returns a nested dictionary,
+            outer keys are seqids, inner keys are reading frames
+        """
         assert attr in ('rf', 'seqid', 'both')
         d = {}
         if attr == 'both':
@@ -41,21 +75,18 @@ def groupby(self, attr='rf'):
 
     @property
     def d(self):
+        """
+        Group matches by seqid, alias for ``BioMatchList.groupby('seqid')``
+        """
         return self.groupby('seqid')
 
 
 class ORFList(FeatureList):
-    def groupby(self, attr='rf'):
-        assert attr in ('rf', 'seqid', 'both')
-        d = {}
-        if attr == 'both':
-            for ft in self:
-                d.setdefault(ft.seqid, {}).setdefault(ft.meta.rf, ORFList()).append(ft)
-        else:
-            for ft in self:
-                d.setdefault(getattr(ft.meta, attr), ORFList()).append(ft)
-        return d
+    """
+    List of open reading frames (ORFs)
 
+    This object is a FeatureList with additional methods.
+    """
     def filter(self, minlen=0, rfs=None):
         orfs = []
         for orf in self:
@@ -64,11 +95,6 @@ def filter(self, minlen=0, rfs=None):
         self.data = orfs
         return self
 
-    def sort(self, s=('len',), reverse=False):
-        for k in s[::-1]:
-            self.data = sorted(self, key=len if k == 'len' else (lambda ft: ft.meta.get(k)), reverse=reverse)
-        return self
-
     @property
     def d(self):
         return self.groupby('seqid')
@@ -118,8 +144,8 @@ def find_orfs(seq, rf='fwd', start='start', stop='stop', need_start='always', ne
     # rf  0, 1, 2, -1, -2, -3, 'fwd', 'bwd', 'both'
     assert need_start in ('never', 'always', 'once')
     if need_start in ('once', 'always'):
-        starts = seq.matchall(start, rf=rf, gap=gap).groupby()
-    stops = seq.matchall(stop, rf=rf, gap=gap).groupby()
+        starts = seq.matchall(start, rf=rf, gap=gap).groupby('rf')
+    stops = seq.matchall(stop, rf=rf, gap=gap).groupby('rf')
     if rf == 'fwd':
         rf = (0, 1, 2)
     elif rf == 'bwd':

diff --git a/sugar/core/fts.py b/sugar/core/fts.py
@@ -635,12 +635,55 @@ def todict(self):
             d.setdefault(seqid, FeatureList()).append(ft)
         return d
 
+    def groupby(self, keys=('seqid',)):
+        """
+        Group features
+
+        :param keys: Tuple of meta keys or functions to use for grouping.
+            May also be a single string or callable.
+            A string is split at whitespace, so ``'seqid type'``
+            is the same as ``('seqid', 'type')``.
+            A function is called with the feature
+            and its return value is used as dictionary key.
+            By default the method groups by only seqid.
+        :return: Nested dict structure with one level per key,
+            the innermost values are lists of the same class as ``self``
+
+        .. rubric:: Example:
+
+        >>> from sugar import read_fts
+        >>> fts = read_fts()
+        >>> fts.groupby('type');
+        """
+        from collections.abc import Iterable
+        if isinstance(keys, str):
+            keys = keys.split()
+        if not isinstance(keys, Iterable):
+            keys = [keys]
+        # Bind key as default argument. A plain closure would look up the
+        # loop variable only when called and every lambda would use the last key.
+        keyfuncs = [
+            (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key
+            for key in keys
+            ]
+        # Use the class of self for the groups, so that subclasses
+        # (e.g. ORFList) keep their additional methods.
+        cls = type(self)
+        d = {}
+        for ft in self:
+            # walk down the nested dicts, creating missing levels
+            d2 = d
+            for keyfunc in keyfuncs[:-1]:
+                d2 = d2.setdefault(keyfunc(ft), {})
+            d2.setdefault(keyfuncs[-1](ft), cls()).append(ft)
+        return d
+
     @property
     def d(self):
         """
-        Alias for `FeatureList.todict()`
+        Alias for ``FeatureList.groupby('seqid')``
         """
-        return self.todict()
+        return self.groupby('seqid')
 
     @property
     def loc_range(self):
@@ -728,22 +771,35 @@ def reverse(self):
             ft.reverse()
         return self
 
-    def sort(self, key=None, reverse=False):
+    def sort(self, keys=None, reverse=False):
         """
         Sort features in-place
 
-        :param key: Key to use for sorting.
-            Should be a string, which is expected to be a valid attribute in the metadata of each feature.
-            Alternatively, the key sort function can be specified directly.
+        :param keys: Tuple of meta keys or functions to use for sorting.
+            None can be used as a single value or in the tuple
+            to apply the default sorting by position.
+            May also be a single string or callable.
         :param reverse: Use reversed order (default: False)
 
         :return: Sorted features
+
+        .. rubric:: Example:
+
+        >>> from sugar import read_fts
+        >>> fts = read_fts()
+        >>> fts.sort(('type', len));
         """
-        if key is not None and isinstance(key, str):
-            kfunc = lambda ft: ft.meta[key]
-        else:
-            kfunc = key
-        self.data = sorted(self, key=kfunc, reverse=reverse)
+        from collections.abc import Iterable
+        if isinstance(keys, str):
+            keys = keys.split()
+        if not isinstance(keys, Iterable):
+            keys = [keys]
+        # key=key binds each key now; a plain closure would use only the last one
+        keyfuncs = [
+            (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key
+            for key in keys]
+        for keyfunc in keyfuncs[::-1]:
+            self.data = sorted(self.data, key=keyfunc, reverse=reverse)
         return self
 
     def copy(self):

diff --git a/sugar/tests/test_core_seq.py b/sugar/tests/test_core_seq.py
@@ -197,12 +197,12 @@ def test_orf():
     seqs=read()
     orfs = seqs[0].find_orfs()
     assert len(orfs) > 0
-    longest_orf = orfs.sort()[-1]
+    longest_orf = orfs.sort(len)[-1]
     assert seqs[0][longest_orf] == seqs[0]['cds']
 
     orfs2 = seqs[0].find_orfs(rf='both')
     assert len(orfs2) > len(orfs)
 
     orfs = seqs.find_orfs()
     for id_ in seqs.ids:
-        assert seqs.d[id_][orfs.d[id_].sort()[-1]] == seqs.d[id_]['cds']
+        assert seqs.d[id_][orfs.d[id_].sort(len)[-1]] == seqs.d[id_]['cds']