diff --git a/sugar/core/cane.py b/sugar/core/cane.py index 9485fe1..f79b1b5 100644 --- a/sugar/core/cane.py +++ b/sugar/core/cane.py @@ -11,10 +11,29 @@ class BioMatch(object): + """ + The BioMatch object is returned by `~cane.match()` and the different match methods. + + It is designed to behave as the original `python:re.Match` object. + See there for available methods. + Additionally, it has the `BioMatch.rf` attribute which holds the + reading frame (between -3 and 2, inclusive) of the match. + + .. rubric:: Example: + + >>> match = read()[0].match('AT.') + >>> match + + >>> print(match.group(), match.rf) + ATA 2 + """ def __init__(self, match, rf=None, lenseq=None, seqid=None): + #: original Match object self._match = match + #: reading frame (-3 to 2) self.rf = rf self.lenseq = lenseq + #: sequence id self.seqid = seqid def __getattr__(self, attr): return getattr(self._match, attr) @@ -28,7 +47,22 @@ def span(self): class BioMatchList(collections.UserList): + """ + List of `BioMatch` objects + """ def groupby(self, attr='rf'): + """ + Group matches + + TODO: Re-use groupby + + :param str attr: Which attribute to group by. + Allowed values: rf, seqid, both, + default: rf + :return: dict with the group-by attributes as keys and BioMatchLists as values. 
+ For ``'both'`` returns a nested dictionary, + outer keys are seqids, inner keys are reading frames + """ assert attr in ('rf', 'seqid', 'both') d = {} if attr == 'both': @@ -41,21 +75,18 @@ def groupby(self, attr='rf'): @property def d(self): + """ + Group matches by seqid, alias for ``BioMatchList.groupby('seqid')`` + """ return self.groupby('seqid') class ORFList(FeatureList): - def groupby(self, attr='rf'): - assert attr in ('rf', 'seqid', 'both') - d = {} - if attr == 'both': - for ft in self: - d.setdefault(ft.seqid, {}).setdefault(ft.meta.rf, ORFList()).append(ft) - else: - for ft in self: - d.setdefault(getattr(ft.meta, attr), ORFList()).append(ft) - return d + """ + List of open reading frames (ORFs) + This object is a FeatureList with additional methods. + """ def filter(self, minlen=0, rfs=None): orfs = [] for orf in self: @@ -64,11 +95,6 @@ def filter(self, minlen=0, rfs=None): self.data = orfs return self - def sort(self, s=('len',), reverse=False): - for k in s[::-1]: - self.data = sorted(self, key=len if k == 'len' else (lambda ft: ft.meta.get(k)), reverse=reverse) - return self - @property def d(self): return self.groupby('seqid') @@ -118,8 +144,8 @@ def find_orfs(seq, rf='fwd', start='start', stop='stop', need_start='always', ne # rf 0, 1, 2, -1, -2, -3, 'fwd', 'bwd', 'both' assert need_start in ('never', 'always', 'once') if need_start in ('once', 'always'): - starts = seq.matchall(start, rf=rf, gap=gap).groupby() - stops = seq.matchall(stop, rf=rf, gap=gap).groupby() + starts = seq.matchall(start, rf=rf, gap=gap).groupby('rf') + stops = seq.matchall(stop, rf=rf, gap=gap).groupby('rf') if rf == 'fwd': rf = (0, 1, 2) elif rf == 'bwd': diff --git a/sugar/core/fts.py b/sugar/core/fts.py index 60ef2a6..cd94a17 100644 --- a/sugar/core/fts.py +++ b/sugar/core/fts.py @@ -635,12 +635,44 @@ def todict(self): d.setdefault(seqid, FeatureList()).append(ft) return d + def groupby(self, keys=('seqid',)): + """ + Group features + + 
:param keys: Tuple of meta keys or functions to use for grouping. + May also be a single string or callable. + By default the method groups by only seqid. + :return: Nested dict structure + + .. rubric:: Example: + + >>> from sugar import read_fts + >>> fts = read_fts() + >>> fts.groupby('type'); + """ + from collections.abc import Iterable + if isinstance(keys, str): + keys = keys.split() + if not isinstance(keys, Iterable): + keys = [keys] + keyfuncs = [ + (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key + for key in keys + ] + d = {} + for ft in self: + d2 = d + for keyfunc in keyfuncs[:-1]: + d2 = d2.setdefault(keyfunc(ft), {}) + d2.setdefault(keyfuncs[-1](ft), FeatureList()).append(ft) + return d + @property def d(self): """ - Alias for `FeatureList.todict()` + Alias for ``FeatureList.groupby('seqid')`` """ - return self.todict() + return self.groupby('seqid') @property def loc_range(self): @@ -728,22 +760,35 @@ def reverse(self): ft.reverse() return self - def sort(self, key=None, reverse=False): + def sort(self, keys=None, reverse=False): """ Sort features in-place - :param key: Key to use for sorting. - Should be a string, which is expected to be a valid attribute in the metadata of each feature. - Alternatively, the key sort function can be specified directly. + :param keys: Tuple of meta keys or functions to use for sorting. + None can be used as a single value or in the tuple + to apply the default sorting by position. + May also be a single string or callable. :param reverse: Use reversed order (default: False) :return: Sorted features + + .. 
rubric:: Example: + + >>> from sugar import read_fts + >>> fts = read_fts() + >>> fts.sort(('type', len)); """ - if key is not None and isinstance(key, str): - kfunc = lambda ft: ft.meta[key] - else: - kfunc = key - self.data = sorted(self, key=kfunc, reverse=reverse) + from collections.abc import Iterable + if isinstance(keys, str): + keys = keys.split() + if not isinstance(keys, Iterable): + keys = [keys] + keyfuncs = [ + (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key + for key in keys + ] + for keyfunc in keyfuncs[::-1]: + self.data = sorted(self.data, key=keyfunc, reverse=reverse) return self def copy(self): diff --git a/sugar/tests/test_core_seq.py b/sugar/tests/test_core_seq.py index 24d092d..f6d7758 100644 --- a/sugar/tests/test_core_seq.py +++ b/sugar/tests/test_core_seq.py @@ -197,7 +197,7 @@ def test_orf(): seqs=read() orfs = seqs[0].find_orfs() assert len(orfs) > 0 - longest_orf = orfs.sort()[-1] + longest_orf = orfs.sort(len)[-1] assert seqs[0][longest_orf] == seqs[0]['cds'] orfs2 = seqs[0].find_orfs(rf='both') @@ -205,4 +205,4 @@ def test_orf(): orfs = seqs.find_orfs() for id_ in seqs.ids: - assert seqs.d[id_][orfs.d[id_].sort()[-1]] == seqs.d[id_]['cds'] + assert seqs.d[id_][orfs.d[id_].sort(len)[-1]] == seqs.d[id_]['cds']