Skip to content

Commit

Permalink
refactor groupby and sort and make them more powerful + docs
Browse files Browse the repository at this point in the history
  • Loading branch information
trichter committed Oct 23, 2024
1 parent a0e1be5 commit 82408a1
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 30 deletions.
60 changes: 43 additions & 17 deletions sugar/core/cane.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,29 @@


class BioMatch(object):
"""
The BioMatch object is returned by `~cane.match()` and the different match methods.
It is designed to behave as the original `python:re.Match` object.
See there for available methods.
Additionally, it has the `BioMatch.rf` attribute which holds the
reading frame (between -3 and 2, inclusive) of the match.
.. rubric:: Example:
>>> match = read()[0].match('AT.')
>>> match
<sugar.BioMatch object; seqid=AB047639; rf=2; span=(11, 14); match=ATA>
>>> print(match.group(), match.rf)
ATA 2
"""
def __init__(self, match, rf=None, lenseq=None, seqid=None):
#: original Match object
self._match = match
#: reading frame (-3 to 2)
self.rf = rf
self.lenseq = lenseq
#: sequence id
self.seqid = seqid
def __getattr__(self, attr):
return getattr(self._match, attr)
Expand All @@ -28,7 +47,22 @@ def span(self):


class BioMatchList(collections.UserList):
"""
List of `BioMatch` objects
"""
def groupby(self, attr='rf'):
"""
Group matches
TODO: Re-use groupby
:param str attr: Which attribute to group by.
Allowed values: rf, seqid, both,
default: rf
:return: dict with the group-by attributes as keys and BioMatchLists as values.
For ``'both'`` returns a nested dictionary,
outer keys are seqids, inner keys are reading frames
"""
assert attr in ('rf', 'seqid', 'both')
d = {}
if attr == 'both':
Expand All @@ -41,21 +75,18 @@ def groupby(self, attr='rf'):

@property
def d(self):
"""
Group matches by seqid, alias for ``BioMatchList.groupby('seqid')``
"""
return self.groupby('seqid')


class ORFList(FeatureList):
def groupby(self, attr='rf'):
assert attr in ('rf', 'seqid', 'both')
d = {}
if attr == 'both':
for ft in self:
d.setdefault(ft.seqid, {}).setdefault(ft.meta.rf, ORFList()).append(ft)
else:
for ft in self:
d.setdefault(getattr(ft.meta, attr), ORFList()).append(ft)
return d
"""
List of open reading frames (ORFs)
This object is a FeatureList with additional methods.
"""
def filter(self, minlen=0, rfs=None):
orfs = []
for orf in self:
Expand All @@ -64,11 +95,6 @@ def filter(self, minlen=0, rfs=None):
self.data = orfs
return self

def sort(self, s=('len',), reverse=False):
for k in s[::-1]:
self.data = sorted(self, key=len if k == 'len' else (lambda ft: ft.meta.get(k)), reverse=reverse)
return self

@property
def d(self):
return self.groupby('seqid')
Expand Down Expand Up @@ -118,8 +144,8 @@ def find_orfs(seq, rf='fwd', start='start', stop='stop', need_start='always', ne
# rf 0, 1, 2, -1, -2, -3, 'fwd', 'bwd', 'both'
assert need_start in ('never', 'always', 'once')
if need_start in ('once', 'always'):
starts = seq.matchall(start, rf=rf, gap=gap).groupby()
stops = seq.matchall(stop, rf=rf, gap=gap).groupby()
starts = seq.matchall(start, rf=rf, gap=gap).groupby('rf')
stops = seq.matchall(stop, rf=rf, gap=gap).groupby('rf')
if rf == 'fwd':
rf = (0, 1, 2)
elif rf == 'bwd':
Expand Down
78 changes: 67 additions & 11 deletions sugar/core/fts.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,12 +635,55 @@ def todict(self):
d.setdefault(seqid, FeatureList()).append(ft)
return d

def groupby(self, keys=('seqid',)):
    """
    Group features

    :param keys: Tuple of meta keys or functions to use for grouping.
        May also be a single string or callable.
        A string is split on whitespace, so ``'seqid type'`` groups
        by two meta keys.
        String keys are looked up with ``ft.meta.get(key)``,
        callables are called with the feature.
        By default the method groups by only seqid.
    :return: Nested dict structure, one nesting level per key,
        the innermost values are FeatureLists

    .. rubric:: Example:

    >>> from sugar import read_fts
    >>> fts = read_fts()
    >>> fts.groupby('type');
    """
    from collections.abc import Iterable
    if isinstance(keys, str):
        keys = keys.split()
    if not isinstance(keys, Iterable):
        # a single callable was passed
        keys = [keys]
    # Bind key as a default argument: a plain closure would look up the
    # comprehension variable late, so every lambda would use the last key.
    keyfuncs = [
        (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key
        for key in keys
    ]
    grouped = {}
    for ft in self:
        level = grouped
        # descend (and create) one dict level for each key but the last
        for keyfunc in keyfuncs[:-1]:
            level = level.setdefault(keyfunc(ft), {})
        # the last key maps to the list collecting the features
        level.setdefault(keyfuncs[-1](ft), FeatureList()).append(ft)
    return grouped

@property
def d(self):
    """
    Alias for ``FeatureList.groupby('seqid')``
    """
    return self.groupby('seqid')

@property
def loc_range(self):
Expand Down Expand Up @@ -728,22 +771,35 @@ def reverse(self):
ft.reverse()
return self

def sort(self, keys=None, reverse=False):
    """
    Sort features in-place

    :param keys: Tuple of meta keys or functions to use for sorting.
        None can be used as a single value or in the tuple
        to apply the default sorting by position.
        May also be a single string or callable.
        A string is split on whitespace, so ``'seqid type'`` sorts
        by two meta keys.
        String keys are looked up with ``ft.meta.get(key)``,
        callables are used directly as sort key functions.
    :param reverse: Use reversed order (default: False)
    :return: Sorted features

    .. rubric:: Example:

    >>> from sugar import read_fts
    >>> fts = read_fts()
    >>> fts.sort(('type', len));
    """
    from collections.abc import Iterable
    if isinstance(keys, str):
        keys = keys.split()
    if not isinstance(keys, Iterable):
        # a single callable or None was passed
        keys = [keys]
    # Bind key as a default argument: a plain closure would look up the
    # comprehension variable late, so every lambda would use the last key.
    keyfuncs = [
        (lambda ft, key=key: ft.meta.get(key)) if isinstance(key, str) else key
        for key in keys
    ]
    # sorted() is stable, so applying the keys from last to first
    # makes the first key the primary sort criterion
    for keyfunc in keyfuncs[::-1]:
        self.data = sorted(self.data, key=keyfunc, reverse=reverse)
    return self

def copy(self):
Expand Down
4 changes: 2 additions & 2 deletions sugar/tests/test_core_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,12 @@ def test_orf():
seqs=read()
orfs = seqs[0].find_orfs()
assert len(orfs) > 0
longest_orf = orfs.sort()[-1]
longest_orf = orfs.sort(len)[-1]
assert seqs[0][longest_orf] == seqs[0]['cds']

orfs2 = seqs[0].find_orfs(rf='both')
assert len(orfs2) > len(orfs)

orfs = seqs.find_orfs()
for id_ in seqs.ids:
assert seqs.d[id_][orfs.d[id_].sort()[-1]] == seqs.d[id_]['cds']
assert seqs.d[id_][orfs.d[id_].sort(len)[-1]] == seqs.d[id_]['cds']

0 comments on commit 82408a1

Please sign in to comment.