From e73e95aa3225cb542477d3f1a34d068cfe5856ba Mon Sep 17 00:00:00 2001 From: Tom Eulenfeld Date: Wed, 23 Oct 2024 01:30:40 +0200 Subject: [PATCH] rename BioSeq.get() and BioBasket.get() to getitem(), rename Feature.overlap() to overlaps(), more docs --- docs/conf.py | 4 +- docs/src/sugar.core.rst | 1 - sugar/core/fts.py | 85 +++++++++++++++++--- sugar/core/meta.py | 5 ++ sugar/core/seq.py | 169 +++++++++++++++++++++++++++++++-------- sugar/tests/test_docs.py | 12 ++- 6 files changed, 229 insertions(+), 47 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index c27b099..e1c8789 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,6 +60,7 @@ def download_logo(): .. _Infernal: http://eddylab.org/infernal/ .. _MMseqs2: https://github.com/soedinglab/MMseqs2 .. _Stockholm: https://en.wikipedia.org/wiki/Stockholm_format +.. _str: https://docs.python.org/3/library/stdtypes.html#string-methods """ extensions = ['sphinx.ext.autodoc', @@ -94,5 +95,6 @@ def download_logo(): intersphinx_mapping = { - 'python': ('https://docs.python.org/3/', None) + 'python': ('https://docs.python.org/3/', None), + #'seaborn': ('https://seaborn.pydata.org/', None), } diff --git a/docs/src/sugar.core.rst b/docs/src/sugar.core.rst index d32eab6..f11a080 100644 --- a/docs/src/sugar.core.rst +++ b/docs/src/sugar.core.rst @@ -19,4 +19,3 @@ Submodules :hidden: sugar.core.* - diff --git a/sugar/core/fts.py b/sugar/core/fts.py index 1f27512..60ef2a6 100644 --- a/sugar/core/fts.py +++ b/sugar/core/fts.py @@ -45,9 +45,6 @@ import sys from enum import Flag, StrEnum, auto from sugar.core.meta import Meta -# from .sequence import Sequence -# from ..copyable import Copyable -# from .seqtypes import NucleotideSequence class Defect(Flag): @@ -117,7 +114,7 @@ def __init__(self, start, stop, strand=Strand.FORWARD, Inclusive ending base or residue position of the feature. strand : Strand The strand direction. - Always :attr:`Strand.FORWARD` for peptide features. + Always `Strand.FORWARD` for peptide features. 
defect : Defect A possible defect of the location. """ @@ -149,7 +146,10 @@ def __hash__(self): @property def stride(self): - return 1 if self.strand == Strand.FORWARD else -1 + """ + Stride is -1 for the negative strand, else +1 + """ + return -1 if self.strand == '-' else 1 def __len__(self): return self.stop - self.start @@ -179,6 +179,7 @@ def defect(self, v): def _slice_locs(seq, locs, splitter=None, filler=None, gap=None): + # TODO document # Concatenate subsequences for each location of the feature strand = None for loc in locs: @@ -202,7 +203,7 @@ def _slice_locs(seq, locs, splitter=None, filler=None, gap=None): # slice_start = loc.start - seq._seqstart # slice_stop = loc.stop - seq._seqstart # add_seq = seq[slice_start:slice_stop] - add_seq = seq.get(slice(loc.start, loc.stop), gap=gap) + add_seq = seq.getitem(slice(loc.start, loc.stop), gap=gap) if loc.strand == '-': add_seq = add_seq.reverse().complement() if filler is not None and prev_loc is not None: @@ -245,7 +246,7 @@ class Feature(): :param dict meta: The metadata describing the feature. - ..note:: + .. note:: The following metadata attributes can be accessed directly as an attribute of Feature: *type*, *name*, *id* and *seqid*. 
For example the feature id can be obtained by both `Feature.id` @@ -268,6 +269,9 @@ def __init__(self, type=None, locs=None, start=None, stop=None, meta=None): @property def type(self): + """ + Alias for ``Feature.meta.type`` + """ return self.meta.get('type') @type.setter @@ -276,6 +280,9 @@ def type(self, value): @property def id(self): + """ + Alias for ``Feature.meta.id`` + """ return self.meta.get('id') @id.setter @@ -284,6 +291,9 @@ def id(self, value): @property def seqid(self): + """ + Alias for ``Feature.meta.seqid`` + """ return self.meta.get('seqid') @seqid.setter @@ -292,6 +302,9 @@ def seqid(self, value): @property def name(self): + """ + Alias for ``Feature.meta.name`` + """ return self.meta.get('name') @name.setter @@ -328,6 +341,9 @@ def _slice(self): # needs to return list of slices @property def loc(self): + """ + Access first location + """ l, *_ = self.locs return l @@ -375,7 +391,10 @@ def __len__(self): lr = self.loc_range return lr[1] - lr[0] - def overlap(self, other): + def overlaps(self, other): + """ + Whether the location range overlaps with the other feature + """ if not isinstance(other, Feature): raise NotImplementedError() lr1 = self.loc_range @@ -390,6 +409,11 @@ def __sub__(self, other): return (sum(lr1) - sum(lr2)) // 2 def reverse(self): + """ + Reverse the feature. + + After the operation the feature will be located on the reverse complement strand. + """ ft = self for loc in ft.locs: loc.start, loc.stop = -loc.stop, -loc.start @@ -413,7 +437,7 @@ def write(self, *args, **kw): class FeatureList(collections.UserList): def __init__(self, data=None): """ - A `FeatureList` is a set of features belonging to one sequence. + A `FeatureList` is a set of features belonging to one or several sequences. 
Its advantage over a simple list is the base/residue position based indexing: @@ -504,6 +528,9 @@ def _repr_pretty_(self, p, cycle): p.text(str(self)) def tostr(self, raw=False, w=80, wt=12, wl=20, h=80, exclude_fts=()): + """ + Return string with information about features, used by ``__str__()`` method + """ def _sort_meta_key(m): order = ['name', 'gene'] try: @@ -561,11 +588,19 @@ def _sort_meta_key(m): return '\n'.join(out) def tofmtstr(self, fmt, **kw): + """ + Write features to a string of specified format, see `~.main.write_fts()` + """ out = io.StringIO() self.write(out, fmt=fmt, **kw) return out.getvalue() def get(self, type): + """ + Return the first feature of specified feature type, e.g. ``'cds'`` + + :param type: String or list of multiple strings + """ type_ = type if isinstance(type_, tuple): type_ = tuple(t.lower() for t in type_) @@ -575,6 +610,11 @@ def get(self, type): return ft def select(self, type): + """ + Return new `FeatureList` with all features of specified feature type, e.g. 
``'cds'`` + + :param type: String or list of multiple strings + """ type_ = type if isinstance(type_, tuple): type_ = tuple(t.lower() for t in type_) @@ -586,6 +626,9 @@ def select(self, type): return FeatureList(fts) def todict(self): + """ + Return a dictionary with sequence ids as keys and FeatureLists as values + """ d = {} for ft in self: seqid = ft.meta.get('seqid', '') @@ -594,6 +637,9 @@ def todict(self): @property def d(self): + """ + Alias for `FeatureList.todict()` + """ return self.todict() @property @@ -629,6 +675,9 @@ def write(self, fname, fmt=None, **kw): def slice(self, start, stop): + """ + Return a sub-annotation between start and stop + """ if start is None: i_start = -sys.maxsize else: @@ -670,11 +719,26 @@ def slice(self, start, stop): return sub_annot def reverse(self): + """ + Reverse all features, see `Feature.reverse()` + + :return: Reversed features + """ for ft in self: ft.reverse() return self def sort(self, key=None, reverse=False): + """ + Sort features in-place + + :param key: Key to use for sorting. + Should be a string, which is expected to be a valid attribute in the metadata of each feature. + Alternatively, the key sort function can be specified directly. 
+ :param reverse: Use reversed order (default: False) + + :return: Sorted features + """ if key is not None and isinstance(key, str): kfunc = lambda ft: ft.meta[key] else: @@ -683,6 +747,9 @@ def sort(self, key=None, reverse=False): return self def copy(self): + """ + Return a deep copy of the object + """ return deepcopy(self) diff --git a/sugar/core/meta.py b/sugar/core/meta.py index 3eef136..9997e2c 100644 --- a/sugar/core/meta.py +++ b/sugar/core/meta.py @@ -66,9 +66,11 @@ def __getattr__(self, name): __delattr__ = __delitem__ def copy(self): + """Return a deep copy of the object""" return copy.deepcopy(self) def update(self, adict={}): + """Update from other mapping or iterable""" for (key, value) in adict.items(): self.__setitem__(key, value) @@ -93,6 +95,9 @@ def _repr_pretty_(self, p, cycle): p.text(str(self)) def tostr(self, w=80): + """ + Return string describing the metadata, is used by ``__str__()`` method. + """ def _key2str(): line = f'{k:>{lenkey}}: {self[k]}' if len(line) > w: diff --git a/sugar/core/seq.py b/sugar/core/seq.py index 9fa267f..8f11023 100644 --- a/sugar/core/seq.py +++ b/sugar/core/seq.py @@ -23,15 +23,13 @@ COMPLEMENT_TRANS = str.maketrans(COMPLEMENT_ALL) -class _Slicable_GetItemInplace(): - def __init__(self, seq): - self.seq = seq +class _Slicable_GetItem(): + def __init__(self, obj, **kw): + self.obj = obj + self.kw = kw def __getitem__(self, i): - return self.seq.get(i, inplace=True) - # self.seq.data = self.seq[i].data - # return self.seq - + return self.obj.getitem(i, **self.kw) class _BioSeqStrMethods(): @@ -242,7 +240,7 @@ class MutableMetaString(): def __init__(self, data, id='', meta=None, type=None): #: Namespace holding all available string methods, #: see `_BioSeqStrMethods` for available methods - #: and `str` for documentation of the methods + #: and str_ for documentation of the methods self.str = _BioSeqStrMethods(self) #: Property holding the data string self.data = str(data).upper() @@ -337,7 +335,7 @@ def 
__contains__(self, char): def __len__(self): return len(self.data) - def get(self, index, gap=None): + def getitem(self, index, gap=None): """ TODO """ @@ -349,12 +347,11 @@ def get(self, index, gap=None): index = adj(index) elif isinstance(index, slice): index = slice(adj(index.start), adj(index.stop), index.step) - print(index) return self.__class__(self.data[index], meta=self.meta) def __getitem__(self, index): - return self.get(index) + return self.getitem(index) def __setitem__(self, index, value): l = list(self.data) @@ -446,8 +443,13 @@ def gc(self): @property def i(self): - """TODO""" - return _Slicable_GetItemInplace(self) + """Return slicable object to support in-place slicing + + Deprecated: Use getitem() or sl attribute. + """ + msg = 'BioSeq.i is deprecated, use geitem() method or sl attribute' + warnings.warn(msg, DeprecationWarning, stacklevel=2) + return _Slicable_GetItem(self, inplace=True) def rc(self): """ @@ -457,14 +459,49 @@ def rc(self): def __getitem__(self, index): - return self.get(index) - - def get(self, index, inplace=False, gap=None): - """ - TODO - """ + return self.getitem(index) + + def getitem(self, index, inplace=False, gap=None): + """ + Slice the sequence and return a subsequence + + This is the method which is called if you slice with ``BioSeq[]`` syntax. + If you want to use non-default options call this method directly, + or by the `BioSeq.sl` attribute. + + .. rubric:: Example: + + >>> from sugar import read + >>> seq = read()[0] + >>> print(seq[5:10]) + CCCCT + >>> print(seq[5]) + C + >>> print(seq['cds'][:3]) + ATG + + :param index: Specifies which part of the sequence is returned. + The following types are supported. + + int,slice + location is specified by int or slice + `.Location` + specified by location + `.Feature` + specified by feature + str + position of first feature of given type, e.g. 
``'cds'`` + will return sequence with first coding sequence + :param bool inplace: + The subsequence is not only returned, but the original + sequence is modified in-place (default: False) + :param str gap: + gaps of the given characters will be accounted for when + slicing the sequence (default: gaps will not be accounted for) + """ + # TODO: add correct_fts kwargs try: - subseq = super().get(index, gap=gap) + subseq = super().getitem(index, gap=gap) except: if isinstance(index, str): index = self.fts.get(index) @@ -554,6 +591,26 @@ def get(self, index, inplace=False, gap=None): # return subseq + def sl(self, **kw): + """ + Method allowing to call `BioSeq.getitem()` with non-default options and extended indexing syntax + + Returns a slicable object. Use the ``BioSeq[]`` notation directly if you use default arguments. + + .. rubric:: Example: + + >>> from sugar import read + >>> seq = read()[0] + >>> print(seq[:5]) + ACCTG + >>> print(seq.sl(inplace=True, gap='-')[:5:2]) + ACG + >>> print(seq) # was modified in-place + ACG + """ + return _Slicable_GetItem(self, **kw) + + def biotranslate(self, *args, **kw): from sugar.core.translate import translate import warnings @@ -710,10 +767,9 @@ def __init__(self, data=None, meta=None): #: Namespace holding all available string methods, #: #: The `BioBasket.str` methods call the corresponding `BioSeq.str` methods under the hood - #: and return either the altered `BioBasket` object or a list of results. - #: see `_BioSeqStrMethods` for available methods - #: and `str` for documentation of the methods - + #: and return either the altered `BioBasket` object or a list with results. 
+ #: See `_BioSeqStrMethods` for available methods + #: and str_ for documentation of the methods self.str = _BioBasketStrMethods(self) if data is None: data = [] @@ -800,30 +856,74 @@ def __repr__(self): return f'{type(self).__name__}({super().__repr__()}, meta=dict({metastr}))' def __getitem__(self, i): - return self.get(i) - - def get(self, i, gap=None): - """TODO""" + return self.getitem(i) + + def getitem(self, i, **kw): + """ + Slice sequences + + This is the method which is called if you slice with ``BioBasket[]`` syntax. + If you want to use non-default options call this method directly, + or by the `BioBasket.sl` attribute. + + .. rubric:: Example: + + >>> from sugar import read + >>> seqs = read() + >>> print(seqs[:2, 5:10]) + 2 seqs in basket + AB047639 5 CCCCT ... + AB677533 5 CCCCC ... + >>> print(seqs[:2, 'cds'][:, 0:3]) + 2 seqs in basket + AB047639 3 ATG ... + AB677533 3 ATG ... + + :param i: + Specifies which part of the sequences or which sequences are returned. + + int + Returns a `BioSeq` from the basket + slice + Returns a new `BioBasket` object with a subset of the sequences + str,feature,location + Updates all sequences inside the basket, see `BioSeq.getitem()` + (int, object) + Returns a `BioSeq` from the basket and slices it with the object, see `BioSeq.getitem()` + (slice, object) + Returns a new `BioBasket` object with a subset of the sequences which are replaced + by subsequences according to `BioSeq.getitem()` + :param \*\*kw: + Additional kwargs are passed to `BioSeq.getitem()`. 
+ """ if isinstance(i, int): return self.data[i] elif isinstance(i, slice): seqs = self.__class__(self.data[i], meta=self.meta) elif isinstance(i, (str, Feature, Location)): seqs = self.__class__(self.data, meta=self.meta) - seqs.data = [seq.get(i, gap=gap) for seq in seqs.data] + seqs.data = [seq.getitem(i, **kw) for seq in seqs.data] elif len(i) == 2: i, j = i if isinstance(i, int): - return self.data[i].get(j, gap=gap) + return self.data[i].getitem(j, **kw) elif isinstance(i, slice): seqs = self.__class__(self.data[i], meta=self.meta) - seqs.data = [seq.get(j, gap=gap) for seq in seqs.data] + seqs.data = [seq.getitem(j, **kw) for seq in seqs.data] else: raise TypeError('Index not supported') else: raise TypeError('Index not supported') return seqs + def sl(self, **kw): + """ + Method allowing to call `BioBasket.getitem()` with non-default options and extended indexing syntax + + Returns a slicable object. Use the ``BioBasket[]`` notation directly if you use default arguments. + """ + return _Slicable_GetItem(self, **kw) + def __setitem__(self, i, value): if isinstance(i, (int, slice)): if isinstance(value, (str, BioSeq)): @@ -912,7 +1012,8 @@ def countplot(self, y='letter', x='count', hue='id', order=None, plot='show', This method might undergo disrupting changes or it might be removed in a later version. Under the hood this method uses pandas and seaborn libraries. - For a help on most arguments, see ``seaborn.barplot()``. + For a help on most arguments, see + `seaborn.barplot() <https://seaborn.pydata.org/generated/seaborn.barplot.html>`_. 
""" import matplotlib.pyplot as plt import seaborn as sns @@ -1008,7 +1109,7 @@ def find_orfs(self, *args, **kw): def tofmtstr(self, fmt, **kw): """ - Write object to a string of specified format, see `~.main.write()` + Write sequences to a string of specified format, see `~.main.write()` """ out = io.StringIO() self.write(out, fmt=fmt, **kw) @@ -1016,7 +1117,7 @@ def tofmtstr(self, fmt, **kw): def tostr(self, h=19, w=80, wid=19, wlen=4, showgc=True, add_hint=False, raw=False): """ - Return string with information about sequences, used by ``__str__`` magic + Return string with information about sequences, used by ``__str__()`` method """ if raw: return '\n'.join(str(seq) for seq in self) diff --git a/sugar/tests/test_docs.py b/sugar/tests/test_docs.py index 573e0a8..ec930bd 100644 --- a/sugar/tests/test_docs.py +++ b/sugar/tests/test_docs.py @@ -5,10 +5,10 @@ def doctest_module(m): - from doctest import testmod + from doctest import testmod, ELLIPSIS raised = False try: - testmod(m, raise_on_error=True) + testmod(m, raise_on_error=True, optionflags=ELLIPSIS) except Exception: raised = True if raised: @@ -26,6 +26,14 @@ def test_docs_core(): from sugar import core doctest_module(core) +def test_docs_core_seq(): + from sugar.core import seq + doctest_module(seq) + +def test_docs_core_meta(): + from sugar.core import meta + doctest_module(meta) + def test_docs_io(): from sugar import _io doctest_module(_io)