diff --git a/setup.py b/setup.py index c6c9d6f..023fdc0 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read(name): setup( name="arpa", - version="0.1.0a2", + version="0.1.0a3", author="Stefan Fischer", author_email="sfischer13@ymail.com", url="https://github.com/sfischer13/python-arpa/", @@ -21,13 +21,15 @@ def read(name): description="This is a library for reading ARPA n-gram models.", long_description=read("README.rst"), keywords="ARPA n-gram ngram language model LM language technology LT " - "natural language processing NLP", + "computational linguistics CL natural language processing NLP", install_requires=["enum34"], packages=find_packages("src"), package_dir={"": "src"}, classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python", @@ -36,6 +38,8 @@ def read(name): "Programming Language :: Python :: 3.2", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing", diff --git a/src/arpa/__init__.py b/src/arpa/__init__.py index 113a593..207b1da 100644 --- a/src/arpa/__init__.py +++ b/src/arpa/__init__.py @@ -22,10 +22,15 @@ """ This is a library for reading ARPA n-gram models. -It may be imported directly: +The package may be imported directly:: import arpa +Details about the ARPA n-gram format can be found here: + +- `SRILM <http://www.speech.sri.com/projects/srilm/manpages/ngram-format.5.html>`_ +- `ICSI Speech <http://www1.icsi.berkeley.edu/Speech/docs/HTKBook3.2/node213_mn.html>`_ + The library was initiated by Stefan Fischer and is developed and maintained by many others. 
""" @@ -40,4 +45,4 @@ __date__ = "2015-07-05" __license__ = "MIT" __status__ = "development" -__version__ = "0.1.0a2" +__version__ = "0.1.0a3" diff --git a/src/arpa/arpa.py b/src/arpa/arpa.py index d754c10..38776f5 100644 --- a/src/arpa/arpa.py +++ b/src/arpa/arpa.py @@ -5,22 +5,26 @@ def dump(obj, fp): + """Serialize obj to fp (a file-like object) in ARPA format.""" raise NotImplementedError # TODO: obj.write(fp) def dumpf(obj, path, mode="wt", encoding=None): + """Serialize obj to path in ARPA format.""" with open(path, mode=mode, encoding=encoding) as f: dump(obj, f) def dumps(obj): + """Serialize obj to an ARPA formatted str.""" with StringIO() as f: dump(obj, f) return f.getvalue() def load(fp, model=None, parser=None): + """Deserialize fp (a file-like object) to a Python object.""" if not model: model = "simple" if not parser: @@ -38,10 +42,12 @@ def load(fp, model=None, parser=None): def loadf(path, mode="rt", encoding=None, model=None, parser=None): + """Deserialize path (a text file) to a Python object.""" with open(path, mode=mode, encoding=encoding) as f: return load(f, model, parser) def loads(s, model=None, parser=None): + """Deserialize s (a str) to a Python object.""" with StringIO(s) as f: return load(f, model, parser) diff --git a/src/arpa/exceptions.py b/src/arpa/exceptions.py index c393e56..15a0c1d 100644 --- a/src/arpa/exceptions.py +++ b/src/arpa/exceptions.py @@ -2,9 +2,10 @@ class ARPAException(Exception): - """Base exception that is never raised.""" + """Common base class for all package exceptions.""" pass class ParseException(ARPAException): + """ARPA file could not be parsed.""" pass diff --git a/src/arpa/models/base.py b/src/arpa/models/base.py index ae29dc1..0f9602b 100644 --- a/src/arpa/models/base.py +++ b/src/arpa/models/base.py @@ -15,15 +15,7 @@ def add_entry(self, ngram, p, bo=None, order=None): pass def log10_p(self, ngram): - if not ngram: - raise ValueError - elif isinstance(ngram, str): - ngram = self._str2tuple(ngram) - elif 
isinstance(ngram, list): - ngram = tuple(ngram) - elif not isinstance(ngram, tuple): - raise ValueError - + ngram = self._check_input(ngram) try: return self._log10_p(ngram) except KeyError: @@ -33,9 +25,16 @@ def log10_p(self, ngram): log10_bo = 0 return log10_bo + self.log10_p(ngram[1:]) + def log10_s(self, sentence): + words = self._check_input(sentence) + return sum(self.log10_p(words[:i]) for i in range(1, len(words) + 1)) + def p(self, ngram): return 10 ** self.log10_p(ngram) + def s(self, sentence): + return 10 ** self.log10_s(sentence) + @abstractmethod def _log10_bo(self, ngram): pass @@ -45,5 +44,14 @@ def _log10_p(self, ngram): pass @staticmethod - def _str2tuple(s): - return tuple(s.strip().split(" ")) + def _check_input(input): + if not input: + raise ValueError + elif isinstance(input, tuple): + return input + elif isinstance(input, list): + return tuple(input) + elif isinstance(input, str): + return tuple(input.strip().split(" ")) + else: + raise ValueError