Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: sfischer13/python-arpa
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 0.1.0a2
Choose a base ref
...
head repository: sfischer13/python-arpa
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 0.1.0a3
Choose a head ref
  • 5 commits
  • 5 files changed
  • 1 contributor

Commits on Jul 6, 2015

  1. Copy the full SHA
    a92005f View commit details

Commits on Jul 7, 2015

  1. documentation

    sfischer13 committed Jul 7, 2015
    Copy the full SHA
    e53010f View commit details

Commits on Jul 8, 2015

  1. documentation

    sfischer13 committed Jul 8, 2015
    Copy the full SHA
    34e7866 View commit details

Commits on Jul 9, 2015

  1. Copy the full SHA
    b6f1100 View commit details

Commits on Jul 10, 2015

  1. Copy the full SHA
    403df88 View commit details
Showing with 40 additions and 16 deletions.
  1. +6 −2 setup.py
  2. +7 −2 src/arpa/__init__.py
  3. +6 −0 src/arpa/arpa.py
  4. +2 −1 src/arpa/exceptions.py
  5. +19 −11 src/arpa/models/base.py
8 changes: 6 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -13,21 +13,23 @@ def read(name):

setup(
name="arpa",
version="0.1.0a2",
version="0.1.0a3",
author="Stefan Fischer",
author_email="sfischer13@ymail.com",
url="https://github.com/sfischer13/python-arpa/",
license="MIT",
description="This is a library for reading ARPA n-gram models.",
long_description=read("README.rst"),
keywords="ARPA n-gram ngram language model LM language technology LT "
"natural language processing NLP",
"computational linguistics CL natural language processing NLP",
install_requires=["enum34"],
packages=find_packages("src"),
package_dir={"": "src"},
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
@@ -36,6 +38,8 @@ def read(name):
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Text Processing",
9 changes: 7 additions & 2 deletions src/arpa/__init__.py
Original file line number Diff line number Diff line change
@@ -22,10 +22,15 @@
"""
This is a library for reading ARPA n-gram models.
It may be imported directly:
The package may be imported directly::
import arpa
Details about the ARPA n-gram format can be found here:
- `SRILM <http://www.speech.sri.com/projects/srilm/manpages/ngram-format.5.html>`_
- `ICSI Speech <http://www1.icsi.berkeley.edu/Speech/faq/grammarfmts.html>`_
The library was initiated by Stefan Fischer and is developed and maintained by many others.
"""

@@ -40,4 +45,4 @@
__date__ = "2015-07-05"
__license__ = "MIT"
__status__ = "development"
__version__ = "0.1.0a2"
__version__ = "0.1.0a3"
6 changes: 6 additions & 0 deletions src/arpa/arpa.py
Original file line number Diff line number Diff line change
@@ -5,22 +5,26 @@


def dump(obj, fp):
    """Serialize ``obj`` to ``fp`` (a file-like object) in ARPA format.

    Writing is not implemented yet, so every call raises
    ``NotImplementedError``.
    """
    # TODO: obj.write(fp)
    raise NotImplementedError


def dumpf(obj, path, mode="wt", encoding=None):
    """Serialize ``obj`` in ARPA format to the file located at ``path``.

    ``mode`` and ``encoding`` are passed on to ``open``. The file is closed
    again when serialization finishes or raises.
    """
    target = open(path, mode=mode, encoding=encoding)
    with target:
        dump(obj, target)


def dumps(obj):
    """Serialize ``obj`` and return the ARPA formatted text as a str."""
    buffer = StringIO()
    try:
        dump(obj, buffer)
        return buffer.getvalue()
    finally:
        # Mirrors the context-manager exit: always release the buffer.
        buffer.close()


def load(fp, model=None, parser=None):
"""Deserialize fp (a file-like object) to a Python object."""
if not model:
model = "simple"
if not parser:
@@ -38,10 +42,12 @@ def load(fp, model=None, parser=None):


def loadf(path, mode="rt", encoding=None, model=None, parser=None):
    """Deserialize the text file at ``path`` to a Python object.

    ``mode`` and ``encoding`` are passed on to ``open``; ``model`` and
    ``parser`` are passed on to ``load`` unchanged.
    """
    source = open(path, mode=mode, encoding=encoding)
    with source:
        result = load(source, model, parser)
    return result


def loads(s, model=None, parser=None):
    """Deserialize ``s`` (a str holding ARPA text) to a Python object.

    ``model`` and ``parser`` are passed on to ``load`` unchanged.
    """
    buffer = StringIO(s)
    try:
        return load(buffer, model, parser)
    finally:
        # Mirrors the context-manager exit: always release the buffer.
        buffer.close()
3 changes: 2 additions & 1 deletion src/arpa/exceptions.py
Original file line number Diff line number Diff line change
@@ -2,9 +2,10 @@


class ARPAException(Exception):
    """Common base class for all package exceptions."""


class ParseException(ARPAException):
    """Signals that an ARPA file could not be parsed."""
30 changes: 19 additions & 11 deletions src/arpa/models/base.py
Original file line number Diff line number Diff line change
@@ -15,15 +15,7 @@ def add_entry(self, ngram, p, bo=None, order=None):
pass

def log10_p(self, ngram):
if not ngram:
raise ValueError
elif isinstance(ngram, str):
ngram = self._str2tuple(ngram)
elif isinstance(ngram, list):
ngram = tuple(ngram)
elif not isinstance(ngram, tuple):
raise ValueError

ngram = self._check_input(ngram)
try:
return self._log10_p(ngram)
except KeyError:
@@ -33,9 +25,16 @@ def log10_p(self, ngram):
log10_bo = 0
return log10_bo + self.log10_p(ngram[1:])

def log10_s(self, sentence):
    """Return the base-10 log score of ``sentence``.

    The argument is normalised by ``_check_input``. The score is the sum of
    ``log10_p`` over every prefix of the resulting word tuple, from the
    first word alone up to the complete sentence.
    """
    words = self._check_input(sentence)
    total = 0
    for end in range(1, len(words) + 1):
        total += self.log10_p(words[:end])
    return total

def p(self, ngram):
    """Return the probability of ``ngram``: 10 raised to ``log10_p(ngram)``."""
    exponent = self.log10_p(ngram)
    return 10 ** exponent

def s(self, sentence):
    """Return the score of ``sentence``: 10 raised to ``log10_s(sentence)``."""
    exponent = self.log10_s(sentence)
    return 10 ** exponent

@abstractmethod
def _log10_bo(self, ngram):
    """Abstract hook to be implemented by concrete models.

    NOTE(review): presumably returns the base-10 log backoff weight stored
    for ``ngram`` -- confirm against the concrete model classes.
    """
@@ -45,5 +44,14 @@ def _log10_p(self, ngram):
pass

@staticmethod
def _str2tuple(s):
    """Split a space-separated string into a tuple of tokens."""
    return tuple(s.strip().split(" "))

@staticmethod
def _check_input(input):
    """Normalise an n-gram or sentence argument to a non-empty tuple.

    Accepts a tuple (returned unchanged), a list (converted to a tuple) or
    a str (split on whitespace into tokens).

    Raises:
        ValueError: if ``input`` is empty, is a str made up only of
            whitespace, or has an unsupported type.
    """
    if not input:
        raise ValueError("input must not be empty")
    if isinstance(input, tuple):
        return input
    if isinstance(input, list):
        return tuple(input)
    if isinstance(input, str):
        # split() with no separator collapses runs of whitespace, so repeated
        # spaces or tabs never produce empty-string tokens.
        tokens = tuple(input.split())
        if not tokens:
            # A whitespace-only string is truthy but holds no words.
            raise ValueError("input must contain at least one token")
        return tokens
    raise ValueError("unsupported input type: %s" % type(input).__name__)