diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ef763d5..525b112 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,6 +49,11 @@ pipenv run flake8 . Documentation ------------- +```sh +cd docs +pipenv run sphinx-apidoc -f -o . ../arpa +``` + ```sh cd docs pipenv run make html diff --git a/HISTORY.md b/HISTORY.md index f24e2f8..e592f70 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,9 @@ You should [Keep a CHANGELOG](https://keepachangelog.com/), too! ### Security +[0.1.0b4](https://github.com/sfischer13/python-arpa/compare/0.1.0b3...0.1.0b4) - 2018-12-12 +------------------------------------------------------------------------------------------- + [0.1.0b3](https://github.com/sfischer13/python-arpa/compare/0.1.0b2...0.1.0b3) - 2018-12-06 ------------------------------------------------------------------------------------------- diff --git a/MANIFEST.in b/MANIFEST.in index ff264c2..5a7c957 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -22,6 +22,7 @@ recursive-include docs *.rst recursive-include tests * recursive-exclude tests *.arpa +recursive-exclude tests *.arpa.* recursive-exclude * __pycache__ recursive-exclude * *.py[co] diff --git a/Pipfile.lock b/Pipfile.lock index 9fc215e..83b9d16 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -218,10 +218,10 @@ }, "idna": { "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" ], - "version": "==2.7" + "version": "==2.8" }, "imagesize": { "hashes": [ @@ -410,10 +410,10 @@ }, "requests": { "hashes": [ - "sha256:65b3a120e4329e33c9889db89c80976c5272f56ea92d3e74da8a463992e3ff54", - "sha256:ea881206e59f41dbd0bd445437d792e43906703fff75ca8ff43ccdb11f33f263" + "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", + "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" ], - "version": "==2.20.1" + "version": "==2.21.0" }, "requests-toolbelt": { "hashes": [ @@ -431,10 +431,10 @@ }, "six": { "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "version": "==1.11.0" + "version": "==1.12.0" }, "snowballstemmer": { "hashes": [ diff --git a/README.md b/README.md index f4308e6..27e8508 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,9 @@ Python ARPA Package =================== -[![PyPI Version](https://img.shields.io/pypi/v/arpa.svg)](https://pypi.python.org/pypi/arpa) [![Documentation Status](https://readthedocs.org/projects/arpa/badge/?version=latest)](https://arpa.readthedocs.io/en/latest/?badge=latest) [![Travis](https://img.shields.io/travis/sfischer13/python-arpa.svg)](https://travis-ci.org/sfischer13/python-arpa) [![Coverage Status](https://coveralls.io/repos/sfischer13/python-arpa/badge.svg?branch=master&service=github)](https://coveralls.io/github/sfischer13/python-arpa?branch=master) - Python library for reading ARPA n-gram models. -It was initiated by Stefan Fischer and is developed and maintained by many others. -- [Documentation](https://readthedocs.org/projects/arpa/badge/?version=latest) is available. +- [Documentation](https://arpa.readthedocs.io/en/latest/) is available. - [Changes](https://github.com/sfischer13/python-arpa/blob/master/HISTORY.md) between releases are documented. - [Bugs](https://github.com/sfischer13/python-arpa/issues) can be reported on the issue tracker. - [Questions](mailto:sfischer13@ymail.com) can be asked via e-mail. @@ -15,18 +12,31 @@ It was initiated by Stefan Fischer and is developed and maintained by many other Setup ----- -[![PyPI Python Versions](https://img.shields.io/pypi/pyversions/arpa.svg)](https://pypi.python.org/pypi/arpa) +### Python 3.4+ + +[![PyPI Python Versions](https://img.shields.io/pypi/pyversions/arpa.svg)](https://pypi.python.org/pypi/arpa) [![PyPI Version](https://img.shields.io/pypi/v/arpa.svg)](https://pypi.python.org/pypi/arpa) + +In order to install the Python 3 version: + + $ pip install --user -U arpa -The package is available on [PyPI](https://pypi.python.org/pypi/arpa): +### Python 2.7 - $ pip install arpa +[![PyPI Python Versions](https://img.shields.io/pypi/pyversions/arpa-backport.svg)](https://pypi.python.org/pypi/arpa-backport) [![PyPI Version](https://img.shields.io/pypi/v/arpa-backport.svg)](https://pypi.python.org/pypi/arpa-backport) + +In order to install the Python 2.7 version: + + $ pip install --user -U arpa-backport Usage ----- The package may be imported directly: - import arpa + import arpa # Python 3.4+ + # OR + import arpa_backport as arpa # Python 2.7 + models = arpa.loadf("foo.arpa") lm = models[0] # ARPA files may contain several models. @@ -42,9 +52,12 @@ The package may be imported directly: lm.s("This is the end .", sos=False, eos=False) lm.log_s("This is the end .", sos=False, eos=False) -Contribute ----------- +Development +----------- + +[![Travis](https://img.shields.io/travis/sfischer13/python-arpa.svg)](https://travis-ci.org/sfischer13/python-arpa) [![Documentation Status](https://readthedocs.org/projects/arpa/badge/?version=latest)](https://arpa.readthedocs.io/en/latest/?badge=latest) [![Coverage Status](https://coveralls.io/repos/sfischer13/python-arpa/badge.svg?branch=master&service=github)](https://coveralls.io/github/sfischer13/python-arpa?branch=master) +*Contributions are welcome!* Write a bug report or send a pull request. Other [contributors](https://github.com/sfischer13/python-arpa/graphs/contributors) have done so before. diff --git a/arpa/__init__.py b/arpa/__init__.py index 24bf523..a7105a3 100644 --- a/arpa/__init__.py +++ b/arpa/__init__.py @@ -41,8 +41,8 @@ __author__ = 'Stefan Fischer' __contact__ = 'Stefan Fischer ' __copyright__ = 'Copyright (c) 2015-2018 Stefan Fischer' -__credits__ = [] -__date__ = '2018-12-06' +__credits__ = ['Stefan Fischer'] +__date__ = '2018-12-12' __license__ = 'MIT' __status__ = 'development' -__version__ = '0.1.0b3' +__version__ = '0.1.0b4' diff --git a/arpa/api.py b/arpa/api.py index 6831bf4..ad0b2da 100644 --- a/arpa/api.py +++ b/arpa/api.py @@ -1,3 +1,5 @@ +import gzip + from io import StringIO from .models.simple import ARPAModelSimple @@ -9,10 +11,15 @@ def dump(obj, fp): obj.write(fp) -def dumpf(obj, path, mode='wt', encoding=None): - """Serialize obj to path in ARPA format.""" - with open(path, mode=mode, encoding=encoding) as f: - dump(obj, f) +def dumpf(obj, path, encoding=None): + """Serialize obj to path in ARPA format (.arpa, .gz).""" + path = str(path) + if path.endswith('.gz'): + with gzip.open(path, mode='wt', encoding=encoding) as f: + return dump(obj, f) + else: + with open(path, mode='wt', encoding=encoding) as f: + dump(obj, f) def dumps(obj): @@ -40,10 +47,15 @@ def load(fp, model=None, parser=None): raise ValueError -def loadf(path, mode='rt', encoding=None, model=None, parser=None): - """Deserialize path (a text file) to a Python object.""" - with open(path, mode=mode, encoding=encoding) as f: - return load(f, model=model, parser=parser) +def loadf(path, encoding=None, model=None, parser=None): + """Deserialize path (.arpa, .gz) to a Python object.""" + path = str(path) + if path.endswith('.gz'): + with gzip.open(path, mode='rt', encoding=encoding) as f: + return load(f, model=model, parser=parser) + else: + with open(path, mode='rt', encoding=encoding) as f: + return load(f, model=model, parser=parser) def loads(s, model=None, parser=None): diff --git a/arpa/exceptions.py b/arpa/exceptions.py index 2f5aaec..5136ea8 100644 --- a/arpa/exceptions.py +++ b/arpa/exceptions.py @@ -7,6 +7,12 @@ class ARPAException(Exception): pass +class FatalException(ARPAException): + """This should not have happened.""" + + pass + + class FrozenException(ARPAException): """Language model is frozen.""" diff --git a/arpa/models/simple.py b/arpa/models/simple.py index d07b051..fd90932 100644 --- a/arpa/models/simple.py +++ b/arpa/models/simple.py @@ -24,10 +24,9 @@ def add_count(self, order, count): def add_entry(self, ngram, p, bo=None, order=None): if self._vocabulary is not None: raise FrozenException - key = tuple(ngram) - self._ps[key] = p + self._ps[ngram] = p if bo is not None: - self._bos[key] = bo + self._bos[ngram] = bo def counts(self): return sorted(self._counts.items()) diff --git a/arpa/parsers/quick.py b/arpa/parsers/quick.py index 2c307fd..e379e25 100644 --- a/arpa/parsers/quick.py +++ b/arpa/parsers/quick.py @@ -64,7 +64,7 @@ def _header(self, line): match = self.re_header.match(line) if match: self._state = self.State.ENTRY - self._tmp_order = match.group(1) + self._tmp_order = int(match.group(1)) elif line == '\\end\\': self._result.append(self._tmp_model) self._state = self.State.DATA @@ -80,8 +80,8 @@ def _entry(self, line): if match: p = self._float_or_int(match.group(1)) ngram = tuple(match.group(4).split(' ')) - bo = match.group(7) - bo = self._float_or_int(bo) if bo else None + bo_match = match.group(7) + bo = self._float_or_int(bo_match) if bo_match else None self._tmp_model.add_entry(ngram, p, bo, self._tmp_order) elif not line: self._state = self.State.HEADER # last entry diff --git a/docs/api.rst b/docs/api.rst deleted file mode 100644 index 5150687..0000000 --- a/docs/api.rst +++ /dev/null @@ -1,6 +0,0 @@ -API -=== - -.. automodule:: arpa - :members: - :undoc-members: diff --git a/docs/arpa.models.rst b/docs/arpa.models.rst new file mode 100644 index 0000000..cbbab73 --- /dev/null +++ b/docs/arpa.models.rst @@ -0,0 +1,30 @@ +arpa.models package +=================== + +Submodules +---------- + +arpa.models.base module +----------------------- + +.. automodule:: arpa.models.base + :members: + :undoc-members: + :show-inheritance: + +arpa.models.simple module +------------------------- + +.. automodule:: arpa.models.simple + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: arpa.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/arpa.parsers.rst b/docs/arpa.parsers.rst new file mode 100644 index 0000000..0205e58 --- /dev/null +++ b/docs/arpa.parsers.rst @@ -0,0 +1,30 @@ +arpa.parsers package +==================== + +Submodules +---------- + +arpa.parsers.base module +------------------------ + +.. automodule:: arpa.parsers.base + :members: + :undoc-members: + :show-inheritance: + +arpa.parsers.quick module +------------------------- + +.. automodule:: arpa.parsers.quick + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: arpa.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/arpa.rst b/docs/arpa.rst new file mode 100644 index 0000000..73c363f --- /dev/null +++ b/docs/arpa.rst @@ -0,0 +1,38 @@ +arpa package +============ + +Subpackages +----------- + +.. toctree:: + + arpa.models + arpa.parsers + +Submodules +---------- + +arpa.api module +--------------- + +.. automodule:: arpa.api + :members: + :undoc-members: + :show-inheritance: + +arpa.exceptions module +---------------------- + +.. automodule:: arpa.exceptions + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: arpa + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index 8801e5e..f47ce05 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,7 @@ # The short X.Y version version = '0.1' # The full version, including alpha/beta/rc tags -release = '0.1.0b3' +release = '0.1.0b4' # -- General configuration --------------------------------------------------- @@ -182,3 +182,9 @@ # -- Extension configuration ------------------------------------------------- + +nitpick_ignore = [ + ('py:class', 'Exception'), + ('py:class', 'enum.Enum'), + ('py:class', 'object'), +] diff --git a/docs/index.rst b/docs/index.rst index 3b21c2f..c1d5aee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,16 +3,15 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Documentation for Python :mod:`arpa` -==================================== +Python :mod:`arpa` package +========================== .. toctree:: :maxdepth: 2 - :caption: Contents: setup examples - api + arpa Indices and tables diff --git a/setup.py b/setup.py index 2bdf9fb..8a6d24d 100644 --- a/setup.py +++ b/setup.py @@ -46,18 +46,23 @@ description='Library for reading ARPA n-gram models.', include_package_data=True, install_requires=[], - keywords='ARPA n-gram ngram language model LM language technology LT ' - 'computational linguistics CL natural language processing NLP unigram bigram trigram', + keywords='ARPA,n-gram,ngram,language model,LM,language technology,LT,' + 'computational linguistics,CL,natural language processing,NLP,unigram,bigram,trigram', license='MIT', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', name='arpa', package_dir={'arpa': 'arpa'}, packages=['arpa'], + project_urls={ + 'bug tracker': 'https://github.com/sfischer13/python-arpa/issues/', + 'documentation': 'https://arpa.readthedocs.io/', + 'source code': 'https://github.com/sfischer13/python-arpa/', + }, python_requires='~=3.4', setup_requires=['pytest-runner'], tests_require=['pytest'], url='https://github.com/sfischer13/python-arpa', - version='0.1.0b3', - zip_safe=False, + version='0.1.0b4', + zip_safe=True, ) diff --git a/tests/.gitignore b/tests/.gitignore index 8739862..cf9c6fb 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,2 @@ data/test.arpa +data/test.arpa.gz diff --git a/tests/data/download.sh b/tests/data/download.sh index 931ed50..92878d2 100755 --- a/tests/data/download.sh +++ b/tests/data/download.sh @@ -1,2 +1,5 @@ #!/usr/bin/env bash + wget https://raw.githubusercontent.com/kpu/kenlm/master/lm/test.arpa + +gzip -k test.arpa diff --git a/tests/test_arpa.py b/tests/test_arpa.py index b8c2f59..d4bcc01 100644 --- a/tests/test_arpa.py +++ b/tests/test_arpa.py @@ -9,6 +9,7 @@ PARSERS = [None, 'quick'] TEST_ARPA = os.path.join(os.path.dirname(__file__), 'data/test.arpa') +TEST_ARPA_GZ = os.path.join(os.path.dirname(__file__), 'data/test.arpa.gz') def test_load_option_model(): @@ -31,14 +32,38 @@ def test_load_dump(): assert fp.read() == gp.read() -def test_loadf_dumpf(): +def test_loadf_dumpf_read(): for p in PARSERS: - lm = arpa.loadf(TEST_ARPA, parser=p)[0] - out = tempfile.NamedTemporaryFile(mode='w+t', delete=False) - arpa.dumpf(lm, out.name) - out.close() - assert filecmp.cmp(TEST_ARPA, out.name, shallow=False) - os.unlink(out.name) + for src in [TEST_ARPA, TEST_ARPA_GZ]: + # read + lm = arpa.loadf(src, parser=p)[0] + # write + out = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False) + arpa.dumpf(lm, out.name) + out.close() + # compare + assert filecmp.cmp(TEST_ARPA, out.name, shallow=False) + os.unlink(out.name) + + +def test_loadf_dumpf_write(): + for p in PARSERS: + for suf in ['.arpa', '.gz']: + # read + lm1 = arpa.loadf(TEST_ARPA, parser=p)[0] + # write + out1 = tempfile.NamedTemporaryFile(mode='w+t', suffix=suf, delete=False) + arpa.dumpf(lm1, out1.name) + out1.close() + # read again + lm2 = arpa.loadf(out1.name, parser=p)[0] + # write again + out2 = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False) + arpa.dumpf(lm2, out2.name) + out2.close() + # compare + assert filecmp.cmp(TEST_ARPA, out2.name, shallow=False) + os.unlink(out2.name) def test_loads_dumps(): diff --git a/tests/test_model_simple.py b/tests/test_model_simple.py index 6c469a0..272c727 100644 --- a/tests/test_model_simple.py +++ b/tests/test_model_simple.py @@ -8,7 +8,7 @@ def test_new_model_contains_not(): def test_new_model_contains(): lm = ARPAModelSimple() - lm.add_entry(['foo'], 1.0) + lm.add_entry(('foo', ), 1.0) assert 'foo' in lm