Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,22 @@ Installation
Installation by hand::

python setup.py install
pycorpora-download

Installation with pip::

pip install --no-cache-dir pycorpora
pycorpora-download

The package does not include data from the Corpora Project; instead, the data
is downloaded when the package is installed (using either of the methods
above). By default, the "master" branch of the `Corpora Project GitHub
repository <https://github.com/dariusk/corpora>`_ is used as the source for the
data. You can specify an alternative URL to download the data from using the
argument ``--corpora-zip-url`` on the command line with either of the two
methods above::
is downloaded by the ``pycorpora-download`` script (installed when
the pycorpora package is installed). By default, the "master" branch of the
`Corpora Project GitHub repository <https://github.com/dariusk/corpora>`_ is
used as the source for the data. You can specify an alternative URL to
download the data from using the argument ``--corpora-zip-url`` on the command
line when running ``pycorpora-download``::

python setup.py install --corpora-zip-url=https://github.com/dariusk/corpora/archive/master.zip

... or, with ``pip``::

pip install pycorpora --install-option="--corpora-zip-url=https://github.com/dariusk/corpora/archive/master.zip"
pycorpora-download --corpora-zip-url=https://github.com/dariusk/corpora/archive/master.zip

(The intention of ``--corpora-zip-url`` is to let you install Corpora Project
data from a particular branch, commit or fork, so that changes to the bleeding
Expand All @@ -55,9 +53,9 @@ edge of the project don't break your code.)
Update
------

Update Corpora Project data by reinstalling with pip::
Update Corpora Project data by running the ``pycorpora-download`` script again::

pip install --upgrade --force-reinstall pycorpora
pycorpora-download

Usage
-----
Expand Down
41 changes: 41 additions & 0 deletions pycorpora/downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import argparse
from distutils.dir_util import mkpath, copy_tree
import glob
import io
import sys
import zipfile
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen

def main():
    """Download a Corpora Project archive and copy its data directory.

    Reads the optional ``--corpora-zip-url`` command-line argument (the
    default is the GitHub "master" archive), fetches the zip file into
    memory, unpacks it under ``./corpora-download`` and copies the first
    ``*/data`` directory found there to ``pycorpora/data``.

    Raises:
        IndexError: if the unpacked archive contains no ``*/data``
            subdirectory.
    """
    # Imported locally so the module's import block stays untouched.
    from contextlib import closing

    DEFAULT_ZIP = "https://github.com/dariusk/corpora/archive/master.zip"

    parser = argparse.ArgumentParser(description="Install corpora data.")
    parser.add_argument(
        "--corpora-zip-url",
        help='URL pointing to .zip file of corpora data (defaults to current master on GitHub)',
        default=DEFAULT_ZIP
    )
    parsed = parser.parse_args()

    url = parsed.corpora_zip_url
    print("Installing corpora data from " + url)

    # NOTE(review): both "./corpora-download" and "pycorpora/data" are
    # relative to the current working directory, so the result depends on
    # where the script is launched from -- confirm this is intended for the
    # installed console script.
    mkpath("./corpora-download")

    # closing() instead of a bare ``with urlopen(...)``: the Python 2
    # urllib2 fallback returns a response that is not a context manager.
    with closing(urlopen(url)) as response:
        payload = response.read()

    # The archive is held entirely in memory; close the ZipFile handle as
    # soon as extraction is done.
    with zipfile.ZipFile(io.BytesIO(payload), "r") as archive:
        archive.extractall("corpora-download")

    # GitHub archives wrap everything in a single top-level directory whose
    # name depends on the branch/commit, hence the wildcard.
    candidates = glob.glob("./corpora-download/*/data")
    if not candidates:
        raise IndexError(
            "malformed corpora archive: expecting a subdirectory '*/data'")
    copy_tree(candidates[0], "pycorpora/data")


if __name__ == '__main__':
    main()
43 changes: 5 additions & 38 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,48 +5,11 @@
except ImportError:
from distutils.core import setup
from distutils.command.install import install
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen
from distutils.dir_util import mkpath, copy_tree
import glob
import io
import zipfile


class DownloadAndInstall(install):
    """``install`` command that fetches Corpora Project data first.

    Adds a ``--corpora-zip-url`` option to the standard ``install``
    command. When run, it downloads the zip archive, unpacks it under
    ``./corpora-download``, copies the archive's ``*/data`` directory to
    ``pycorpora/data`` and then performs the regular install.
    """

    # Extend (rather than replace) the inherited option table.
    user_options = install.user_options + [
        ('corpora-zip-url=', None,
         'URL pointing to .zip file of corpora data ' +
         '(defaults to current master on GitHub)')
    ]

    def initialize_options(self, *args, **kwargs):
        """Initialise the inherited options and default the URL to None."""
        install.initialize_options(self, *args, **kwargs)
        # None means "not given on the command line"; run() fills in the
        # default URL.
        self.corpora_zip_url = None

    def run(self):
        """Download and unpack the corpora data, then run the install.

        Raises:
            IndexError: if the unpacked archive contains no ``*/data``
                subdirectory.
        """
        # Imported locally so the file's import block stays untouched.
        from contextlib import closing

        if self.corpora_zip_url is None:
            self.corpora_zip_url = \
                "https://github.com/dariusk/corpora/archive/master.zip"
        print("Installing corpora data from " + self.corpora_zip_url)
        mkpath("./corpora-download")

        # closing() instead of a bare ``with urlopen(...)``: the Python 2
        # urllib2 fallback returns a response that is not a context manager.
        with closing(urlopen(self.corpora_zip_url)) as response:
            payload = response.read()

        # Close the in-memory archive handle as soon as extraction is done.
        with zipfile.ZipFile(io.BytesIO(payload), "r") as archive:
            archive.extractall("corpora-download")

        # The archive's top-level directory name varies, hence the wildcard.
        candidates = glob.glob("./corpora-download/*/data")
        if not candidates:
            raise IndexError(
                "malformed corpora archive: expecting a subdirectory '*/data'")
        copy_tree(candidates[0], "pycorpora/data")
        install.run(self)


setup(
name="pycorpora",
version="0.1.2",
Expand All @@ -59,6 +22,11 @@ def run(self):
license="LICENSE.txt",
long_description=open("README.rst").read(),
keywords="nlp corpus text language",
entry_points = {
'console_scripts': [
"pycorpora-download = pycorpora.downloader:main"
]
},
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
Expand All @@ -79,5 +47,4 @@ def run(self):
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Artistic Software",
"Topic :: Scientific/Engineering :: Artificial Intelligence"],
cmdclass={'install': DownloadAndInstall},
)