Numerous doc updates (dmlc#66)

zhiheng-huang · Apr 24, 2018 · f68b325 · f68b325
1 parent 14a724b
commit f68b325
Show file tree

Hide file tree

Showing 45 changed files with 437 additions and 402 deletions.
diff --git a/.gitignore b/.gitignore
@@ -107,6 +107,12 @@ ENV/
 tests/data/
 tests/data/embedding/
 tests/data/my_embed/
+tests/externaldata/
+.pytest_cache
 
 # docs
 docs/html
+
+# release
+scripts/*.zip
+docs/examples/*.zip
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -61,6 +61,7 @@ stage("Deploy") {
       python setup.py install
       export LD_LIBRARY_PATH=/usr/local/cuda/lib64
       make clean
+      make release
       make -C docs html"""
 
       if (env.BRANCH_NAME.startsWith("PR-")) {
@@ -77,4 +78,4 @@ stage("Deploy") {
       }
     }
   }
-}
+}
diff --git a/Makefile b/Makefile
@@ -25,11 +25,19 @@ docs:
 
 clean:
 	rm -rf gluonnlp.egg-info build dist | true
-	rm -rf tests/data
+	rm -rf tests/data | true
+	rm scripts/*.zip | true
+	rm docs/examples/*.zip | true
 	make -C docs clean
 
+dist_scripts:
+	find scripts/* -type d -prune | grep -v 'tests\|__pycache__' | xargs -n 1 -I{} zip -r {}.zip {}
+
+dist_notebooks:
+	find docs/examples/* -type d -prune | grep -v 'tests\|__pycache__' | xargs -n 1 -I{} zip -r {}.zip {}
+
 test:
 	py.test -v --capture=no --durations=0  tests/unittest scripts
 
-release:
+release: dist_scripts dist_notebooks
 	python setup.py sdist
diff --git a/docs/Makefile b/docs/Makefile
@@ -51,6 +51,7 @@ help:
 clean:
 	rm -rf $(BUILDDIR)/*
 	rm -rf gen_modules
+	rm -rf html
 
 html:
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html

diff --git a/docs/api/data.batchify.rst b/docs/api/data.batchify.rst
@@ -0,0 +1,21 @@
+gluonnlp.data.batchify
+======================
+
+Batchify functions can be used to transform a dataset into mini-batches that can be processed
+efficiently.
+
+.. currentmodule:: gluonnlp.data.batchify
+
+Batch Loaders
+-------------
+.. autosummary::
+    :nosignatures:
+
+    Stack
+    Pad
+    Tuple
+
+API Reference
+-------------
+.. automodule:: gluonnlp.data.batchify
+    :members:
diff --git a/docs/api/data.rst b/docs/api/data.rst
@@ -1,8 +1,10 @@
-Gluon NLP Datasets and Data API
-===============================
+gluonnlp.data
+=============
 
 Gluon NLP Toolkit provides tools for building efficient data pipelines for NLP tasks.
 
+.. currentmodule:: gluonnlp.data
+
 Public Datasets
 ---------------
 Popular datasets for NLP tasks are provided in gluonnlp.
@@ -17,22 +19,146 @@ is a popular language modeling dataset from Salesforce.
 It is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.
 The dataset is available under the Creative Commons Attribution-ShareAlike License.
 
-.. autoclass:: gluonnlp.data.WikiText2
-.. autoclass:: gluonnlp.data.WikiText103
+.. autosummary::
+    :nosignatures:
+
+    WikiText2
+    WikiText103
 
 Sentiment Analysis: IMDB
 ~~~~~~~~~~~~~~~~~~~~~~~~
 `IMDB <http://ai.stanford.edu/~amaas/data/sentiment/>`_ is a popular dataset for binary sentiment classification.
 It provides a set of 25,000 highly polar movie reviews for training, 25,000 for testing, and additional unlabeled data.
 
-.. autoclass:: gluonnlp.data.IMDB
+.. autosummary::
+    :nosignatures:
+
+    IMDB
 
 Word Embedding Evaluation Datasets
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 There are a number of commonly used datasets for intrinsic evaluation for word embeddings.
 
 The similarity-based evaluation datasets include:
 
+.. autosummary::
+    :nosignatures:
+
+    WordSim353
+    MEN
+    RadinskyMTurk
+    RareWords
+    SimLex999
+    SimVerb3500
+    SemEval17Task2
+    BakerVerb143
+    YangPowersVerb130
+
+Analogy-based evaluation datasets include:
+
+.. autosummary::
+    :nosignatures:
+
+    GoogleAnalogyTestSet
+    BiggerAnalogyTestSet
+
+CoNLL Datasets
+~~~~~~~~~~~~~~
+The `CoNLL <http://www.conll.org/previous-tasks>`_ datasets are from a series of annual
+competitions held at the top-tier conference of the same name. The conference is organized by SIGNLL.
+
+These datasets include data for the shared tasks, such as part-of-speech (POS) tagging, chunking,
+named entity recognition (NER), semantic role labeling (SRL), etc.
+
+We provide built-in support for CoNLL 2000 -- 2002, 2004, as well as the Universal Dependencies
+dataset which is used in the 2017 and 2018 competitions.
+
+.. autosummary::
+    :nosignatures:
+
+    CoNLL2000
+    CoNLL2001
+    CoNLL2002
+    CoNLL2004
+    UniversalDependencies21
+
+
+Machine Translation Datasets
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We provide several standard datasets for machine translation.
+
+.. autosummary::
+    :nosignatures:
+
+    IWSLT2015
+    WMT2016BPE
+
+Datasets
+--------
+
+Dataset API for processing common text formats. The following classes can be used or subclassed to
+load custom datasets.
+
+.. autosummary::
+    :nosignatures:
+
+    TextLineDataset
+    CorpusDataset
+    LanguageModelDataset
+
+Transforms
+----------
+
+Text data transformation functions. They can be used for processing text sequences in conjunction
+with the `Dataset.transform` method.
+
+.. autosummary::
+    :nosignatures:
+
+    ClipSequence
+    PadSequence
+    NLTKMosesTokenizer
+    SpacyTokenizer
+
+Samplers
+--------
+
+Samplers determine how to iterate through datasets. The samplers and batch samplers below can help
+iterate through sequence data.
+
+.. autosummary::
+    :nosignatures:
+
+    SortedSampler
+    FixedBucketSampler
+    SortedBucketSampler
+
+Utilities
+---------
+
+Miscellaneous utility classes and functions for processing text and sequence data.
+
+.. autosummary::
+    :nosignatures:
+
+    Counter
+    count_tokens
+    concat_sequence
+    slice_sequence
+    train_valid_split
+    register
+    create
+    list_datasets
+
+API Reference
+-------------
+
+.. autoclass:: gluonnlp.data.WikiText2
+
+.. autoclass:: gluonnlp.data.WikiText103
+
+.. autoclass:: gluonnlp.data.IMDB
+
 .. autoclass:: gluonnlp.data.WordSim353
     :members:
 
@@ -60,25 +186,12 @@ The similarity-based evaluation datasets include:
 .. autoclass:: gluonnlp.data.YangPowersVerb130
     :members:
 
-Analogy-based evaluation datasets include:
-
 .. autoclass:: gluonnlp.data.GoogleAnalogyTestSet
     :members:
 
 .. autoclass:: gluonnlp.data.BiggerAnalogyTestSet
     :members:
 
-CoNLL Datasets
-~~~~~~~~~~~~~~
-The `CoNLL <http://www.conll.org/previous-tasks>`_ datasets are from a series of annual
-competitions held at the top tier conference of the same name. The conference is organized by SIGNLL.
-
-These datasets include data for the shared tasks, such as part-of-speech (POS) tagging, chunking,
-named entity recognition (NER), semantic role labeling (SRL), etc.
-
-We provide built in support for CoNLL 2000 -- 2002, 2004, as well as the Universal Dependencies
-dataset which is used in the 2017 and 2018 competitions.
-
 .. autoclass:: gluonnlp.data.CoNLL2000
 
 .. autoclass:: gluonnlp.data.CoNLL2001
@@ -89,41 +202,37 @@ dataset which is used in the 2017 and 2018 competitions.
 
 .. autoclass:: gluonnlp.data.UniversalDependencies21
 
-
-Machine Translation Datasets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We provide several standard datasets for machine translation.
-
 .. autoclass:: gluonnlp.data.IWSLT2015
 
 .. autoclass:: gluonnlp.data.WMT2016BPE
 
-Datasets
---------
+.. autoclass:: gluonnlp.data.TextLineDataset
 
-.. automodule:: gluonnlp.data.dataset
-    :members:
+.. autoclass:: gluonnlp.data.CorpusDataset
 
-Transformers
-------------
+.. autoclass:: gluonnlp.data.LanguageModelDataset
 
-.. automodule:: gluonnlp.data.transforms
-    :members:
+.. autoclass:: gluonnlp.data.ClipSequence
 
-Batch Loaders
--------------
+.. autoclass:: gluonnlp.data.PadSequence
 
-.. automodule:: gluonnlp.data.batchify
-    :members:
+.. autoclass:: gluonnlp.data.NLTKMosesTokenizer
 
-Samplers
---------
+.. autoclass:: gluonnlp.data.SpacyTokenizer
 
-.. automodule:: gluonnlp.data.sampler
-    :members:
+.. autoclass:: gluonnlp.data.SortedSampler
 
-Utilities
----------
+.. autoclass:: gluonnlp.data.FixedBucketSampler
+
+.. autoclass:: gluonnlp.data.SortedBucketSampler
+
+.. autoclass:: gluonnlp.data.Counter
+
+.. autofunction:: gluonnlp.data.count_tokens
+
+.. autofunction:: gluonnlp.data.concat_sequence
+
+.. autofunction:: gluonnlp.data.slice_sequence
+
+.. autofunction:: gluonnlp.data.train_valid_split
 
-.. automodule:: gluonnlp.data.utils
-    :members:
diff --git a/docs/api/embedding.rst b/docs/api/embedding.rst
@@ -0,0 +1,43 @@
+gluonnlp.embedding
+==================
+
+This page describes ``gluonnlp`` APIs for text embedding, such as loading pre-trained
+embedding vectors for text tokens and storing them in the ``mxnet.ndarray.NDArray`` format,
+and utilities for intrinsic evaluation of text embeddings.
+
+.. autosummary::
+    :nosignatures:
+
+    gluonnlp.embedding
+
+.. currentmodule:: gluonnlp.embedding
+.. autosummary::
+    :nosignatures:
+
+    register
+    create
+    list_sources
+    TokenEmbedding
+    GloVe
+    FastText
+
+API Reference
+-------------
+
+.. raw:: html
+
+   <script type="text/javascript" src='../../_static/js/auto_module_index.js'></script>
+
+.. automodule:: gluonnlp.embedding
+    :members: register, create, list_sources
+.. autoclass:: gluonnlp.embedding.TokenEmbedding
+    :members: from_file, serialize, deserialize
+.. autoclass:: gluonnlp.embedding.GloVe
+.. autoclass:: gluonnlp.embedding.FastText
+
+.. automodule:: gluonnlp.embedding.evaluation
+    :members:
+
+.. raw:: html
+
+   <script>auto_index("api-reference");</script>