make_nearest_neighbour_index tool (TF2.0)

command-line tool to: 1- generate embeddings from data using a TF-Hub module 2- build an ANN index using ANNOY library 3- query the built index using command-line PiperOrigin-RevId: 288292037
jackalhan · Jan 7, 2020 · 945303a · 945303a
1 parent 6aee531
commit 945303a
Show file tree

Hide file tree

Showing 14 changed files with 2,021 additions and 2 deletions.
diff --git a/tensorflow_hub/BUILD b/tensorflow_hub/BUILD
@@ -455,6 +455,23 @@ py_library(
     name = "expect_protobuf_installed",
 )
 
+# We expect apache_beam to already be installed on the system, e.g. via
+# `pip install apache_beam`
+py_library(
+    name = "expect_apache_beam_installed",
+)
+
+# We expect annoy to already be installed on the system, e.g. via
+# `pip install annoy`
+py_library(
+    name = "expect_annoy_installed",
+)
+
+# An expectation for resources import
+py_library(
+    name = "expect_resources_installed",
+)
+
 py_library(
     name = "module_v2",
     srcs = ["module_v2.py"],

diff --git a/tensorflow_hub/pip_package/BUILD b/tensorflow_hub/pip_package/BUILD
@@ -23,5 +23,6 @@ sh_binary(
     data = [
         "//tensorflow_hub",
         "//tensorflow_hub/tools/make_image_classifier",
+        "//tensorflow_hub/tools/make_nearest_neighbour_index",
     ],
 )
diff --git a/tensorflow_hub/pip_package/setup.py b/tensorflow_hub/pip_package/setup.py
@@ -25,8 +25,8 @@
 
 # Can't import the module during setup.py.
 # Use execfile to find __version__.
-with open("tensorflow_hub/version.py") as in_file:
-    exec(in_file.read())
+with open('tensorflow_hub/version.py') as in_file:
+  exec(in_file.read())
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.12.0',
@@ -61,12 +61,16 @@
     install_requires=REQUIRED_PACKAGES,
     extras_require={
         'make_image_classifier': ['keras_preprocessing[image]'],
+        'make_nearest_neighbour_index': ['apache_beam', 'annoy'],
     },
     entry_points={
         'console_scripts': [
             ('make_image_classifier = '
              'tensorflow_hub.tools.make_image_classifier.'
              'make_image_classifier:run_main [make_image_classifier]'),
+            ('make_nearest_neighbour_index = tensorflow_hub.tools.'
+             'make_nearest_neighbour_index.main:main '
+             '[make_nearest_neighbour_index]'),
         ],
     },
     # PyPI package information.

diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/BUILD b/tensorflow_hub/tools/make_nearest_neighbour_index/BUILD
@@ -0,0 +1,103 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+licenses(["notice"])  # Apache 2.0 License
+
+package(
+    default_visibility = [
+        "//:__subpackages__",
+        "//tensorflow_hub:__subpackages__",
+    ],
+)
+
+# A library for embedding_generator.
+py_library(
+    name = "embedding_generator",
+    srcs = ["embedding_generator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow_hub:expect_apache_beam_installed",
+        "//tensorflow_hub:expect_sklearn_installed",
+        "//tensorflow_hub:expect_tensorflow_installed",
+        "//tensorflow_hub",
+    ],
+)
+
+# A library for index_builder.
+py_library(
+    name = "index_builder",
+    srcs = ["index_builder.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow_hub:expect_annoy_installed",
+        "//tensorflow_hub:expect_tensorflow_installed",
+    ],
+)
+
+# A library for similarity_finder.
+py_library(
+    name = "similarity_finder",
+    srcs = ["similarity_finder.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow_hub:expect_annoy_installed",
+        "//tensorflow_hub:expect_tensorflow_installed",
+        "//tensorflow_hub",
+    ],
+)
+
+# The make_nearest_neighbour_index script as a py_binary.
+py_binary(
+    name = "make_nearest_neighbour_index",
+    srcs = ["make_nearest_neighbour_index.py"],
+    python_version = "PY3",
+    deps = [
+        ":embedding_generator",
+        ":index_builder",
+        ":similarity_finder",
+        "//tensorflow_hub:expect_absl_py_installed",  # ":app"
+        "//tensorflow_hub:expect_tensorflow_installed",
+    ],
+)
+
+py_test(
+    name = "embedding_generator_test",
+    srcs = ["embedding_generator_test.py"],
+    data = [
+        "test_data/data/titles.txt",
+    ],
+    python_version = "PY3",
+    deps = [
+        ":embedding_generator",
+        "//tensorflow_hub:expect_resources_installed",
+        "//tensorflow_hub:expect_absl_py_installed",  # "/flags"
+        "//tensorflow_hub:expect_tensorflow_installed",
+    ],
+)
+
+py_test(
+    name = "index_builder_test",
+    srcs = ["index_builder_test.py"],
+    data = [
+        "test_data/embeds/emb-00000-of-00001.tfrecords",
+        "test_data/embeds/random_projection.matrix",
+    ],
+    python_version = "PY3",
+    deps = [
+        ":index_builder",
+        "//tensorflow_hub:expect_resources_installed",
+        "//tensorflow_hub:expect_absl_py_installed",  # "/flags"
+        "//tensorflow_hub:expect_tensorflow_installed",
+    ],
+)
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/README.md b/tensorflow_hub/tools/make_nearest_neighbour_index/README.md
@@ -0,0 +1,143 @@
+# Building an Approximate Nearest Neighbour Embedding Index for Similarity Matching
+
+This `make_nearest_neighbour_index` tool helps you to generate embeddings from a
+TF-Hub module given your text input data and build an approximate nearest
+neighbours (ANN) index using the embeddings. The index can then be used for
+real-time similarity matching and retrieval.
+
+We use [Apache Beam](https://beam.apache.org/documentation/programming-guide/)
+to generate the embeddings from the TF-Hub module.
+We also use Spotify's [ANNOY](https://github.com/spotify/annoy) library to
+build the approximate nearest neighbours index.
+
+This tool uses **TensorFlow 2.0**.
+
+
+## Tool setup
+In order to use the tool on your local machine, you need to perform the
+following steps:
+
+```
+$ pip install "tensorflow~=2.0"
+$ pip install "tensorflow-hub[make_nearest_neighbour_index]~=0.8"
+```
+
+After installation, the `make_nearest_neighbour_index` executable is available
+on the commandline:
+
+```
+$ make_nearest_neighbour_index --help
+```
+
+## Tool usage
+The `make_nearest_neighbour_index` tool expects one of the following four commands:
+
+### 1- generate
+The **generate** command generates embeddings for text input data using a TF-Hub
+module. The following are the parameters expected by the command:
+
+Parameter              | Type    | Description  |
+---------------------- |---------| -------------|
+ data_file_pattern     | string  | Path to data file(s) to generate embeddings for. The data is expected to be a single-column TSV.|
+ module_url            | string  | TF-Hub module to use. For more options, search https://tfhub.dev. This also can be a path to a [saved_model](https://www.tensorflow.org/guide/saved_model) directory|
+ embed_output_dir      | string  | The directory to store the generated embedding files to.|
+ projected_dim         | int     | **(Optional)** The desired target dimension to project the embedding to. If specified, [random projection](https://en.wikipedia.org/wiki/Random_projection) will be used. |
+
+The following is an example usage of the command. The command generates text
+embeddings for a set of titles in titles-\*.txt input files using the tf2
+[nnlm-en-128](https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1)
+TF-Hub-module. In addition, it performs random projection of the generated
+embeddings to reduce the dimensionality from 128 to 64 (`projected_dim`).
+
+```
+make_nearest_neighbour_index generate \
+	--data_file_pattern=./data/titles-*.txt \
+	--module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \
+	--embed_output_dir=./output/embed/ \
+	--projected_dim=64
+```
+
+This command produces (one or several) **.tfrecords** embedding files to the
+**embed_output_dir** location. In addition, if random projection was performed,
+a **random_projection.matrix** file is also produced in the **embed_output_dir**
+location, which is a pickled numpy array of the projection weights.
+It is needed to project the input query when searching the embedding index.
+
+### 2- build
+The **build** command builds an ANN index for input embeddings.
+The following are the parameters expected by the command:
+
+Parameter              | Type    | Description  |
+---------------------- |---------| -------------|
+ embed_output_dir    | string  | The directory of the .tfrecords file(s) with the embeddings to build the ANN index for.|
+ num_trees             | int     | **(Optional)** The number of trees to build the ANN index. For more details, refer to https://github.com/spotify/annoy. **Default is 100.** |
+ index_output_dir      | string  | The directory to store the created index and mapping files. |
+
+The following is an example usage of the command. The command builds an ANN
+index with 10 trees for embeddings in .tfrecord files with 64 dimensions.
+
+```
+make_nearest_neighbour_index build \
+	--embed_output_dir=./embed/ \
+	--index_output_dir=./output/index/ \
+	--num_trees=10
+```
+
+This command produces the following files:
+
+1. **ann.index**: The serialized ANN index for the embeddings.
+
+2. **ann.index.mapping**: A pickled dictionary to map the internal index
+identifier of an item to the original item.
+
+3. **random_projection.matrix**: If a random projection matrix was created in
+the embedding generation step, it will be copied to the index output directory.
+
+### 3- e2e
+The **e2e** command performs both embedding generation and index building steps.
+The following is an example usage of the command.
+
+```
+make_nearest_neighbour_index e2e \
+	--data_file_pattern=./test_data/large.txt \
+	--module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \
+	--embed_output_dir=./output/embed/ \
+	--index_output_dir=./output/index/ \
+	--projected_dim=64 \
+	--num_trees=100
+```
+
+### 4- query
+The **query** command allows you to use an ANN index to find similar items to
+a given one. The following are the parameters expected by the command:
+
+Parameter              | Type    | Description  |
+---------------------- |---------| -------------|
+ module_url            | string  | TF-Hub module to use to generate embedding for the input query item. This must be the same module used to generate embeddings in the ANN index. |
+ index_output_dir      | string  | A directory containing the **ann.index** and **ann.index.mapping** files. |
+ num_matches           | int     | The number of similar items to retrieve from the index. **Default is 5.**|
+
+```
+make_nearest_neighbour_index query \
+	--module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \
+	--index_output_dir=./output/index \
+  --num_matches=10
+```
+
+This command will load the provided ANN index, the random projection matrix
+(if provided), and the TF-Hub module, then perform the following:
+
+1.  Accept an input query item from the command line.
+
+2.  Generate an embedding for the input item using the TF-Hub module.
+
+3.  (Optional) if a random projection matrix is provided, the embedding is
+    projected to the reduced dimensionality using the matrix weights.
+
+4.  The ANN index is queried using the input item embeddings to retrieve the
+    identifiers of the similar items.
+
+5.  The mapping is used to translate the ANN item identifier to the original
+    item.
+
+6.  The similar items are displayed.