diff --git a/tensorflow_hub/BUILD b/tensorflow_hub/BUILD index 1c45a255c..313859f20 100644 --- a/tensorflow_hub/BUILD +++ b/tensorflow_hub/BUILD @@ -455,6 +455,23 @@ py_library( name = "expect_protobuf_installed", ) +# We expect apache_beam to already be installed on the system, e.g. via +# `pip install apache_beam` +py_library( + name = "expect_apache_beam_installed", +) + +# We expect annoy to already be installed on the system, e.g. via +# `pip install annoy` +py_library( + name = "expect_annoy_installed", +) + +# An expectation for resources import +py_library( + name = "expect_resources_installed", +) + py_library( name = "module_v2", srcs = ["module_v2.py"], diff --git a/tensorflow_hub/pip_package/BUILD b/tensorflow_hub/pip_package/BUILD index fb5b512be..0c6878b31 100644 --- a/tensorflow_hub/pip_package/BUILD +++ b/tensorflow_hub/pip_package/BUILD @@ -23,5 +23,6 @@ sh_binary( data = [ "//tensorflow_hub", "//tensorflow_hub/tools/make_image_classifier", + "//tensorflow_hub/tools/make_nearest_neighbour_index", ], ) diff --git a/tensorflow_hub/pip_package/setup.py b/tensorflow_hub/pip_package/setup.py index 835d5986c..d84f411e1 100644 --- a/tensorflow_hub/pip_package/setup.py +++ b/tensorflow_hub/pip_package/setup.py @@ -25,8 +25,8 @@ # Can't import the module during setup.py. # Use execfile to find __version__. -with open("tensorflow_hub/version.py") as in_file: - exec(in_file.read()) +with open('tensorflow_hub/version.py') as in_file: + exec(in_file.read()) REQUIRED_PACKAGES = [ 'numpy >= 1.12.0', @@ -61,12 +61,16 @@ install_requires=REQUIRED_PACKAGES, extras_require={ 'make_image_classifier': ['keras_preprocessing[image]'], + 'make_nearest_neighbour_index': ['apache_beam', 'annoy'], }, entry_points={ 'console_scripts': [ ('make_image_classifier = ' 'tensorflow_hub.tools.make_image_classifier.' 'make_image_classifier:run_main [make_image_classifier]'), + ('make_nearest_neighbour_index = tensorflow_hub.tools.' 
+ 'make_nearest_neighbour_index.main:main ' + '[make_nearest_neighbour_index]'), ], }, # PyPI package information. diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/BUILD b/tensorflow_hub/tools/make_nearest_neighbour_index/BUILD new file mode 100644 index 000000000..bf3812734 --- /dev/null +++ b/tensorflow_hub/tools/make_nearest_neighbour_index/BUILD @@ -0,0 +1,103 @@ +# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +licenses(["notice"]) # Apache 2.0 License + +package( + default_visibility = [ + "//:__subpackages__", + "//tensorflow_hub:__subpackages__", + ], +) + +# A library for embedding_generator. +py_library( + name = "embedding_generator", + srcs = ["embedding_generator.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow_hub:expect_apache_beam_installed", + "//tensorflow_hub:expect_sklearn_installed", + "//tensorflow_hub:expect_tensorflow_installed", + "//tensorflow_hub", + ], +) + +# A library for index_builder. +py_library( + name = "index_builder", + srcs = ["index_builder.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow_hub:expect_annoy_installed", + "//tensorflow_hub:expect_tensorflow_installed", + ], +) + +# A library for similarity_finder. 
+py_library( + name = "similarity_finder", + srcs = ["similarity_finder.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow_hub:expect_annoy_installed", + "//tensorflow_hub:expect_tensorflow_installed", + "//tensorflow_hub", + ], +) + +# The make_nearest_neighbour_index script as a py_binary. +py_binary( + name = "make_nearest_neighbour_index", + srcs = ["make_nearest_neighbour_index.py"], + python_version = "PY3", + deps = [ + ":embedding_generator", + ":index_builder", + ":similarity_finder", + "//tensorflow_hub:expect_absl_py_installed", # ":app" + "//tensorflow_hub:expect_tensorflow_installed", + ], +) + +py_test( + name = "embedding_generator_test", + srcs = ["embedding_generator_test.py"], + data = [ + "test_data/data/titles.txt", + ], + python_version = "PY3", + deps = [ + ":embedding_generator", + "//tensorflow_hub:expect_resources_installed", + "//tensorflow_hub:expect_absl_py_installed", # "/flags" + "//tensorflow_hub:expect_tensorflow_installed", + ], +) + +py_test( + name = "index_builder_test", + srcs = ["index_builder_test.py"], + data = [ + "test_data/embeds/emb-00000-of-00001.tfrecords", + "test_data/embeds/random_projection.matrix", + ], + python_version = "PY3", + deps = [ + ":index_builder", + "//tensorflow_hub:expect_resources_installed", + "//tensorflow_hub:expect_absl_py_installed", # "/flags" + "//tensorflow_hub:expect_tensorflow_installed", + ], +) diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/README.md b/tensorflow_hub/tools/make_nearest_neighbour_index/README.md new file mode 100644 index 000000000..ad7f55ec5 --- /dev/null +++ b/tensorflow_hub/tools/make_nearest_neighbour_index/README.md @@ -0,0 +1,143 @@ +# Building an Approximate Nearest Neighbour Embedding Index for Similarity Matching + +This `make_nearest_neighbour_index` tool helps you to generate embeddings from a +TF-Hub module given your text input data and build an approximate nearest +neighbours (ANN) index using the embeddings. 
The index can then be used for +real-time similarity matching and retrieval. + +We use [Apache Beam](https://beam.apache.org/documentation/programming-guide/) +to generate the embeddings from the TF-Hub module. +We also use Spotify's [ANNOY](https://github.com/spotify/annoy) library to +build the approximate nearest neighbours index. + +This tool uses **TensorFlow 2.0**. + + +## Tool setup +In order for you to use the tool on your local machine, you need to perform the +following steps: + +``` +$ pip install "tensorflow~=2.0" +$ pip install "tensorflow-hub[make_nearest_neighbour_index]~=0.8" +``` + +After installation, the `make_nearest_neighbour_index` executable is available +on the commandline: + +``` +$ make_nearest_neighbour_index --help +``` + +## Tool usage +The `make_nearest_neighbour_index` tool expects one of the following four commands: + +### 1- generate +The **generate** command generates embeddings for text input data using a TF-Hub +module. The following are the parameters expected by the command: + +Parameter | Type | Description | +---------------------- |---------| -------------| + data_file_pattern | string | Path to data file(s) to generate embeddings for. The data is expected to be a single-column TSV.| + module_url | string | TF-Hub module to use. For more options, search https://tfhub.dev. This can also be a path to a [saved_model](https://www.tensorflow.org/guide/saved_model) directory| + embed_output_dir | string | The directory to store the generated embedding files to.| + projected_dim | int | **(Optional)** The desired target dimension to project the embedding to. If specified, [random projection](https://en.wikipedia.org/wiki/Random_projection) will be used. | + +The following is an example usage of the command. The command generates text +embeddings for a set of titles in titles-\*.txt input files using the tf2 +[nnlm-en-128](https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1) +TF-Hub-module.
In addition, it performs random projection of the generated +embeddings to reduce the dimensionality from 128 to 64 (`projected_dim`). + +``` +make_nearest_neighbour_index generate \ + --data_file_pattern=./data/titles-*.txt \ + --module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \ + --embed_output_dir=./output/embed/ \ + --projected_dim=64 +``` + +This command produces (one or several) **.tfrecords** embedding files to the +**embed_output_dir** location. In addition, if random projection was performed, +a **random_projection.matrix** file is also produced in the **embed_output_dir** +location, which is a pickled numpy array of the projection weights. +This is needed for projecting the input query and searching the embedding index. + +### 2- build +The **build** command builds an ANN index for input embeddings. +The following are the parameters expected by the command: + +Parameter | Type | Description | +---------------------- |---------| -------------| + embed_output_dir | string | The directory of the .tfrecords file(s) with the embeddings to build the ANN index for.| + num_trees | int | **(Optional)** The number of trees to build the ANN index. For more details, refer to https://github.com/spotify/annoy. **Default is 100.** | + index_output_dir | string | The directory to store the created index and mapping files. | + +The following is an example usage of the command. The command builds an ANN +index with 10 trees for embeddings in .tfrecords files with 64 dimensions. + +``` +make_nearest_neighbour_index build \ + --embed_output_dir=./embed/ \ + --index_output_dir=./output/index/ \ + --num_trees=10 +``` + +This command produces the following files: + +1. **ann.index**: The serialized ANN index for the embeddings. + +2. **ann.index.mapping**: A pickled dictionary to map the internal index +identifier of an item to the original item. + +3.
**random_projection.matrix**: If a random projection matrix was created in +the embedding generation step, it will be copied to the index output directory. + +### 3- e2e +The **e2e** command performs both embedding generation and index building steps. +The following is an example usage of the command. + +``` +make_nearest_neighbour_index e2e \ + --data_file_pattern=./test_data/large.txt \ + --module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \ + --embed_output_dir=./output/embed/ \ + --index_output_dir=./output/index/ \ + --projected_dim=64 \ + --num_trees=100 +``` + +### 4- query +The **query** command allows you to use an ANN index to find similar items to +a given one. The following are the parameters expected by the command: + +Parameter | Type | Description | +---------------------- |---------| -------------| + module_url | string | TF-Hub module to use to generate embedding for the input query item. This must be the same module used to generate embeddings in the ANN index. | + index_output_dir | string | A directory containing the **ann.index** and **ann.index.mapping** files. | + num_matches | int | The number of similar items to retrieve from the index. **Default is 10**| + +``` +make_nearest_neighbour_index query \ + --module_url=https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1 \ + --index_output_dir=./output/index \ + --num_matches=10 +``` + +This command will load the provided ANN index, the random projection matrix +(if provided), and the TF-Hub module, then perform the following: + +1. Accept an input query item from the command line. + +2. Generate an embedding for the input item using the TF-Hub module. + +3. (Optional) if a random projection matrix is provided, the embedding is + projected to the reduced dimensionality using the matrix weights. + +4. The ANN index is queried using the input item embeddings to retrieve the + identifiers of the similar items. + +5.
The mapping is used to translate the ANN item identifier to the original
+   item.
+
+6. The similar items are displayed.
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator.py b/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator.py
new file mode 100644
index 000000000..028dab4f0
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generates embedding using a TF-Hub module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import pickle
+
+import apache_beam as beam
+from apache_beam.transforms import util
+from sklearn.random_projection import gaussian_random_matrix
+import tensorflow as tf
+import tensorflow_hub as hub
+
+_RUNNER = 'DirectRunner'
+_RANDOM_PROJECTION_FILENAME = 'random_projection.matrix'
+_BATCH_SIZE = 1028
+
+# Cache for the loaded TF-Hub module; populated on the first call to
+# generate_embeddings().
+embed_fn = None
+
+
+def generate_embeddings(items, module_url, random_projection_matrix=None):
+  """Generates embeddings using a TF-Hub module.
+
+  The module is loaded on first use and cached in the global `embed_fn`.
+
+  Args:
+    items: The items to generate embedding for.
+    module_url: The TF-Hub module url.
+    random_projection_matrix: A numpy array of the random projection weights.
+      If None, no projection is applied.
+
+  Returns:
+    item, embedding tuple.
+  """
+
+  global embed_fn
+  if embed_fn is None:
+    embed_fn = hub.load(module_url)
+  embeddings = embed_fn(items).numpy()
+  if random_projection_matrix is not None:
+    embeddings = embeddings.dot(random_projection_matrix)
+  return items, embeddings
+
+
+def to_tf_example(entries):
+  """Converts a batch of items and embeddings to serialized tf.Examples.
+
+  Args:
+    entries: An (items, embeddings) tuple, as returned by generate_embeddings.
+
+  Returns:
+    A list with one serialized tf.train.Example per item.
+  """
+
+  examples = []
+
+  item_list, embedding_list = entries
+  for item, embedding in zip(item_list, embedding_list):
+    features = {
+        'item':
+            tf.train.Feature(
+                bytes_list=tf.train.BytesList(value=[item.encode('utf-8')])),
+        'embedding':
+            tf.train.Feature(
+                float_list=tf.train.FloatList(value=embedding.tolist()))
+    }
+
+    example = tf.train.Example(features=tf.train.Features(
+        feature=features)).SerializeToString(deterministic=True)
+
+    examples.append(example)
+
+  return examples
+
+
+def generate_random_projection_weights(original_dim, projected_dim, output_dir):
+  """Generates a Gaussian random projection weights matrix.
+
+  A matrix is only created, and pickled to `output_dir`, if `projected_dim` is
+  set and smaller than `original_dim`.
+
+  Args:
+    original_dim: The dimension of the embeddings before projection.
+    projected_dim: The target dimension, or None to skip the projection.
+    output_dir: The directory to store the pickled projection matrix in.
+
+  Returns:
+    The random projection matrix as a numpy array, or None if no projection
+    is needed.
+  """
+
+  random_projection_matrix = None
+  if projected_dim and original_dim > projected_dim:
+    random_projection_matrix = gaussian_random_matrix(
+        n_components=projected_dim, n_features=original_dim).T
+    print('A Gaussian random weight matrix was created with shape of {}'.format(
+        random_projection_matrix.shape))
+    print('Storing random projection matrix to disk...')
+    output_file_path = os.path.join(output_dir, _RANDOM_PROJECTION_FILENAME)
+    # Write via tf.io.gfile, as run() does for the directory itself, so that
+    # the matrix can be stored wherever the output directory was created.
+    with tf.io.gfile.GFile(output_file_path, 'wb') as handle:
+      pickle.dump(
+          random_projection_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    print('Random projection matrix saved to disk.')
+
+  return random_projection_matrix
+
+
+def run(args):
+  """Runs the embedding generation Beam pipeline.
+
+  An existing `args.embed_output_dir` is removed and recreated before running.
+
+  Args:
+    args: An object with the attributes embed_output_dir, module_url,
+      projected_dim and data_file_pattern, e.g. the parsed absl FLAGS.
+  """
+
+  if tf.io.gfile.exists(args.embed_output_dir):
+    print('Removing embedding output directory...')
+    tf.io.gfile.rmtree(args.embed_output_dir)
+  print('Creating empty output directory...')
+  tf.io.gfile.makedirs(args.embed_output_dir)
+
+  options = beam.options.pipeline_options.PipelineOptions(**vars(args))
+
+  # Embed an empty string to discover the embedding dimension of the module.
+  original_dim = hub.load(args.module_url)(['']).shape[1]
+
+  random_projection_matrix = generate_random_projection_weights(
+      original_dim, args.projected_dim, args.embed_output_dir)
+
+  print('Starting the Beam pipeline...')
+  with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
+    _ = (
+        pipeline
+        | 'Read sentences from files' >>
+        beam.io.ReadFromText(file_pattern=args.data_file_pattern)
+        | 'Batch elements' >> util.BatchElements(
+            # Floor division: `/` is true division in this module and would make
+            # the minimum batch size a float.
+            min_batch_size=_BATCH_SIZE // 2, max_batch_size=_BATCH_SIZE)
+        | 'Generate embeddings' >> beam.Map(
+            generate_embeddings, args.module_url, random_projection_matrix)
+        | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
+        | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
+            file_path_prefix='{}/emb'.format(args.embed_output_dir),
+            file_name_suffix='.tfrecords')
+    )
+
+  print('Beam pipeline completed.')
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator_test.py b/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator_test.py
new file mode 100644
index 000000000..d271eae7f
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/embedding_generator_test.py
@@ -0,0 +1,119 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests tensorflow_hub.tools.make_nearest_neighbour_index.embedding_generator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+from absl import flags
+import tensorflow as tf
+from tensorflow_hub.tools.make_nearest_neighbour_index import embedding_generator
+# resources dependency
+
+MNNI_FOLDER = ("org_tensorflow_hub/tools/"
+               "make_nearest_neighbour_index/")
+
+flags.DEFINE_string("data_file_pattern", None, "")
+flags.DEFINE_string("module_url", None, "")
+flags.DEFINE_integer("projected_dim", None, "")
+flags.DEFINE_string("embed_output_dir", None, "")
+
+FLAGS = flags.FLAGS
+
+
+def _get_resource(dirname, filename):
+  return os.path.join(os.path.dirname(__file__), filename)
+
+
+class EmbeddingGeneratorTest(tf.test.TestCase):
+
+  def setUp(self):  # pylint: disable=g-missing-super-call
+    # Create run parameters. The input data is looked up relative to this test
+    # file; _get_resource() currently ignores its first argument (MNNI_FOLDER),
+    # so the folder prefix has no effect on the resolved path.
+    FLAGS.data_file_pattern = _get_resource(MNNI_FOLDER,
+                                            "test_data/data/titles.txt")
+
+    FLAGS.module_url = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
+    FLAGS.embed_output_dir = os.path.join(self.get_temp_dir(), "embeds")
+
+  def test_run(self):
+    FLAGS.projected_dim = None
+
+    # Make sure we don't test for pre-existing files.
+    self.assertFalse(os.path.isfile(FLAGS.embed_output_dir))
+
+    # Run embedding_generator
+    embedding_generator.run(FLAGS)
+
+    # Make sure that the embedding directory is created.
+    self.assertTrue(os.path.exists(FLAGS.embed_output_dir))
+    # Make sure that the embedding file is created.
+    expected_embedding_file = os.path.join(FLAGS.embed_output_dir,
+                                           "emb-00000-of-00001.tfrecords")
+    self.assertTrue(os.path.isfile(expected_embedding_file))
+
+  def test_run_with_projection(self):
+    FLAGS.projected_dim = 64
+
+    # Make sure we don't test for pre-existing files.
+    self.assertFalse(os.path.isfile(FLAGS.embed_output_dir))
+
+    # Run embedding_generator
+    embedding_generator.run(FLAGS)
+
+    # Make sure that the embedding directory is created.
+    self.assertTrue(os.path.exists(FLAGS.embed_output_dir))
+    # Make sure that the embedding file is created.
+    expected_embedding_file = os.path.join(FLAGS.embed_output_dir,
+                                           "emb-00000-of-00001.tfrecords")
+    self.assertTrue(os.path.isfile(expected_embedding_file))
+    # Make sure that the random projection file is created.
+    expected_projection_matrix_file = os.path.join(FLAGS.embed_output_dir,
+                                                   "random_projection.matrix")
+    self.assertTrue(os.path.isfile(expected_projection_matrix_file))
+
+
+def _ensure_tf2():
+  """Ensure running with TensorFlow 2 behavior.
+
+  This function is safe to call even before flags have been parsed.
+
+  Raises:
+    ImportError: If tensorflow is too old for proper TF2 behavior.
+  """
+  print("Running with tensorflow %s (git version %s)" %
+        (tf.__version__, tf.__git_version__))
+  if tf.__version__.startswith("1."):
+    if tf.__git_version__ == "unknown":  # For internal testing use.
+      try:
+        tf.compat.v1.enable_v2_behavior()
+        return
+      except AttributeError:
+        pass  # Fail below for missing enabler function.
+    raise ImportError("Sorry, this program needs TensorFlow 2.")
+
+
+if __name__ == "__main__":
+  try:
+    _ensure_tf2()
+  except ImportError as e:
+    print("Skipping tests:", str(e))
+    sys.exit(0)
+  tf.test.main()
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder.py b/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder.py
new file mode 100644
index 000000000..75037644f
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder.py
@@ -0,0 +1,149 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Builds approximate nearest neighbor index for embeddings."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import pickle
+import shutil
+
+import annoy
+import tensorflow as tf
+
+_INDEX_FILENAME = 'ann.index'
+_MAPPING_FILENAME = 'ann.index.mapping'
+_RANDOM_PROJECTION_FILENAME = 'random_projection.matrix'
+_METRIC = 'angular'
+
+
+def _parse_example(example):
+  """Parses a serialized tf.Example holding an item and its embedding.
+
+  Args:
+    example: A serialized tf.Example proto.
+
+  Returns:
+    A dict with the 'item' string tensor and the 'embedding' sparse tensor.
+  """
+
+  # Create a description of the features in the tfrecords.
+  feature_description = {
+      'item': tf.io.FixedLenFeature([], tf.string),
+      'embedding': tf.io.VarLenFeature(tf.float32)
+  }
+  # Parse the input `tf.Example` proto using the dictionary above.
+  return tf.io.parse_single_example(example, feature_description)
+
+
+def _infer_dimensions(embed_file):
+  """Infers the embedding vector size from the first record of a file.
+
+  Args:
+    embed_file: Path to a .tfrecords file with embeddings.
+
+  Returns:
+    The embedding size, or None if the file contains no records.
+  """
+
+  dimensions = None
+  for record in tf.data.TFRecordDataset(embed_file).map(_parse_example):
+    dimensions = record['embedding'].shape[0]
+    break
+  return dimensions
+
+
+def run(args):
+  """Runs the index building process.
+
+  An existing `args.index_output_dir` is removed and recreated before running.
+
+  Args:
+    args: An object with the attributes embed_output_dir, index_output_dir and
+      num_trees, e.g. the parsed absl FLAGS.
+
+  Raises:
+    ValueError: If no embedding files, or no embeddings in them, are found.
+  """
+
+  embed_output_dir = args.embed_output_dir
+  output_dir = args.index_output_dir
+  num_trees = args.num_trees
+  index_file_path = os.path.join(output_dir, _INDEX_FILENAME)
+  mapping_file_path = os.path.join(output_dir, _MAPPING_FILENAME)
+
+  if tf.io.gfile.exists(output_dir):
+    print('Removing index output directory...')
+    tf.io.gfile.rmtree(output_dir)
+  print('Creating empty output directory...')
+  tf.io.gfile.makedirs(output_dir)
+
+  embed_files = tf.io.gfile.glob(os.path.join(embed_output_dir, '*.tfrecords'))
+  num_files = len(embed_files)
+  print('Found {} embedding file(s).'.format(num_files))
+  if not embed_files:
+    raise ValueError(
+        'No .tfrecords files found in {}.'.format(embed_output_dir))
+
+  dimensions = _infer_dimensions(embed_files[0])
+  if dimensions is None:
+    raise ValueError('No embeddings found in {}.'.format(embed_files[0]))
+  print('Embedding size: {}'.format(dimensions))
+
+  annoy_index = annoy.AnnoyIndex(dimensions, metric=_METRIC)
+
+  # Mapping between the item and its identifier in the index
+  mapping = {}
+
+  item_counter = 0
+  for i, embed_file in enumerate(embed_files):
+    print('Loading embeddings in file {} of {}...'.format(
+        i + 1, num_files))
+    dataset = tf.data.TFRecordDataset(embed_file)
+    for record in dataset.map(_parse_example):
+      item = record['item'].numpy().decode('utf-8')
+      embedding = record['embedding'].values.numpy()
+      mapping[item_counter] = item
+      annoy_index.add_item(item_counter, embedding)
+      item_counter += 1
+      if item_counter % 200000 == 0:
+        print('{} items loaded to the index'.format(item_counter))
+
+  print('A total of {} items added to the index'.format(item_counter))
+
+  print('Building the index with {} trees...'.format(num_trees))
+  annoy_index.build(n_trees=num_trees)
+  print('Index is successfully built.')
+
+  print('Saving index to disk...')
+  annoy_index.save(index_file_path)
+  print('Index is saved to disk. File size: {} GB'.format(
+      round(os.path.getsize(index_file_path) / float(1024**3), 2)))
+  annoy_index.unload()
+
+  print('Saving mapping to disk...')
+  with open(mapping_file_path, 'wb') as handle:
+    pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
+  print('Mapping is saved to disk. File size: {} MB'.format(
+      round(os.path.getsize(mapping_file_path) / float(1024**2), 2)))
+
+  random_projection_file_path = os.path.join(
+      args.embed_output_dir, _RANDOM_PROJECTION_FILENAME)
+  if os.path.exists(random_projection_file_path):
+    shutil.copy(
+        random_projection_file_path, os.path.join(
+            args.index_output_dir, _RANDOM_PROJECTION_FILENAME))
+    print('Random projection matrix file copied to index output directory.')
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder_test.py b/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder_test.py
new file mode 100644
index 000000000..3573300fa
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/index_builder_test.py
@@ -0,0 +1,98 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests tensorflow_hub.tools.make_nearest_neighbour_index.index_builder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+from absl import flags
+import tensorflow as tf
+from tensorflow_hub.tools.make_nearest_neighbour_index import index_builder
+# resources dependency
+
+MNNI_FOLDER = ("org_tensorflow_hub/tools/"
+               "make_nearest_neighbour_index/")
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("embed_output_dir", None, "")
+flags.DEFINE_integer("num_trees", 10, "")
+flags.DEFINE_string("index_output_dir", None, "")
+
+
+def _get_resource(dirname, filename):
+  return os.path.join(os.path.dirname(__file__), filename)
+
+
+class IndexBuilderTest(tf.test.TestCase):
+
+  def setUp(self):  # pylint: disable=g-missing-super-call
+    # Create run parameters
+    FLAGS.embed_output_dir = _get_resource(MNNI_FOLDER, "test_data/embeds/")
+    FLAGS.index_output_dir = os.path.join(self.get_temp_dir(), "index")
+
+  def test_run(self):
+    # Make sure we don't test for pre-existing files.
+    self.assertFalse(os.path.isfile(FLAGS.index_output_dir))
+
+    # Run index_builder
+    index_builder.run(FLAGS)
+
+    # Make sure that the index directory is created.
+    self.assertTrue(os.path.exists(FLAGS.index_output_dir))
+    # Make sure that the index file is created.
+    expected_index = os.path.join(FLAGS.index_output_dir, "ann.index")
+    self.assertTrue(os.path.isfile(expected_index))
+    # Make sure that the mapping file is created.
+    expected_mapping_file = os.path.join(FLAGS.index_output_dir,
+                                         "ann.index.mapping")
+    self.assertTrue(os.path.isfile(expected_mapping_file))
+    # Make sure that the random projection file is created.
+    expected_projection_matrix_file = os.path.join(FLAGS.index_output_dir,
+                                                   "random_projection.matrix")
+    self.assertTrue(os.path.isfile(expected_projection_matrix_file))
+
+
+def _ensure_tf2():
+  """Ensure running with TensorFlow 2 behavior.
+
+  This function is safe to call even before flags have been parsed.
+
+  Raises:
+    ImportError: If tensorflow is too old for proper TF2 behavior.
+  """
+  print("Running with tensorflow %s (git version %s)" %
+        (tf.__version__, tf.__git_version__))
+  if tf.__version__.startswith("1."):
+    if tf.__git_version__ == "unknown":  # For internal testing use.
+      try:
+        tf.compat.v1.enable_v2_behavior()
+        return
+      except AttributeError:
+        pass  # Fail below for missing enabler function.
+    raise ImportError("Sorry, this program needs TensorFlow 2.")
+
+
+if __name__ == "__main__":
+  try:
+    _ensure_tf2()
+  except ImportError as e:
+    print("Skipping tests:", str(e))
+    sys.exit(0)
+  tf.test.main()
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/make_nearest_neighbour_index.py b/tensorflow_hub/tools/make_nearest_neighbour_index/make_nearest_neighbour_index.py
new file mode 100644
index 000000000..33cc4a2ca
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/make_nearest_neighbour_index.py
@@ -0,0 +1,179 @@
+# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Entry point to run the make_nearest_neighbour_index tool."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from tensorflow_hub.tools.make_nearest_neighbour_index import embedding_generator as generator
+from tensorflow_hub.tools.make_nearest_neighbour_index import index_builder as builder
+from tensorflow_hub.tools.make_nearest_neighbour_index import similarity_finder as finder
+
+# Embedding generator flags
+flags.DEFINE_string(
+    "data_file_pattern", None,
+    "Path to data file(s) to generate embeddings for.")
+flags.DEFINE_string(
+    "module_url", None, "TF-Hub module to use. "
+    "For more options, search https://tfhub.dev.")
+flags.DEFINE_integer(
+    "projected_dim", None,
+    "The desired target dimension to project the embedding to. "
+    "If specified, random projection will be used.")
+flags.DEFINE_string(
+    "embed_output_dir", None,
+    "The directory to store the generated embedding files to. "
+    "This can be a local or a GCS location.")
+
+# index builder parameters
+flags.DEFINE_integer(
+    "num_trees", 100,
+    "The number of trees to build the ANN index. Default is 100. "
+    "For more details, refer to https://github.com/spotify/annoy.")
+flags.DEFINE_string(
+    "index_output_dir", None,
+    "The directory to store the created index and mapping files. "
+    "This can be a local or GCS location.")
+
+# similarity matching parameters
+flags.DEFINE_integer(
+    "num_matches", 10,
+    "The number of similar matches to retrieve from the ANN index. "
+    "Default is 10.")
+
+FLAGS = flags.FLAGS
+
+
+def validate_args(args):
+  """Validates the command line arguments and returns the chosen operation."""
+
+  if len(args) < 2 or args[1] not in ["generate", "build", "e2e", "query"]:
+    raise ValueError("You need to specify one of four operations: "
+                     "generate | build | e2e | query")
+
+  def _validate_generate_args():
+    """Validates generate operation args."""
+    if not FLAGS.data_file_pattern:
+      raise ValueError(
+          "You must provide --data_file_pattern to generate embeddings for.")
+    if not FLAGS.module_url:
+      raise ValueError(
+          "You must provide --module_url to use for embeddings generation.")
+    if not FLAGS.embed_output_dir:
+      raise ValueError(
+          "You must provide --embed_output_dir to store the embedding files.")
+    if FLAGS.projected_dim and FLAGS.projected_dim < 1:
+      raise ValueError("--projected_dim must be a positive integer value.")
+
+  def _validate_build_args(e2e=False):
+    """Validates build operation args."""
+    if not FLAGS.embed_output_dir and not e2e:
+      raise ValueError(
+          "You must provide --embed_output_dir of the embeddings "
+          "to build the ANN index for.")
+    if not FLAGS.index_output_dir:
+      raise ValueError(
+          "You must provide --index_output_dir to store the index files.")
+    if not FLAGS.num_trees or FLAGS.num_trees < 1:
+      raise ValueError(
+          "You must provide --num_trees as a positive integer value.")
+
+  def _validate_query_args():
+    if not FLAGS.module_url:
+      raise ValueError("You must provide --module_url to use for query.")
+    if not FLAGS.index_output_dir:
+      raise ValueError("You must provide --index_output_dir to use for query.")
+
+  operation = args[1]
+  if operation == "generate":
+    _validate_generate_args()
+  elif operation == "build":
+    _validate_build_args()
+  elif operation == "e2e":
+    _validate_generate_args()
+    _validate_build_args(True)
+  else:
+    _validate_query_args()
+
+  return operation
+
+
+def _ensure_tf2():
+  """Ensure running with TensorFlow 2 behavior.
+
+  This function is safe to call even before flags have been parsed.
+
+  Raises:
+    ImportError: If tensorflow is too old for proper TF2 behavior.
+  """
+  print("Running with tensorflow %s (git version %s)" %
+        (tf.__version__, tf.__git_version__))
+  if tf.__version__.startswith("1."):
+    if tf.__git_version__ == "unknown":  # For internal testing use.
+      try:
+        tf.compat.v1.enable_v2_behavior()
+        return
+      except AttributeError:
+        pass  # Fail below for missing enabler function.
+    raise ImportError("Sorry, this program needs TensorFlow 2.")
+
+
+def main(args):
+  """Validates the arguments and runs the operation selected in args[1]."""
+
+  operation = validate_args(args)
+  print("Selected operation: {}".format(operation))
+
+  if operation == "generate":
+    print("Generating embeddings...")
+    generator.run(FLAGS)
+    print("Embedding generation completed.")
+
+  elif operation == "build":
+    print("Building ANN index...")
+    builder.run(FLAGS)
+    print("Building ANN index completed.")
+
+  elif operation == "e2e":
+    print("Generating embeddings and building ANN index...")
+    generator.run(FLAGS)
+    print("Embedding generation completed.")
+    # No `dimensions` flag is defined in this module, and absl rejects
+    # assignments to unknown flags, so nothing is handed over via FLAGS here.
+
+    builder.run(FLAGS)
+    print("Building ANN index completed.")
+
+  else:
+    print("Querying the ANN index...")
+    similarity_finder = finder.load(FLAGS)
+    num_matches = FLAGS.num_matches
+    while True:
+      print("Enter your query: ", end="")
+      query = str(input())
+      similar_items = similarity_finder.find_similar_items(query, num_matches)
+      print("Results:")
+      print("=========")
+      for item in similar_items:
+        print(item)
+
+
+if __name__ == "__main__":
+  _ensure_tf2()
+  app.run(main)
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/similarity_finder.py b/tensorflow_hub/tools/make_nearest_neighbour_index/similarity_finder.py
new file mode 100644
index 000000000..ed9e27e1e
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/similarity_finder.py
@@ -0,0 +1,100 @@
+# Copyright 2019 The TensorFlow Hub
Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Find similar items for a given query in the ANN index."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import pickle
+
+import annoy
+import tensorflow as tf
+import tensorflow_hub as hub
+
+_INDEX_FILENAME = 'ann.index'
+_MAPPING_FILENAME = 'ann.index.mapping'
+_RANDOM_PROJECTION_FILENAME = 'random_projection.matrix'
+_METRIC = 'angular'
+
+
+class SimilarityFinder(object):
+  """Embeds queries with a TF-Hub module and searches an Annoy index."""
+
+  def __init__(
+      self,
+      module_url,
+      index_file_path,
+      mapping_file_path,
+      dimensions,
+      random_projection_matrix_file,
+  ):
+    """Loads the TF-Hub module, the Annoy index and the mapping file."""
+    # Load the TF-Hub module
+    print('Loading the TF-Hub module...')
+    self.embed_fn = hub.load(module_url)
+    print('TF-hub module is loaded.')
+    # NOTE: the `dimensions` argument is ignored; the size is inferred here.
+    dimensions = self.embed_fn(['']).shape[1]
+
+    self.random_projection_matrix = None
+    if tf.io.gfile.exists(random_projection_matrix_file):
+      with open(random_projection_matrix_file, 'rb') as handle:
+        self.random_projection_matrix = pickle.load(handle)
+      dimensions = self.random_projection_matrix.shape[1]
+
+    self.index = annoy.AnnoyIndex(dimensions, metric=_METRIC)
+    self.index.load(index_file_path, prefault=True)
+    print('Annoy index is loaded.')
+    with open(mapping_file_path, 'rb') as handle:
+      self.mapping = pickle.load(handle)
+    print('Mapping file is loaded.')
+
+  def find_similar_items(self, query, num_matches=5):
+    """Finds similar items to a given query in the ANN index.
+
+    Args:
+      query: The query string.
+      num_matches: The number of similar items to retrieve.
+
+    Returns:
+      List of the mapping entries for the ids returned by the index.
+    """
+
+    query_embedding = self.embed_fn([query])[0].numpy()
+    if self.random_projection_matrix is not None:
+      query_embedding = query_embedding.dot(self.random_projection_matrix)
+    ids = self.index.get_nns_by_vector(
+        query_embedding, num_matches, search_k=-1, include_distances=False)
+    items = [self.mapping[i] for i in ids]
+    return items
+
+
+def load(args):
+  """Builds a SimilarityFinder from the attributes of the parsed flags."""
+  module_url = args.module_url
+  index_file_path = os.path.join(args.index_output_dir, _INDEX_FILENAME)
+  mapping_file_path = os.path.join(args.index_output_dir, _MAPPING_FILENAME)
+  dimensions = args.dimensions
+  random_projection_matrix_file = os.path.join(
+      args.index_output_dir, _RANDOM_PROJECTION_FILENAME)
+
+  return SimilarityFinder(
+      module_url,
+      index_file_path,
+      mapping_file_path,
+      dimensions,
+      random_projection_matrix_file,
+  )
diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/data/titles.txt b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/data/titles.txt
new file mode 100644
index 000000000..7e2e27b46
--- /dev/null
+++ b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/data/titles.txt
@@ -0,0 +1,1000 @@
+pakistan suspends al qaeda offensive
+palestinian authority condemns yassin killing
+palestinian boy killed during rally doctors
+part time dentist to service chinchilla
+peaceful start to duck hunting season
+perth swelters in record breaking march heat
+pitlands coordinator commits to one month
+plucky japanese horse preserves losing streak
+police hunt armed bandits
+police make dna bid over moran killing
+police probe albury car jacking
+police probe fatal car crash
+police probe vandalism attack
+police seek public help in murder case
+pooches soak up luxury treatments at dog spa
+portland port to get new security plan
+portsmouth gain vital win in south coast
derby +ports vulnerable marine authority says +primus expected to play +property tax rise hurting home buyers +public support sought for septic tank plan +push for agnes water pool +questions raised over forest logging +questions remain over gasworks contamination +rail authorities to hold anti terror talks +rally to hear kempsey hospital fears +rocket attacks mark iraq occupation anniversary +rock formations may provide key to mineral +ruddock quiet on brigittes terror cell +safety policies cant cover all emergencies inquiry +sa govt backs dpp decision to drop afl rape case +sa govt backs dpp decision to drop rape case +sa oppn highlights salt water concerns +scientists find cousin to sars +scud fires up with fitness adviser +sears signs for sharks +senate again rejects unfair dismissal bill +senator blasts lack of consultation over child +senior jailed for embezzling 50000 +serena rises in rankings as return nears +shipping group expresses security doubts +sigma posts better than expected profits +soccer boss keen to avoid other codes +sorenstam opens lpga season in style +south korea confirms new bird flu case +speeds leave road safety authorities perplexed +sports group stops election involvement +sri lanka playing for pride +students protest against proposed monash fee jump +study considers alcohol harm reduction +summit to air aviation safety issues +sunscreen developed for fruit nut crops +sword wielding bandits rob boys +taiwan court appoints judges to rule on vote +taiwan poll assassination attempt questioned +tasmania says peta ill informed on logging +tas prisoners take cruelty case to un +three charged over northbridge brawl +three wounded in axe attack near tel aviv +tiger plays down worst finish in five years +tigers may protest over official blunder +tigers want points shared after official blunder +top dog falls as sex scandal fallout widens +two in israels cabinet opposed assassination +two killed in baghdad explosion +two palestinians killed in 
gaza military sources +uk top cop urges eu cooperation in terror fight +understanding key to ending terror muslim cleric +uni students wait longer for accommodation +updated port security plan operating soon +us charges six over iraqi prison assaults +valencia close gap on real +vic govt stands firm on fast rail project +vic kindergartens get it upgrade +violence forces abandonment of rome derby +virgin blue set to enter freight market +voss likely starter lynch in doubt +wa cannabis use effectively decriminalised +wa gas project highlighted on world stage +wal mart auto and oil make up largest us companies +wa oppn to target regional issues at historic +warne chases double world record +wa senator stands by union vote rigging claim +wa to ban production of gm food crops +weather warning issued for se qld coast +we need to improve to beat england says laporte +weve got work to do webber +witness to naval disaster awarded compensation +woolford faces high tackle charge +world faces challenge from terrorists downer says +abbott latham go head to head in parliament +abc exposes port security failures in darwin +act firefighting shake up possible +aerial survey to gauge locust threat +afl backs saints on milne montagna selection +ag apologises to bikies over bbq cat quip +aged care nurses campaign for better funding +aged care workers highlight pay discrepancy +alarm bells yet to ring over dingo future +albanians orchestrated kosovo violence nato +appleby on the move after florida near miss +arthurs to play in burnie tennis challenge +asx closes in the red +aussie wool in high demand in russia +aus swim coach happy with open athens pool +australian netballers remain undefeated on tour +bad boy rooney faces two match ban +bail out of clubs linked to rome derby riot +bhp announces 2b nickel project +big seas spare sunshine coast beaches +birds play havoc with broken hill power +birthing centre decision sparks obstetrician jobs +blair to visit libya on thursday report 
+body washes up on beach +bomb explodes in thailands south +brother speaks out over mortimer resignation +brumby loose with the truth on docklands studio +call for aged care worker pay boost +call for council to be split into nine wards +canberra fires inquest hears emergency removed +cattalini pondering move to italy +chelsea hoses down manager speculation +china ends rights talks with us +climate change greater threat than terrorism +coal train dispute to go before irc +community feels impact of missing teen +competent teachers deserve bonuses nsw opposition +conference to debate distance education issues +council approves youth centre move +council candidates urged to stay real +councillor rejects seniors centre decision +council ponders rezoning plan +counter terrorism exercise continues +court refuses dna bid over moran killing +croutons land top brain surgeon in the soup +cyclone to bring big surf to nsw north coast +cyclone whips up big gold coast waves +darwin mayor calls for help to halt tourism drop +date set for pacific island forum +davey leading rookie charge for demons +david jones announces huge profit boost +detention centre management in the spotlight +diagnosis d day for injured bronco tate +diplomat defends reference for alleged child sex +djs ceo opposes centrepoint plans +dogs must get balance right gallop +doubt cast over health centre auxiliary +doubts over council executive position +downer urges against iraq deadline +downer welcomes malaysian election result +doyle talks bendigo issues +drivers bare undies for flying doctors +election to appoint new hamas leader +elvis link has scottish village all shook up +england close in on caribbean victory +england recall grayson for paris showdown +englands jones plays down sarwan spat +entries sought for naidoc art awards +epa guidelines fuel petrol station concerns +eu ministers pledge anti terror cooperation +eu proposes usd 613 million microsoft fine report +farmers group at odds with child work 
legislation +fed govt urged to part fund burdekin dam work +fina anger over athens pool roof fiasco +fire management planing underway +fishers claim win in reef research +flowers carnival boosts visitor numbers +former vic premier rupert hamer dies +four more arrested over madrid blasts +free trade negotiator in central qld +fruit fly fighters nest grows +fund urges more cancer treatment funding +gallop promises probe into vote fraud claims +gathering to focus on grain rail lines +gm fight still on in wa +gold medallist elliott named to athletics review +gov rejects prison cruelty claims +govt accused of trying to buy off nurses +govt condemns labor as unfair dismissal bill +govt denies knowledge of rwandan suspects +govt to guarantee second seat for nt +govt to increase refugee numbers +green waste powers new qld energy plant +group say developer not interested in theatre +hadid smashes through architectures glass ceiling +heatwave turns green spaces brown +hemp growers send cannabis to educate politicians +hill clarifies comments on iraqi weapons +hill says keelty inquiry a waste of time +howard not fazed by latest opinion poll +hundreds homeless as floods hit png +hydro eyes indian water projects +imf clears usd 31 billion for argentina +indigenous people win access rights in north qld +israeli forces block roads +israeli forces seal gaza strip +israel says arafat not next +jacksons first accuser may testify against him +jail term prescribed for fake doctor +janet jackson receives soul train award +ji still poses credible threat says terror expert +kay fears us is losing credibility +keelty didnt tender resignation howard +kennett calls for national health audit +kennett ends political comeback rumours +labor democrats reject keelty inquiry +labor pushes second sydney airport plan +latham offers iraq troop pull out poll observers +lithgow lion hunter claims new evidence +little cyclone damage on heron is +madonna gets into the groove with new tour +man found not 
guilty in murder retrial +man jailed over business storm +man questioned over melbourne shooting +man to front court over theft driving charges +mass animal deaths spark crackdown +massive crowd mourns hamas leader +mayor rejects mps civic centre claims +mayors rule out merger call +microsoft calls proposed eu fine unjustified +mid west avoids horror fire season +miner offers traffic assurances +mine workers get pay rise +minister defends hospital conditions +minister pledges to address coast transport +mixed reaction to tas boycott threat +monash uni campus damaged during protest +monsanto doubts gm ban will spread +moodys cuts ratings on bowie bonds +more loans going to qld farmers +mourning condemnation follow hamas leaders death +mp dismisses cloud seeding concerns +muppets launch middle east peace mission +nasa finds deadly faults in shuttle tail +nats hope for toxic waste dump rethink +navy chief takes aim at bad behaviour +new laws to curb club hopping +new report tracks sex trafficking +new spy watchdog named +nigeria offers aristide temporary stay +nightclubs in trouble over noise security +no guarantees over water entitlement +nominations flow in for race club meet +no nato troops on greek soil during games minister +no quick fix for nowra traffic woes +no wa govt pledge yet to convention centre plan +nsw govt commits 20m to reduce surgery waiting +nsw oppn wants alp to drop airport study +olympic memorabilia stolen from perkins home +one freed over madrid bombings four more held +oppn seeks mersey hospital assurances +organisers consider surfest woes +parliament no picnic latham tells mps +pathologists agree on sids definition +peponis real dogs boss folkes +peter mortimer considers bulldogs post +phillpot secures mayoral spot +pig farmers welcome eu subsidies decision +plane bellylands at darwin airport +plan focuses on byron shire future +plantations firm to boost green triangle stake +police seek plane mishap details +police seek sheet details in 
murder probe +police stage big traffic blitz +poll spells double trouble for howard +ponting shrugs off head knock +possible link between antidepressants suicidal +power backtrack on grand final venue decision +power company urges restraint as mercury rises +public smoking ban debate continues +qld govt urged to settle winegate affair +reds unlikely to risk flatley in canberra +religious leaders speak out over yassin killing +russian nuclear ship could explode +rwanda claims genocide suspect in australia +sa govt urged to address river levy concerns +salinity drops in was denmark river +shattered rogers out for five months +shire signs indigenous agreement +sir joh still awaiting decision on compo lawyers +solider justified in shooting cameraman us army +spain recovers 5000 looted artefacts +spanish pull out a grave mistake +squabble delays taiwan recount +stage set for taiwan recount +stem cells may not be able to mend broken heart +storm seek details over fresh sex claim +suburban vineyard turns 95 +support for walhollow to join liverpool plains +survey finds most skeptical about pokies plan +sydney hospital closes wards for easter +taiwanese police chase leads in shooting +tas forestry practices come under scrutiny of +tas minister outraged at uk boycott bid +telstra struggle expected in senate +thirteen soldiers hurt in basra blasts +tillakaratne determined to win third test +time running out for waste treatment submissions +top seeds could produce all williams florida final +travel warning for israel issued +tunnel discovered at scene of pakistan fighting +turkish family of five found dead in germany +twu denies involvement in wa vote rigging +twu membership figures questioned +union wants enterprise talks delayed +uni talks up medical school plan +un urges pacific nations to follow fiji aids +upgrade planned for townsville port security +us adds ansar al islam to terrorism list +us crackdown hits online identity scammer +us denies involvement in hamas death 
+us destroyer to up missile defence of japan +us moves to reassure india over pakistan alliance +us to criticise chinas human rights record +us warns japan to prepare for bio attack +uv skin cancer risk higher than thought study +viduka strike helps leeds off the bottom +vieri the younger set for socceroos career +wa could benefit from nt cane toad survey +warne record bid cant affect team ponting +weather bureau warns of williams river flooding +wetlands insects weather drought +white house hits back at bush terror claims +who warns of flu outbreak +woman dies in house fire +woolford out for cowboys clash hornby nutley clear +wool industry hopes to reforge russian links +workcover probes show ride accident +yambulla logging protest continues +yassins death spooks financial markets +500m boost for north qld nickel refinery +6m contract awarded for drainage work +aboriginal communities apply for rural transaction +aboriginal man breaches bail for traditional +accidental explosion kills afghan soldiers wounds +aceh medic killed homes burnt report +adelaide darwin rail link unlikely terrorist target +adelaide speed limit cuts here to stay +aerial surveys to monitor asbestos threat +airport chief quiet on security scare +airspace system gaining support director +all militant leaders marked for death israel +all ords slide continues +anchors away for crane barge +asbestos compensation claims yet to peak +asbestos research fund awards first grant +aussie princess given danish passport +aust presses philippines for stronger defence ties +australian economy dangerously overheated +australia opposes un vote condemning yassins +bahrain test run no big advantage say williams +beattie continues mid east mission despite +bega mp raises policing worries +bendigo tipped to continue good growth +big shark found on northern nsw beach +bird flu found closer to australian shores +storm sets back korean cattle feed trade +brabham tips webber for greatness +bremer moves to allay fears 
over iraq interim +briton breaks balloon altitude record +byrne back in council top job +call for desalination research centre +call for more work incentives for nurses +call for royal commission in redfern death +call for transport investment decision +centrelink plans high tech welfare cheat hunt +chemical scare shuts airport freight terminal +china withdraws from boomers series +claims pay boost would cut patients waits +clark pledges to meet the people +coffin sparks melbourne airport security alert +compo sought for coal mine illnesses +concerns raised over alcohol fine impacts +conditions for telstra sale unmet anderson +contaminated wheat shipment on sold +controversial project faces more hurdles +costello blasts promise breaking alp over toll +council backs lee wharf probe +council tackles insurance woes +court rejects bid to invalidate taiwan poll +crocs vanderjagt up for rookie award +crows port storm deny knowledge of new rape +cue shire wins cultural heritage award +cyclists pedal towards ms research +cyclone fay whips up broome winds +defence needs businesslike approach +demetriou to investigate new sex assault claims +democrats back act stamp duty proposal +denmark rivers salt level falling +dogs refuse to name cocaine test player +dover hosts major film shoot +dry months ahead for western victoria +early start to cotton picking +emergency dept opens after long wait +england complete seven wicket win +ex miner joins legal action over poor health +express yourself candidate urges sign vandals +family group pushes to ban french film +farmers oppose push for gm crops ban +farmers urged to boost on farm milk storage +farmers want more rail line funds +federer agassi out to join serena in the spotlight +financial boost sought for home schooling +former uk coal miners join worlds biggest class +free travel for cityrail passengers +french navy finds fijians adrift at sea +fresh hitch delays taiwan ballot recount +fruit growers unhappy acting as immigration 
police +gang war silence frustrates police +gatto appears in court over gangland murder +genome centre to develop drought resistant crops +govt buys bypass homes despite route confusion +govt mps fear super changes will retire them +govt out to crack down on sham bankruptcy +govt pledges 13m to pacific refugee programs +govt reissues israel indonesia travel warnings +govt to ban unsecure pacific flights +grace sends calling card to coast beaches +green group seeks council candidates views +greens want sydney airport rethink +gunmen kill nine iraqi police chief +gunns director calls for stronger export focus +health meeting to focus on restructure worries +hillary clinton backs kerry for white house +hope for local firms to benefit from 2b nickel +immigration crackdown starves farmers of labour +industrial row may lead to blackouts +inquest hears of night fire crew dilemmas +inventor wins top award for missile decoy +iraqi rights abuses widespread survey +israeli tanks roll into palestinian refugee camp +israel shrugs off hamas threats +israel strikes at lebanon +jackson sues over online memorabilia sale +jobs for the best not the boys hidding +just group refloats +kennett pays tribute to former liberal premier +kings favourites as nbl decider looms +kings leading pigs at half time +kings take game one +lara jones fined for port of spain bad behaviour +lederhosen subsidy axed as germany tightens belt +lee maintains attacks on two jobs moore +lehmann pounds another ton +lost fisherman search finds body +madrid bombing death toll revised down +man faces court over gang shooting +man fronts court on attempted murder charge +man in custody after home invasion +many sex harassment victims suffer in silence +mayor upbeat about sugar meeting +meeting to air fears over planned flight cut +memorial for madrid victims begins +methane power to cut darwin energy bill +microsoft slapped with record fine +militants strike baghdads sheraton hotel +military exercise to simulate air 
threats +mixed water supply may be ongoing +monty pythons life of brian set for re release +more accommodation may be key to tourism boost +mozzie funds on the agenda in south west +mp airs prisoner escort concerns +mp avoids censure in balls furore +mp confident of more freeway funds +nab pledges to implement apra reforms +nca promises state circle consultation +new campaign aims to lure tourists to goldfields +new court ready for business +new govt offices set to open +new hamas chief vows no security for israel +new teachers to face tough tests +nightclubs to protest over restricted trading hours +no way to stop sept 11 attacks bush +nrl powerless on bulldogs cocaine test +nursing funding woes spark health concerns +olyroos look to nsl stars +opals take priority as stirling quits lightning +oppn expected to up calls for keelty talk records +opportunity sits on martian shore nasa +pacific backyard open to terrorists security +pakistani elders seek end to border battle +pakistan send india to bat in one day decider +pan liquidator mulls ceo lawsuit +parenting orders would burden indigenous families +parish urged to help cut greenhouse gases +parliament approves timor gas deal +plaza to undergo more development +pm attacks lathams pull out plan +png minister slams banks for withholding loans for +police expect more gangland killings +police gather dna in art theft investigation +police hold redfern in state of siege pilger tells +police seek martin place brawl witnesses +poor funding means croc fest may be cancelled +porto too good for lyon +port searching for answer to lloyd +ports security boost may cost farmers mp +powell defends bush over sept 11 +power industry seeks govt backing for greenhouse +praise for shires green efforts +proteas fear history making loss to new zealand +protesters and timber company at loggerheads +protesters blockade sawmill over woodchip issue +public asked to highlight nt icons +public to get say on kalgoorlie plan +public to vote on 
fluoridation plan +quinn pledges rates discounts +quotable victorian scientists honoured +rail guards lack of counter terror training union +ralf prepared to take pay cut manager +rangers win but still trail celtic by 16 points +reef rezoning needs back up funds +refugees await residency answers +regional skills program launched +region pushes forward with tourism marketing +regulator finds profit was king at nab +renison tin mine sold reopening planned +report urges library closures +riverland success stories on show +river report offers hope in salinity battle +rocky seeks earlier jetstar flights +rogers hopes to return ahead of schedule +rural groups lobby to revitalise health services +scott relinquishes k and s chairman role +scully happy with bridge plan +seafood industry happy with native title decision +search continues for missing tourist +senate demands govt produce keelty documents +shattered rogers hopes to return ahead of schedule +shire backs call for more infrastructure funds +shires lose drought status +singapore launches high tech campaign to eliminate +sir rupert hamer renaissance premier +site finally chosen for landfill +six tune in to fm licence bidding +s korean president refuses to appear at +state funeral to honour sir rupert +statistics highlight needy communities +st marys doctor quits +studies recommend wider research cooperation +stunning comeback gives milan victory +surgery rules di venuto out of derbyshires season +sweetman loses riverton preselection +taiwan mps agree to recount deal +tas potato farmers attend international forum +tate gets all clear for injury return +taxing time for wine producers +technical fault leaves iraq pipeline storm +thailand hands drug traffickers back to australia +thought control experiments may benefit parkinsons +tile plant to bolster hunter jobs +two options for broome wastewater plant +un and kosovo policemen killed in attack +us brushes off roofless athens pool concerns +us embassy suspends uae 
operations +us hails top level meeting with khaddafi +us warns nationals abroad +uwa still undecided on hecs increase +victorian dna laws adequate bracks +vic treasurer promises no electricity price hikes +volunteers sought for desert race +voting changes to spark longer election count +wall street closes down for fourth straight day +wal mart challenges itunes music downloads +waste water report yet to be completed +water to be issue in council poll +west indies team manager skerritt resigns +williams katich in colombo selection mix +yacht guided to safety after struggling in big seas +aboriginal elder banished after stabbing +accused saint to play cats +act child death team to improve support services +act govt defends mental health services +acu not to increase hecs fees +adelaide court jails homicidal driver +afghan president wants to delay elections +a funny thing happened on the way to melbourne +agreement struck over tuna farm trial +albany mp blasts political stunt +arrest warrants cancelled for accused priests +arsenal net away goal in 1 1 draw with chelsea +asx gains ground on banks back +atapattu hits back for sri lanka +atapattu leads sri lankan fight back +australia committed to the pacific +australia opposes israel censure over yassin +ballarat man gets afl opener honour +ballina council rejects fluoride +beachleys battle of the sexes makes waves +big mouths linked to small brains research +big toowoomba show looming +bipartisan support for regional parliament sitting +blair flies to libya for landmark visit +blair meets with libyan leader +bomb kills us soldier in iraq +britain freezes assets of hamas leaders +brits nouveau cuisine the deep fried chocolate +bulldog mason denies testing positive to cocaine +businesses warned of accident impacts +call for highway to be included in transport plan +call for indigenous land use agreements response +call for more political contributions clarity +canberra sydney trains on track for may +cattle producers 
reminded of aid +chance seeks logging contract legal advice +chopper joins search for missing man +church storm considered suspicious +city living to dominate un +cliches to be honest with you drive us mad +coast beaches weather big waves +coffin causes hiccup at melbourne airport +community forum to discuss mersey hospital woes +controversial castro doco to air in canada +cooma gets new youth service +council airs bypass funding fears +council angry over lifeguards dispute +council hopes to keep planning powers +council tourism decision draws anger +court hears evidence in hotel theft trial +court rules in kennedy asic case +court told abas sydney tv licence decision not +crocs name new signing +cyclone fay batters wa coast +cyclone fay changes course +cyclone set to cross wa coast +date set for hickey inquest +delay possible in election results +democrats seek continued reef protection +diamond miner strikes indigenous deal +diesel powered net connection riles outback users +disappointment over lack of indigenous jail jobs +dogs hold the line on cocaine claims +domestic violence group questions court decision +drink driving campaign to highlight danger signs +durham look north for gibbs cover +emergency conference tries to avert public +f1 changes qualifying procedures +family uninjured in chlorine bomb blast +farina unaware of socceroos rift +farmers claim govt betrayed them on gm stance +farmers push for drought assistance overhaul +federal nt ministers discuss croc safaris +few details emerge from alp meeting +five charged over longreach sexual assault +flatley ruled out for reds +flying doctor concerns remain +forest protest sparks arrests +former afl umpire handed video role +forum to focus on drugs and youth +four eared kitten not a monster +frawley dismisses talk of an upset +french experts defuse bomb on train tracks +fruit growers campaign to keep refugee workers +gaffe highlights japans pension problems +gallop urges dogs to come clean on cocaine test 
+gallop urges dogs to come clean on mason cocaine +ganguly celebrates historic series win +ganguly may miss first pakistan test +ganguly to undergo scans on back injury +gas report sparks council concerns +german police raid online neo nazi music sharers +gold coast man jailed for 11m fraud +govt introduces bill over nt electorates +govt keeps heat on alp over iraq pullout +govt mismanaged dairy deregulation report +govt says reef protection scheme just months away +govt to create construction overseer +greece happy with olympic security drill +group withdraws french rail bomb threats +hamas leaders go to ground +hamas warns sharon may be a target +health service to start paying off debts +hope for council promises to be kept +hopes for mine sale to benefit west coast economy +howard plans us free trade trip +hunter records more chlamydia cases +hypersonic plane shoots for mach 7 +icac to hold public hearings into fraud claims +icc calls for evidence in match fixing claims +idol judge denies obscene gesture +indonesia plans compulsory military service +iraqi working for time magazine shot in baghdad +iraq to continue buying australian wheat +isis shire ponders sugar industry value +isp review needed to track net paedophiles report +israel arrests teen wearing bomb in west bank +italian parliament approves controversial media +ivanisevic advances in miami +japan arrests chinese activists on disputed island +kewell viduka vieri named in socceroos squad +labor govt to keep keelty on latham +labor supports bans on at risk flights +lane gets what he expected the sack +latham flags further bank regulation +lehmann the key to cracking 400 +life on mars could have come from earth +little hunter response to mining compo case +lord mayors promise to discount rates an election +man fined over animal cruelty +markovic named as nbls top rookie +mason ponders law suit over cocaine claim +match fixing claims hit spanish soccer +mayoral hopeful wants electoral material changes 
+mayor derails monorail plan +mayor speaks out over funding misuse claims +mcevoy saddles up for godolphin +mexico seeks explanation from uk on cavers +minister clarifies chain clearing law +minister offers police station assurances +monash students wind up protest +more fears aired over nursing post +more foreign nationals detained in sa south east +more funds available to fight locusts +mp backs education policy benefits +netball association resigned to move +new pill may help divers fend off bends +nsw rugby turns to nz expert +nt lobbies for croc safaris +nurses group unhappy with meeting snub +nz executives want common currency with australia +obesity tops health problems for us children +olympic flame brightens up athens +oppn questions hospital ward easter closure +parole appeal rejected for anti abortion activist +part australian honey taken off canadian shelves +payne recovering after sandown fall +phar laps saddle sold for 87500 +police hunt man over attempted sex attack +police probe abalone farm attack +police probe suspected drug overdoses +police probing suspected murder seek letter writer +police to quiz boats crew over diving death +police to target road safety +power plant plan sparks call for probe +precautions taken after mt isa chemical spill +premier to accept commonwealth grant deal +primus named in port squad +public reminded of fire ban +qld govt buys home in lead up to highway bypass +rape accuser faces bryant in courtroom +ratepayers to be asked about recycling costs +ray of hope touches athens preparations +rba says business coping with strong dollar +real win but monaco retain hope +regional health gets funds boost +report casts doubt over sugar rescue call +report highlights telecommunications complaints +rice sculpture highlights uns hunger message +rising seas point to melting glaciers study +sa speaker blames outburst on illness +scud proclaimed comeback king +security concerns leave global markets unstable +senator says no highway 
funds guarantee +ses monitors barwon river level +singer jailed for failing to pay child support +slipway safety audit after compressor fire +smit named springbok captain +soya powered airliners offer greener future report +spains pm resists pressure over iraq pull out +sports complex committee named +stage set for spider man 3 +stanhope says human rights bill justified by uk +strong winds fan bushfires in tasmania +student protesters take over monash uni office +students occupy monash building in protest +surgery puts furyk on the sidelines +swells expected to calm after gold coast beaches +sydney community tv station wins a reprieve +teen charged over caravan storm +terrorism no priority before sept 11 ex bush +tests continue to try and identify skeleton +thailand returns drug traffickers to australia +time running out for postal voting +troops to stay in iraq for now howard +turkish man charged with people smuggling +two charged after woodchip protest +two to debut for swans opener against lions +two to debut in swans opener against lions +underworld drug squad links highly likely +uq protest over fee rise turns ugly +uq to increase hecs fees +uranium scare forces mining plant shutdown +uranium water may make miners ill regulator +us ambassador warns against troop pullout +us calls european microsoft ruling unfortunate +us proposes wmd resolution +us to gradually lift economic sanctions on libya +vic blood crisis solved for short term +vic bushfire threatens homes +victorian gm ban receives mixed response +victoria opts for four year gm moratorium +virgin blue adds extra rockhampton flight +virgin blue says no to earlier flight +wa labor row threatens national fallout +warrant issued as alleged paedophiles miss court +water contamination scare prompts run on supplies +wa union criticises building industry enforcement +we could have done more to stop sept 11 cia boss +welfare workers worried by docs response delays +white house hits back at ex bush anti terror 
+worlds largest wave but no chance to surf it +abalone council opposes call for lower annual take +accused turkish people smuggler to remain in +afl braces for hot conditions +agreement wont compromise tuna farm +alp bribery claims go back to police +al qaeda tape urges pakistan coup +appendicitis forces pellegrino out of glory match +aussies lose hayden +aussies slip in final session +aussies slip in last session +australia seals iraqi wheat deal +australia tops honest companies list +backlog at gladstone port +bahrain wanted to postpone gp report +bank tax cut as states bicker +blair hails meeting with libyan leader +boat accident claims life on gold coast +boys body found near train track +breath test charge overshadows rugby sevens row +brisbane mayoral candidates make final pitches +british soldier hauled from flooded mexico caves +builder embroiled in legal battle with owen found +buoyant india eye first test win in pakistan +burn off gets out of control +call for more broken hill freehold blocks +calls for proactive drought approach +capitals coach quits for overseas post +chen declared winner amid violent protests +chief justice bows out after 16 years +china aims for immortality with lunar probe +chinese community okays high school testing plan +christ movie moves texan to murder confession +circumcision can block hiv infection +civil libertarians concerned by terror law plan +coal miner to list on stock exchange +complacent attitude killed yachtsman coroner +corruption commission not needed bracks +council elections loom +councillor targeted in offensive cartoon +council workers ordered back to work +court accuses police of brutality lying +court finds man guilty of hotel funds theft +craigie langmack to discuss race claim +cross network mobile charges too high +cyclone fay out at sea but expected to turn back +dam level reaching low point +democrats call for tougher construction watchdog +democrats want greater powers for building +dogs ask ref to guard 
against sledging +doubt cast on shaken baby syndrome +downer defends iraq troop numbers +draw hits blues play off hopes +dry conditions continuing in southern nsw +earth picks up a new moon +eu leaders adopt anti terror measures +ex prison officer on child sex charges +extra nurses needed for hospital plan union says +f1s one man show intriguing not boring supremo +fairweather heads australias olympic archery hopes +farmers group backs drought policy reform +fears rural woes will boost suicide rate +fed govt has rethink on burnett ec rules +fed govt under fire over nickel plan stance +fed govt urged to boost energy targets +fight intensifies against lantana +fire crews monitor vic storms +fishing hits shark numbers study +fitness guru simmons cited for slapping fighter +food poisoning parasite can evade drugs study +four killed two wounded in iraq hospital +france detains three over bomb threats +funds boost for indigenous crime fighting +funds to encourage regional doctors to stay put +german family cops an earful +gladstone council agrees to buy land +global trend pushes asx higher +govt bungled dairy deregulation package report +govt commits more money to quamby jail +hackett hopes to lower 1500m wr +handbag recovery costs german woman her licence +health funds boost but no medical retrieval unit +health lobby launches national funding campaign +hollywood braces to be reeled in by spider mans web +howard ignoring parliament with fta trip +howards us trip more than symbolic downer +huegill has confidence in athens security +illegal gun sales trigger police warning +indys nz ace dixon hoping for a change of direction +international chefs endorse burnett producers +iraq remarks cause comment in washington +iraqs national soccer team to tour england +irc rules small business must pay redundancies +irons beachley bow out +island renaming aims to avoid boo boos +israel looks to us to vote down un resolution +ivory coast forces must be held accountable rights +japan 
forced to pay chinese for wwii labour +jones extends wallabies contract +kelme deny doping accusations from former team +labor plans needs based school funding +labor united on iraq pullout latham +langer cleared on disrepute charge +latham troop plan could risk lives pm says +libs unhappy with former leaders council tactics +longer gm moratorium to allow more debate +lord of the rings a boon for nz tourism +man awarded bail after gungahlin protest +man charged over perth aiport hoax +man could serve double time court told +mayoral shake up wouldnt surprise academic +mayor rejects party politics claims +mccoy denied provisional pole at phillip island +meeting to consider local govt issues +men appear in court over 75m ecstasy haul +mill storm under investigation +mine operator reveals third contamination case +minister promises mersey hospital to remain open +minister stands firm on dairy package +ministers to decide on salt interception scheme +monash rushed into fee hike +more than 100 injured in quake in chinas inner +motorcyclist dies in karratha crash +mp advocates stronger voice for constituents +mp angry over childs hospital wait +mp attacks biotech company +mp welcomes surgery waiting list improvement +mystery surrounds missing prosecution witness +nab wants shareholders to clear decks +native title claim made over western cape york +new mine equipment to bolster jobs +new physics chief to champion science careers +no need for some residents to vote +nsw govt urged to help animal shelter +nsw opposition pushes for health inquiry +nsw police plan extra gun training +nt wants bigger road funding slice +outback visitors to enjoy dinosaur tourism +payback brings harmony to community lawyer +persson atlevi shines in madeira island gloom +pies tigers to launch afl 2004 +pm announces gp practice nurses funding boost +police lockup nominated for worst dungeon +port to blood rookies in opener +portugal seeks nato help for euro 2004 +power confident despite injury 
concerns +probe launched into alleged animal smuggling racket +psa warns of more strikes over pay claim +public protest for fast rail plan changes +push for beattie govt to look at daylight savings +qantas pledges to keep early flights from tas +qpr defender charged with raping 15 year old +quake rocks eastern turkey +race day breaks records again +rail delays nasdaq open +ranger contamination scare blamed on crossed lines +razorbacks draw level with kings +ref urged to curb sledging in dogs roosters match +reid reaches miami second round +review finds drought aid too complex +richardson leads charge as tigers claw pies +ronaldo to bolster real against sevilla +roosters on top at half time +ruddy takes out archibald with gulpilil +saints focus on footy +sa public servants prepare for mass strike +scott fires tiger bombs at fifth major +second leak examined at ranger mine +senator to use local hang ups to oppose telstra +side strain casts doubt on kallis +simple life key to saudi mans longevity +simpson injury bad news for knights +sinclair shines for black caps +singapore says satire on censorship not funny +sleeping burglary suspect startles victims +south east gears up for council poll +southern nsw gears up for council poll +states agree to gst funding review +states and territories better off with gst howard +steroid use increasing among us school athletes +stiles to lead reds into brumbies battle +strong cowboys outfit to take on raiders +study discounts abortion breast cancer link +study finds low morale among iraq troops +survey shows boundary change support +swim coach accuser agree to avo +sydney candidates launch election eve sweep +sydney ferries row intensifies +tafe shake up aims to save thousands +tas oppn housing plan gets mixed reaction +teenage fanclub beckons for celtic saviour marshall +tennis now drug free says agassi +terrorism exercise finds room for improvement +three springs to boost corella cull +tigers claw pies +tight security in place 
for bahrain gp +torch relay begins as rogge praises athens +tour operators to say goodbye to gst on reef tax +traditional punishment ban wouldnt work +trapped brits put mexicos nose out of joint diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/emb-00000-of-00001.tfrecords b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/emb-00000-of-00001.tfrecords new file mode 100644 index 000000000..a70855998 Binary files /dev/null and b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/emb-00000-of-00001.tfrecords differ diff --git a/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/random_projection.matrix b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/random_projection.matrix new file mode 100644 index 000000000..b48a407b8 Binary files /dev/null and b/tensorflow_hub/tools/make_nearest_neighbour_index/test_data/embeds/random_projection.matrix differ