Commit 12dcf29

Adds max_norm option to embedding_lookup (and upstream functions).
Change: 139325873
1 parent 9003342 commit 12dcf29

6 files changed, 66 insertions(+), 16 deletions(-)
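
The new argument lets lookup results be l2-clipped before any combining. A minimal usage sketch, based on the tests added in this commit (the 1.x-era graph/Session style is assumed):

import tensorflow as tf

with tf.Session() as sess:
  embeddings = tf.constant([[2.0]])        # single row with l2 norm 2.0
  ids = tf.constant([0], dtype=tf.int32)
  # With max_norm=1.0, the looked-up row is rescaled to have l2 norm 1.0.
  clipped = tf.nn.embedding_lookup([embeddings], ids, max_norm=1.0)
  print(sess.run(clipped))                 # [[1.0]]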

tensorflow/contrib/layers/python/layers/embedding_ops.py

Lines changed: 6 additions & 2 deletions
@@ -41,7 +41,8 @@ def safe_embedding_lookup_sparse(embedding_weights,
                                  combiner=None,
                                  default_id=None,
                                  name=None,
-                                 partition_strategy="div"):
+                                 partition_strategy="div",
+                                 max_norm=None):
   """Lookup embedding results, accounting for invalid IDs and empty features.
 
   The partitioned embedding in `embedding_weights` must all be the same shape
@@ -75,6 +76,8 @@ def safe_embedding_lookup_sparse(embedding_weights,
     name: A name for this operation (optional).
     partition_strategy: A string specifying the partitioning strategy.
         Currently `"div"` and `"mod"` are supported. Default is `"div"`.
+    max_norm: If not None, all embeddings are l2-normalized to max_norm before
+        combining.
 
 
   Returns:
@@ -135,7 +138,8 @@ def safe_embedding_lookup_sparse(embedding_weights,
         sparse_weights,
         combiner=combiner,
         partition_strategy=partition_strategy,
-        name=None if default_id is None else scope)
+        name=None if default_id is None else scope,
+        max_norm=max_norm)
 
     if default_id is None:
       # Broadcast is_row_empty to the same shape as embedding_lookup_result,
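
The argument is simply forwarded by safe_embedding_lookup_sparse to the underlying sparse lookup. A rough sketch of a call through tf.contrib.layers (the sparse-input construction and the zero default for an all-invalid row are assumptions based on the function's documented behavior, not shown in this hunk):

import tensorflow as tf

with tf.Session() as sess:
  # One partition of embedding weights.
  embedding_weights = [tf.constant([[1.0, 2.0], [3.0, 4.0]])]
  # Row 0 has a valid id; row 1 contains only an invalid id (-1).
  sparse_ids = tf.SparseTensor([[0, 0], [1, 0]],
                               tf.constant([0, -1], dtype=tf.int64),
                               [2, 1])
  embedded = tf.contrib.layers.safe_embedding_lookup_sparse(
      embedding_weights, sparse_ids, combiner="mean", max_norm=1.0)
  # Row 0 is clipped to unit norm; row 1 should come back as zeros (no default_id).
  print(sess.run(embedded))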

tensorflow/contrib/layers/python/layers/feature_column.py

Lines changed: 11 additions & 6 deletions
@@ -162,7 +162,8 @@ class _DeepEmbeddingLookupArguments(
         "combiner",
         "dimension",
         "shared_embedding_name",
-        "hashed"])):
+        "hashed",
+        "max_norm"])):
   """Represents the information needed from a column for embedding lookup.
 
   Used to to compute DNN inputs and weighted sum.
@@ -822,7 +823,7 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
     "_EmbeddingColumn",
     ["sparse_id_column", "dimension", "combiner", "initializer",
      "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
-     "shared_vocab_size"])):
+     "shared_vocab_size", "max_norm"])):
   """Represents an embedding column.
 
   Args:
@@ -863,7 +864,8 @@ def __new__(cls,
               ckpt_to_load_from=None,
               tensor_name_in_ckpt=None,
               shared_embedding_name=None,
-              shared_vocab_size=None):
+              shared_vocab_size=None,
+              max_norm=None):
     if initializer is not None and not callable(initializer):
       raise ValueError("initializer must be callable if specified. "
                        "Embedding of column_name: {}".format(
@@ -882,7 +884,8 @@ def __new__(cls,
                                                 initializer, ckpt_to_load_from,
                                                 tensor_name_in_ckpt,
                                                 shared_embedding_name,
-                                                shared_vocab_size)
+                                                shared_vocab_size,
+                                                max_norm)
 
   @property
   def name(self):
@@ -922,7 +925,8 @@ def _deep_embedding_lookup_arguments(self, input_tensor):
         initializer=self.initializer,
         combiner=self.combiner,
         shared_embedding_name=self.shared_embedding_name,
-        hashed=False)
+        hashed=False,
+        max_norm=self.max_norm)
 
   def _checkpoint_path(self):
     if self.ckpt_to_load_from is not None:
@@ -1133,7 +1137,8 @@ def _deep_embedding_lookup_arguments(self, input_tensor):
         combiner=self.combiner,
         dimension=self.dimension,
         shared_embedding_name=None,
-        hashed=True)
+        hashed=True,
+        max_norm=None)
 
 
 def hashed_embedding_column(column_name,

tensorflow/contrib/layers/python/layers/feature_column_ops.py

Lines changed: 2 additions & 1 deletion
@@ -130,7 +130,8 @@ def _embeddings_from_arguments(column,
         input_tensor,
         sparse_weights=weight_tensor,
         combiner=args.combiner,
-        name=column.name + 'weights')
+        name=column.name + 'weights',
+        max_norm=args.max_norm)
 
 
 def _input_from_feature_columns(columns_to_tensors,

tensorflow/contrib/opt/python/training/variable_clipping_optimizer.py

Lines changed: 5 additions & 0 deletions
@@ -42,7 +42,12 @@ class VariableClippingOptimizer(optimizer.Optimizer):
   Multiple instances of `VariableClippingOptimizer` may be chained to specify
   different max norms for different subsets of variables.
 
+  This is more efficient at serving-time than using normalization during
+  embedding lookup, at the expense of more expensive training and fewer
+  guarantees about the norms.
+
   @@__init__
+
   """
 
   def __init__(self,
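
For contrast with the new lookup-time argument, the train-time alternative referenced in the docstring might look roughly like this. It is only a sketch: the export as tf.contrib.opt.VariableClippingOptimizer, the positional constructor arguments (wrapped optimizer, a dict mapping each variable to the dimensions its norm is computed over, then the max norm), and the toy loss are assumptions.

import tensorflow as tf

# Hypothetical setup: clip each embedding row after optimizer updates,
# instead of normalizing at lookup time.
embeddings = tf.get_variable("embeddings", shape=[1000, 64])
loss = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, tf.constant([1, 2])))
base_opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
clipping_opt = tf.contrib.opt.VariableClippingOptimizer(
    base_opt, {embeddings: [1]}, 2.0)
train_op = clipping_opt.minimize(loss)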

tensorflow/python/kernel_tests/embedding_ops_test.py

Lines changed: 20 additions & 0 deletions
@@ -228,6 +228,26 @@ def testSimpleSharded(self):
       self.assertAllEqual(np_result, tf_result)
       self.assertShapeEqual(np_result, embedding)
 
+  def testMaxNorm(self):
+    with self.test_session():
+      embeddings = tf.constant([[2.0]])
+
+      ids = tf.constant([0], dtype=tf.int32)
+      embedding = tf.nn.embedding_lookup([embeddings], ids, max_norm=1.0)
+
+      self.assertAllEqual(embedding.eval(), [[1.0]])
+
+  def testMaxNormNontrivial(self):
+    with self.test_session():
+      embeddings = tf.constant([[2.0, 4.0], [3.0, 1.0]])
+
+      ids = tf.constant([0, 1], dtype=tf.int32)
+      embedding = tf.nn.embedding_lookup([embeddings], ids, max_norm=2.0)
+
+      norms = tf.sqrt(tf.reduce_sum(embeddings * embeddings, axis=1))
+      normalized = embeddings / tf.stack([norms, norms], axis=1)
+      self.assertAllEqual(embedding.eval(), 2 * normalized.eval())
+
   def testSimpleShardedPartitionedVariable(self):
     with self.test_session() as sess:
       num_shards = 2
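
As a sanity check on testMaxNormNontrivial: both rows of [[2.0, 4.0], [3.0, 1.0]] have l2 norm greater than 2.0 (sqrt(20) and sqrt(10)), so each is rescaled to norm exactly 2.0. The expected values can be reproduced with NumPy:

import numpy as np

embeddings = np.array([[2.0, 4.0], [3.0, 1.0]])
norms = np.sqrt((embeddings ** 2).sum(axis=1))        # [sqrt(20), sqrt(10)]
expected = 2.0 * embeddings / norms[:, np.newaxis]    # rescale each row to norm 2.0
print(expected)  # approx [[0.894, 1.789], [1.897, 0.632]]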

tensorflow/python/ops/embedding_ops.py

Lines changed: 22 additions & 7 deletions
@@ -25,6 +25,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -33,7 +34,7 @@
 
 
 def embedding_lookup(params, ids, partition_strategy="mod", name=None,
-                     validate_indices=True):
+                     validate_indices=True, max_norm=None):
   """Looks up `ids` in a list of embedding tensors.
 
   This function is used to perform parallel lookups on the list of
@@ -73,6 +74,8 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       is `"mod"`.
     name: A name for the operation (optional).
     validate_indices: Whether or not to validate gather indices.
+    max_norm: If not None, embedding values are l2-normalized to the value of
+      max_norm.
 
   Returns:
     A `Tensor` with the same type as the tensors in `params`.
@@ -86,17 +89,26 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
     params = list(params)  # Iterate to get the underlying Variables.
   if not isinstance(params, list):
     params = [params]
+  def maybe_normalize(x):
+    if max_norm is not None:
+      if x.get_shape().ndims is not None:
+        ndims = x.get_shape().ndims
+      else:
+        ndims = array_ops.size(array_ops.shape(x))
+      return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
+    return x
   with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
     np = len(params)  # Number of partitions
     params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
     if np == 1:
       with ops.colocate_with(params[0]):
         # TODO(apassos): implement the sharded version as well.
         if isinstance(params[0], resource_variable_ops.ResourceVariable):
-          return params[0].sparse_read(ids, name=name)
+          ret = params[0].sparse_read(ids, name=name)
         else:
-          return array_ops.gather(params[0], ids, name=name,
-                                  validate_indices=validate_indices)
+          ret = array_ops.gather(params[0], ids, name=name,
+                                 validate_indices=validate_indices)
+        return maybe_normalize(ret)
     else:
       ids = ops.convert_to_tensor(ids, name="ids")
       flat_ids = array_ops.reshape(ids, [-1])
@@ -180,13 +192,14 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       # Normally the reshape is sufficient, but setting shape explicitly
       # teaches shape inference that params[1:].get_shape() matters.
       ret.set_shape(ids.get_shape().concatenate(element_shape))
-      return ret
+      return maybe_normalize(ret)
 
 
 def embedding_lookup_sparse(params, sp_ids, sp_weights,
                             partition_strategy="mod",
                             name=None,
-                            combiner=None):
+                            combiner=None,
+                            max_norm=None):
   """Computes embeddings for the given ids and weights.
 
   This op assumes that there is at least one id for each row in the dense tensor
@@ -216,6 +229,8 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
         "mean" is the weighted sum divided by the total weight.
         "sqrtn" is the weighted sum divided by the square root of the sum of the
         squares of the weights.
+    max_norm: If not None, each embedding is normalized to have l2 norm equal
+      to max_norm before combining.
 
   Returns:
     A dense tensor representing the combined embeddings for the
@@ -291,7 +306,7 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
       idx = None
 
     embeddings = embedding_lookup(
-        params, ids, partition_strategy=partition_strategy)
+        params, ids, partition_strategy=partition_strategy, max_norm=max_norm)
     if not ignore_weights:
       weights = sp_weights.values
       if weights.dtype != embeddings.dtype:
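
Since maybe_normalize delegates to clip_ops.clip_by_norm, rows whose norm is already at or below max_norm pass through unchanged; only rows exceeding it are rescaled. A small sketch of that behavior (1.x Session style assumed):

import tensorflow as tf

with tf.Session() as sess:
  embeddings = tf.constant([[3.0, 4.0],    # l2 norm 5.0 -> rescaled to norm 1.0
                            [0.3, 0.4]])   # l2 norm 0.5 -> left unchanged
  ids = tf.constant([0, 1], dtype=tf.int32)
  looked_up = tf.nn.embedding_lookup([embeddings], ids, max_norm=1.0)
  print(sess.run(looked_up))               # approx [[0.6, 0.8], [0.3, 0.4]]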
