Skip to content

Commit 334c075

Browse files
zoyahavtf-transform-team
authored and committed
Update graph_tools to allow it to recursively inspect FuncGraphs produced by tf.functions.
PiperOrigin-RevId: 275588693
1 parent cea633a commit 334c075

File tree

7 files changed

+752
-161
lines changed

7 files changed

+752
-161
lines changed

RELEASE.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
* Moved beam/shared lib to `tfx-bsl`. If running with latest master, `tfx-bsl`
2727
must also be latest master.
2828
* Depends on `tfx-bsl>=0.15,<0.16`.
29+
* `preprocessing_fn`s now have beta support of calls to `tf.function`s, as long
30+
as they don't contain calls to `tf.Transform` analyzers/mappers or table
31+
initializers.
2932

3033
## Breaking changes
3134
* `always_return_num_quantiles` changed to default to True in `tft.quantiles`

tensorflow_transform/beam/analysis_graph_builder.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import collections
2121
import copy
2222
import hashlib
23-
import uuid
2423

2524
# GOOGLE-INITIALIZATION
2625

@@ -41,6 +40,14 @@
4140
def _serialize_op_attr(op_attr):
4241
"""Deterministicly serializes tf.Operation attrs since it is a map."""
4342
sorted_attributes = sorted(op_attr.items(), key=lambda kv: kv[0])
43+
if 'f' in op_attr:
44+
# This is a tf.Function node, and it includes attributes that are
45+
# inconsistent across runs such as _gradient_op_type, config_proto, so we
46+
# only keep input and output types since other information will arrive from
47+
# the FuncGraph attributes.
48+
sorted_attributes = [
49+
kv for kv in sorted_attributes if kv[0] in ('Tin', 'Tout')
50+
]
4451
result = []
4552
for key, attr_value in sorted_attributes:
4653
result.append(key)
@@ -49,8 +56,7 @@ def _serialize_op_attr(op_attr):
4956
raise ValueError(
5057
'Unable to serialize op attributes that contain a `list.func` field')
5158
if attr_value.HasField('func'):
52-
# TODO(b/138796127): Support tf.function fingerprint.
53-
result.append(uuid.uuid4().hex)
59+
# There should be a separate call for the FuncGraph attributes.
5460
attr_value.ClearField('func')
5561
result.append(attr_value.SerializeToString())
5662
return result

tensorflow_transform/beam/analysis_graph_builder_test.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,12 @@ def _preprocessing_fn_with_no_analyzers(inputs):
4848

4949

5050
def _preprocessing_fn_with_one_analyzer(inputs):
51-
x = inputs['x']
51+
52+
@tf.function
53+
def _plus_one(x):
54+
return x + 1
55+
56+
x = _plus_one(inputs['x'])
5257
x_mean = tft.mean(x, name='x')
5358
x_centered = x - x_mean
5459
return {'x_centered': x_centered}

tensorflow_transform/beam/cached_impl_test.py

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from __future__ import division
2020
from __future__ import print_function
2121
import collections
22+
import functools
2223
import itertools
2324
import os
2425
import struct
@@ -1019,12 +1020,14 @@ def preprocessing_fn(inputs):
10191020
preprocessing_fn, pipeline=p))
10201021
self.assertFalse(output_cache)
10211022

1022-
def test_tf_function_fails_cache(self):
1023+
def test_tf_function_works_with_cache(self):
10231024

1024-
def preprocessing_fn(inputs):
1025+
def preprocessing_fn(inputs, should_add_one):
10251026

10261027
@tf.function
10271028
def identity(x):
1029+
if should_add_one:
1030+
x = x + 1
10281031
return x
10291032

10301033
return {
@@ -1035,8 +1038,9 @@ def identity(x):
10351038

10361039
feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
10371040
input_data_dict = {'span-0': [dict(x=-2), dict(x=4)]}
1038-
run_result = self._run_pipeline(feature_spec, input_data_dict,
1039-
preprocessing_fn)
1041+
run_result = self._run_pipeline(
1042+
feature_spec, input_data_dict,
1043+
functools.partial(preprocessing_fn, should_add_one=False))
10401044
first_cache_output, p1 = run_result.cache_output, run_result.pipeline
10411045

10421046
for key in input_data_dict:
@@ -1050,12 +1054,100 @@ def identity(x):
10501054
_get_counter_value(p1.metrics, 'saved_models_created'),
10511055
_SINGLE_PHASE_NUM_SAVED_MODELS)
10521056

1057+
# Cache is still valid since the contents of the tf.function are the same.
1058+
run_result = self._run_pipeline(
1059+
feature_spec,
1060+
input_data_dict,
1061+
functools.partial(preprocessing_fn, should_add_one=False),
1062+
should_read_cache=True)
1063+
second_cache_output, p2 = run_result.cache_output, run_result.pipeline
1064+
1065+
self.assertFalse(second_cache_output)
1066+
1067+
self.assertEqual(_get_counter_value(p2.metrics, 'num_instances'), 0)
1068+
self.assertEqual(_get_counter_value(p2.metrics, 'cache_entries_decoded'), 1)
1069+
self.assertEqual(_get_counter_value(p2.metrics, 'cache_entries_encoded'), 0)
1070+
self.assertEqual(
1071+
_get_counter_value(p2.metrics, 'saved_models_created'),
1072+
_ZERO_PHASE_NUM_SAVED_MODELS)
1073+
1074+
self.assertEqual(_get_counter_value(p2.metrics, 'num_instances'), 0)
1075+
self.assertEqual(_get_counter_value(p2.metrics, 'cache_entries_decoded'), 1)
1076+
self.assertEqual(_get_counter_value(p2.metrics, 'cache_entries_encoded'), 0)
1077+
self.assertEqual(_get_counter_value(p2.metrics, 'saved_models_created'), 1)
1078+
1079+
# Modifying the tf.function contents causes cache invalidation.
1080+
run_result = self._run_pipeline(
1081+
feature_spec,
1082+
input_data_dict,
1083+
functools.partial(preprocessing_fn, should_add_one=True),
1084+
should_read_cache=True)
1085+
third_output_cache, p3 = run_result.cache_output, run_result.pipeline
1086+
1087+
for key in input_data_dict:
1088+
self.assertIn(key, third_output_cache)
1089+
self.assertEqual(1, len(third_output_cache[key]))
1090+
1091+
self.assertEqual(_get_counter_value(p3.metrics, 'num_instances'), 2)
1092+
self.assertEqual(_get_counter_value(p3.metrics, 'cache_entries_decoded'), 0)
1093+
self.assertEqual(_get_counter_value(p3.metrics, 'cache_entries_encoded'), 1)
1094+
self.assertEqual(_get_counter_value(p3.metrics, 'saved_models_created'), 2)
1095+
1096+
def test_incomplete_graphs_fail_cache(self):
1097+
1098+
def preprocessing_fn(inputs):
1099+
# Subtract 10 from x using a tf.while_loop.
1100+
@tf.function(input_signature=[
1101+
tf.TensorSpec([], tf.int32),
1102+
tf.TensorSpec([], tf.int64)
1103+
])
1104+
def stop_condition(counter, x_minus_counter):
1105+
del x_minus_counter # unused
1106+
return tf.less(counter, 10)
1107+
1108+
@tf.function(input_signature=[
1109+
tf.TensorSpec([], tf.int32),
1110+
tf.TensorSpec([], tf.int64)
1111+
])
1112+
def iteration(counter, x_minus_counter):
1113+
return tf.add(counter, 1), tf.add(x_minus_counter, -1)
1114+
1115+
initial_values = [tf.constant(0), inputs['x']]
1116+
final_values = tf.raw_ops.While(
1117+
cond=stop_condition.get_concrete_function(),
1118+
body=iteration.get_concrete_function(),
1119+
input=initial_values)
1120+
1121+
y = final_values[1]
1122+
1123+
return {'y': tft.mean(y) + tf.zeros_like(inputs['x'], dtype=tf.float32)}
1124+
1125+
feature_spec = {
1126+
'x': tf.io.FixedLenFeature([], tf.int64),
1127+
}
1128+
input_data_dict = {
1129+
'span-0': [dict(x=-2), dict(x=4)],
1130+
}
10531131
run_result = self._run_pipeline(feature_spec, input_data_dict,
10541132
preprocessing_fn)
1133+
first_cache_output, p1 = run_result.cache_output, run_result.pipeline
1134+
1135+
for key in input_data_dict:
1136+
self.assertIn(key, first_cache_output)
1137+
self.assertEqual(1, len(first_cache_output[key]))
1138+
1139+
self.assertEqual(_get_counter_value(p1.metrics, 'num_instances'), 2)
1140+
self.assertEqual(_get_counter_value(p1.metrics, 'cache_entries_decoded'), 0)
1141+
self.assertEqual(_get_counter_value(p1.metrics, 'cache_entries_encoded'), 1)
1142+
self.assertEqual(
1143+
_get_counter_value(p1.metrics, 'saved_models_created'),
1144+
_SINGLE_PHASE_NUM_SAVED_MODELS)
1145+
1146+
run_result = self._run_pipeline(
1147+
feature_spec, input_data_dict, preprocessing_fn, should_read_cache=True)
10551148
second_cache_output, p2 = run_result.cache_output, run_result.pipeline
10561149

1057-
# We expect a full output cache again because tf.function in the
1058-
# preprocessing_fn broke that cache entry.
1150+
# We expect the cache to fail here because the tf.function is now different.
10591151
for key in input_data_dict:
10601152
self.assertIn(key, second_cache_output)
10611153
self.assertEqual(1, len(second_cache_output[key]))

0 commit comments

Comments
 (0)