
Commit 14ee57f

tf-transform-team authored and elmer-garduno committed
Project import generated by Copybara (go/copybara).
PiperOrigin-RevId: 154310859
1 parent 77e0e29 commit 14ee57f

25 files changed: +2511 -725 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -49,4 +49,4 @@ independent from the specific runner as possible.
 ## Getting Started
 
 For instructions on using tf.Transform see the [getting started
-guide](./getting_started.md)
+guide](./getting_started.md).

examples/sentiment_example.py
Lines changed: 11 additions & 37 deletions

@@ -46,9 +46,10 @@
 NUM_TEST_INSTANCES = 25000
 
 REVIEW_COLUMN = 'review'
+REVIEW_WEIGHT = 'review_weight'
 LABEL_COLUMN = 'label'
 
-PUNCTUATION_CHARACTERS = ['.', ',', '!', '?', '(', ')']
+DELIMITERS = '.,!?() '
 
 
 # pylint: disable=invalid-name

@@ -139,43 +140,14 @@ def preprocessing_fn(inputs):
   """Preprocess input columns into transformed columns."""
   review = inputs[REVIEW_COLUMN]
 
-  def remove_character(s, char):
-    """Remove a character from a string.
-
-    Args:
-      s: A SparseTensor of rank 1 of type tf.string
-      char: A string of length 1
-
-    Returns:
-      The string `s` with the given character removed (i.e. replaced by
-      '')
-    """
-    # Hacky implementation where we split and rejoin.
-    split = tf.string_split(s, char)
-    rejoined = tf.reduce_join(
-        tf.sparse_to_dense(
-            split.indices, split.dense_shape, split.values, ''),
-        1)
-    return rejoined
-
-  def remove_punctuation(s):
-    """Remove punctuation from a string.
-
-    Args:
-      s: A SparseTensor of rank 1 of type tf.string
-
-    Returns:
-      The string `s` with punctuation removed.
-    """
-    for char in PUNCTUATION_CHARACTERS:
-      s = remove_character(s, char)
-    return s
-
-  cleaned_review = tft.map(remove_punctuation, review)
-  review_tokens = tft.map(tf.string_split, cleaned_review)
+  review_tokens = tft.map(lambda x: tf.string_split(x, DELIMITERS),
+                          review)
   review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
+  # Add one for the oov bucket created by string_to_int.
+  review_weight = tft.tfidf_weights(review_indices, VOCAB_SIZE + 1)
   return {
       REVIEW_COLUMN: review_indices,
+      REVIEW_WEIGHT: review_weight,
       LABEL_COLUMN: inputs[LABEL_COLUMN]
   }
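The rewritten preprocessing_fn leans on the fact that tf.string_split treats every character of its delimiter argument as a separator, so one call now both tokenizes and strips punctuation; tft.tfidf_weights then assigns each token a weight that shrinks the more reviews it appears in. A minimal sketch of the tokenization half against the TF 1.x API used by this commit (the input strings are invented for illustration):

    import tensorflow as tf

    DELIMITERS = '.,!?() '
    reviews = tf.constant(['A great movie, truly!', 'Not (very) good.'])
    # Each character in DELIMITERS may split, so punctuation vanishes and
    # the values of the resulting SparseTensor are the cleaned tokens.
    tokens = tf.string_split(reviews, DELIMITERS)

    with tf.Session() as sess:
      print(sess.run(tokens).values)  # ['A', 'great', 'movie', 'truly', ...]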

@@ -230,9 +202,11 @@ def train_and_evaluate(transformed_train_filepattern,
   review_column = feature_column.sparse_column_with_integerized_feature(
       REVIEW_COLUMN,
       bucket_size=VOCAB_SIZE + 1,
-      combiner='sqrtn')
+      combiner='sum')
+  weighted_reviews = feature_column.weighted_sparse_column(review_column,
+                                                           REVIEW_WEIGHT)
 
-  estimator = learn.LinearClassifier([review_column])
+  estimator = learn.LinearClassifier([weighted_reviews])
 
   transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
   train_input_fn = input_fn_maker.build_training_input_fn(
setup.py
Lines changed: 7 additions & 16 deletions

@@ -13,38 +13,29 @@
 # limitations under the License.
 """Package Setup script for the tf.Transform binary.
 """
-import os
-
 from setuptools import find_packages
 from setuptools import setup
 
+# Tensorflow transform version.
+__version__ = '0.1.8'
 
-def get_required_install_packages():
-  return [
 
+def _make_required_install_packages():
+  return [
       # Using >= for better integration tests. During release this is
      # automatically changed to a ==.
-      'google-cloud-dataflow == 0.6.0',
+      'apache-beam[gcp] == 0.6.0',
  ]
 
 
-def get_version():
-  # Obtain the version from the global names on version.py
-  # We cannot do 'from tensorflow_transform import version' since the transitive
-  # dependencies will not be available when the installer is created.
-  global_names = {}
-  execfile(os.path.normpath('tensorflow_transform/version.py'), global_names)
-  return global_names['__version__']
-
-
 setup(
     name='tensorflow-transform',
-    version=get_version(),
+    version=__version__,
     author='Google Inc.',
     author_email='tf-transform-feedback@google.com',
     license='Apache 2.0',
     namespace_packages=[],
-    install_requires=get_required_install_packages(),
+    install_requires=_make_required_install_packages(),
     packages=find_packages(),
     include_package_data=True,
     description='A library for data preprocessing with TensorFlow',
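The deleted get_version() leaned on execfile(), a Python-2-only builtin, which is why reading the version out of tensorflow_transform/version.py needed the os.path gymnastics; inlining __version__ lets setup.py avoid executing package code entirely. For contrast, a hedged Python-3-compatible sketch of what the removed helper did (not part of this commit):

    import os

    def get_version():
      # Execute version.py in an isolated namespace rather than importing the
      # package, whose transitive dependencies are absent at install time.
      path = os.path.normpath('tensorflow_transform/version.py')
      global_names = {}
      with open(path) as f:
        exec(compile(f.read(), path, 'exec'), global_names)
      return global_names['__version__']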

tensorflow_transform/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -17,4 +17,5 @@
 from tensorflow_transform.analyzers import *
 from tensorflow_transform.api import *
 from tensorflow_transform.mappers import *
+from tensorflow_transform.pretrained_models import *
 # pylint: enable=wildcard-import

tensorflow_transform/analyzers.py
Lines changed: 60 additions & 13 deletions

@@ -21,63 +21,105 @@
 from tensorflow_transform import api
 
 
-def min(x):  # pylint: disable=redefined-builtin
+def _get_output_shape(x, reduce_instance_dims):
+  """Determines the shape of the output of a numerical analyzer.
+
+  Args:
+    x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: If true, collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
+
+  Returns:
+    The shape to use for the output placeholder.
+  """
+  if reduce_instance_dims:
+    # Numerical analyzers produce scalar output by default
+    return ()
+  else:
+    in_shape = x.tensor.shape
+    if in_shape:
+      # The output will be the same shape as the input, but without the batch.
+      return in_shape.as_list()[1:]
+    else:
+      return None
+
+
+def min(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
   """Computes the minimum of a `Column`.
 
   Args:
     x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
 
   Returns:
     A `Statistic`.
   """
   if not isinstance(x.tensor, tf.Tensor):
     raise TypeError('Expected a Tensor, but got %r' % x.tensor)
 
+  arg_dict = {'reduce_instance_dims': reduce_instance_dims}
 
   # pylint: disable=protected-access
-  return api._AnalyzerOutput(tf.placeholder(x.tensor.dtype, ()),
-                             api.CanonicalAnalyzers.MIN, [x], {})
+  return api._AnalyzerOutput(
+      tf.placeholder(x.tensor.dtype, _get_output_shape(
+          x, reduce_instance_dims)), api.CanonicalAnalyzers.MIN, [x], arg_dict)
 
 
-def max(x):  # pylint: disable=redefined-builtin
+def max(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
   """Computes the maximum of a `Column`.
 
   Args:
     x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
 
   Returns:
     A `Statistic`.
   """
   if not isinstance(x.tensor, tf.Tensor):
     raise TypeError('Expected a Tensor, but got %r' % x.tensor)
 
+  arg_dict = {'reduce_instance_dims': reduce_instance_dims}
   # pylint: disable=protected-access
-  return api._AnalyzerOutput(tf.placeholder(x.tensor.dtype, ()),
-                             api.CanonicalAnalyzers.MAX, [x], {})
+  return api._AnalyzerOutput(
+      tf.placeholder(x.tensor.dtype, _get_output_shape(
+          x, reduce_instance_dims)), api.CanonicalAnalyzers.MAX, [x], arg_dict)
 
 
-def sum(x):  # pylint: disable=redefined-builtin
+def sum(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
   """Computes the sum of a `Column`.
 
   Args:
     x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
 
   Returns:
     A `Statistic`.
   """
   if not isinstance(x.tensor, tf.Tensor):
     raise TypeError('Expected a Tensor, but got %r' % x.tensor)
 
+  arg_dict = {'reduce_instance_dims': reduce_instance_dims}
   # pylint: disable=protected-access
-  return api._AnalyzerOutput(tf.placeholder(x.tensor.dtype, ()),
-                             api.CanonicalAnalyzers.SUM, [x], {})
+  return api._AnalyzerOutput(
+      tf.placeholder(x.tensor.dtype, _get_output_shape(
+          x, reduce_instance_dims)), api.CanonicalAnalyzers.SUM, [x], arg_dict)
 
 
-def size(x):
+def size(x, reduce_instance_dims=True):
   """Computes the total size of instances in a `Column`.
 
   Args:
     x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
 
   Returns:
     A `Statistic`.
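The new reduce_instance_dims flag decides whether an analyzer collapses a whole batch of instances to one scalar or keeps one statistic per instance position, which is also why _get_output_shape drops only the batch dimension in the False case. A hedged numpy analogy of the two reductions (not the tf.Transform API itself, just the shapes involved):

    import numpy as np

    # A batch of two instances, each a 3-vector.
    batch = np.array([[1, 5, 2],
                      [4, 0, 7]])

    np.min(batch)          # reduce_instance_dims=True: scalar output, 0
    np.min(batch, axis=0)  # reduce_instance_dims=False: per-position [1, 0, 2]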
@@ -86,14 +128,17 @@ def size(x):
     raise TypeError('Expected a Tensor, but got %r' % x.tensor)
 
   # Note: Calling `sum` defined in this module, not the builtin.
-  return sum(api.map(tf.ones_like, x))
+  return sum(api.map(tf.ones_like, x), reduce_instance_dims)
 
 
-def mean(x):
+def mean(x, reduce_instance_dims=True):
   """Computes the mean of the values in a `Column`.
 
   Args:
     x: An input `Column' wrapping a `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the output.
 
   Returns:
     A `Column` with an underlying `Tensor` of shape [1], containing the mean.

@@ -102,7 +147,9 @@ def mean(x):
     raise TypeError('Expected a Tensor, but got %r' % x.tensor)
 
   # Note: Calling `sum` defined in this module, not the builtin.
-  return api.map_statistics(tf.divide, sum(x), size(x))
+  return api.map_statistics(tf.divide,
+                            sum(x, reduce_instance_dims),
+                            size(x, reduce_instance_dims))
 
 
 def uniques(x, top_k=None, frequency_threshold=None):
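Because mean is composed from sum and size via map_statistics, the flag has to be threaded through both calls or the two reductions would disagree in shape. A hypothetical usage sketch in a preprocessing_fn, modeled on the getting-started example (column name and shape are invented):

    import tensorflow_transform as tft

    def preprocessing_fn(inputs):
      x = inputs['x']  # hypothetical column whose instances are 3-vectors
      # One mean per instance position instead of a single scalar.
      col_means = tft.mean(x, reduce_instance_dims=False)
      return {'x_centered': tft.map(lambda t, m: t - m, x, col_means)}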

tensorflow_transform/beam/analyzer_impls.py
Lines changed: 60 additions & 1 deletion

@@ -28,9 +28,22 @@
 from apache_beam.typehints import with_output_types
 
 import six
+import tensorflow as tf
 from tensorflow_transform.beam import common
 
 
+def flatten_value_to_list(batch):
+  """Converts an N-D dense or sparse batch to a 1-D list."""
+  if isinstance(batch, tf.SparseTensorValue):
+    dense_values = batch.values
+  else:
+    dense_values = batch
+  # Ravel for flattening and tolist so that we go to native Python types
+  # for more efficient followup processing.
+  #
+  return dense_values.ravel().tolist()
+
+
 @with_input_types(List[common.NUMERIC_TYPE])
 @with_output_types(common.NUMERIC_TYPE)
 class _NumericAnalyzer(beam.PTransform):
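flatten_value_to_list gives every analyzer a uniform view of a batch, whether it arrives as a dense ndarray or as a tf.SparseTensorValue whose .values carries the data. A hedged standalone sketch of the dense path (input invented):

    import numpy as np

    batch = np.array([[1, 2], [3, 4]])  # a 2-D dense batch
    flat = batch.ravel().tolist()       # [1, 2, 3, 4] as native Python ints
    assert all(isinstance(v, int) for v in flat)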
@@ -40,12 +53,57 @@ def __init__(self, fn):
     self._fn = fn
 
   def expand(self, pcoll):
+    pcoll |= 'FlattenValueToList' >> beam.Map(flatten_value_to_list)
     return (pcoll
             | 'CombineWithinList' >> beam.Map(self._fn)
             | 'CombineGlobally'
             >> beam.CombineGlobally(self._fn).without_defaults())
 
 
+@with_input_types(List[common.PRIMITIVE_TYPE])
+@with_output_types(List[common.PRIMITIVE_TYPE])
+class _NumericAnalyzerOnBatchDim(beam.PTransform):
+  """Reduces a PCollection on the batch dimension using the given function.
+
+  Args:
+    fn: The function used to reduce the PCollection. It must take as inputs an
+        ndarray of data, and an axis parameter used to specify that the
+        reduction should only happen along the batch dimension, and all
+        instance dimensions should be preserved.
+  """
+
+  class _CombineOnBatchDim(beam.CombineFn):
+    """Combines the PCollection only on the 0th dimension using nparray."""
+
+    def __init__(self, fn):
+      self._fn = fn
+
+    def create_accumulator(self):
+      return []
+
+    def add_input(self, accumulator, next_input):
+      batch = self._fn(next_input, axis=0)
+      if any(accumulator):
+        return self._fn((accumulator, batch), axis=0)
+      else:
+        return batch
+
+    def merge_accumulators(self, accumulators):
+      # numpy's sum, min, max, etc functions operate on array-like objects, but
+      # not arbitrary iterables. Convert the provided accumulators into a list
+      return self._fn(list(accumulators), axis=0)
+
+    def extract_output(self, accumulator):
+      return accumulator
+
+  def __init__(self, fn):
+    self._fn = fn
+
+  def expand(self, pcoll):
+    return (pcoll | 'CombineOnBatchDim'
+            >> beam.CombineGlobally(self._CombineOnBatchDim(self._fn)))
+
+
 @with_input_types(List[common.PRIMITIVE_TYPE])
 @with_output_types(List[common.PRIMITIVE_TYPE])
 class _UniquesAnalyzer(beam.PTransform):
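_CombineOnBatchDim folds each incoming [batch, ...] ndarray into a running per-position accumulator along axis 0 and merges partial results the same way, so the instance dimensions survive. A hedged standalone sketch of that accumulation with numpy's min, outside Beam (batches invented; an explicit len() check stands in for the class's truthiness test so an all-zero accumulator is not mistaken for an empty one):

    import numpy as np

    fn = np.min
    acc = []                                   # create_accumulator()
    for batch in (np.array([[3, 1], [2, 5]]),
                  np.array([[0, 9]])):
      reduced = fn(batch, axis=0)              # collapse only the batch dim
      # add_input(): fold the new partial result into the accumulator.
      acc = fn((acc, reduced), axis=0) if len(acc) else reduced

    print(acc)  # [0 1], the per-position minimum across both batches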
@@ -63,6 +121,7 @@ def expand(self, pcoll):
     # this to create a single element PCollection containing this list of
     # pairs in sorted order by decreasing counts (and by values for equal
     # counts).
+    pcoll |= 'FlattenValueToList' >> beam.Map(flatten_value_to_list)
 
     counts = (
         pcoll

@@ -90,7 +149,7 @@ def expand(self, pcoll):
     # from a single file.
     #
     @beam.ptransform_fn
-    def Reshard(pcoll):
+    def Reshard(pcoll):  # pylint: disable=invalid-name
       return (
           pcoll
           | 'PairWithNone' >> beam.Map(lambda x: (None, x))
