Merge pull request #19 from tensorflow/tft-0.1.10

Project import generated by Copybara. PiperOrigin-RevId: 157835649
tensorflow · Jun 2, 2017 · a87a5a3 · a87a5a3
2 parents 3206f45 + 3703673
commit a87a5a3
Show file tree

Hide file tree

Showing 13 changed files with 1,258 additions and 322 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,29 @@
+# Release 0.1.10
+
+## Major Features and Improvements
+* Add json-example serving input functions to TF.Transform.
+* Add variance analyzer to tf.transform.
+
+## Bug Fixes and Other Changes
+* Remove duplication in output of `tft.tfidf`.
+* Ensure ngrams output dense_shape is greater than or equal to 0.
+* Alter the behavior and interface of `tensorflow_transform.mappers.ngrams`.
+* Use `apache-beam[gcp] >=2,<3`.
+* Make TF parallelism runner-dependent.
+* Fix issue with csv serving input function.
+
+## Deprecations
+* `tft.map` will be removed in version 0.2.0; see the `examples` directory for
+  instructions on how to use `tft.apply_function` instead (as needed).
+* `tft.tfidf_weights` will be removed in version 0.2.0; use `tft.tfidf` instead.
+
+# Release 0.1.9
+
+## Major Features and Improvements
+* Refactor internals to remove Column and Statistic classes
+
+## Bug Fixes and Other Changes
+* Remove collections from graph to avoid warnings
+* Return float32 from tfidf_weights
+* Update tensorflow_transform to use tf.saved_model APIs.
+* Add default values on example proto coder.
diff --git a/examples/census_example.py b/examples/census_example.py
@@ -116,7 +116,8 @@ def preprocessing_fn(inputs):
     def convert_label(label):
       table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
       return table.lookup(label)
-    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])
+    outputs[LABEL_COLUMN] = tft.apply_function(convert_label,
+                                               inputs[LABEL_COLUMN])
 
     return outputs
 

diff --git a/examples/sentiment_example.py b/examples/sentiment_example.py
@@ -140,13 +140,13 @@ def preprocessing_fn(inputs):
         """Preprocess input columns into transformed columns."""
         review = inputs[REVIEW_COLUMN]
 
-        review_tokens = tft.map(lambda x: tf.string_split(x, DELIMITERS),
-                                review)
+        review_tokens = tf.string_split(review, DELIMITERS)
         review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
         # Add one for the oov bucket created by string_to_int.
-        review_weight = tft.tfidf_weights(review_indices, VOCAB_SIZE + 1)
+        review_bow_indices, review_weight = tft.tfidf(review_indices,
+                                                      VOCAB_SIZE + 1)
         return {
-            REVIEW_COLUMN: review_indices,
+            REVIEW_COLUMN: review_bow_indices,
             REVIEW_WEIGHT: review_weight,
             LABEL_COLUMN: inputs[LABEL_COLUMN]
         }

diff --git a/examples/simple_example.py b/examples/simple_example.py
@@ -32,11 +32,10 @@ def preprocessing_fn(inputs):
   x = inputs['x']
   y = inputs['y']
   s = inputs['s']
-  x_centered = tft.map(lambda x, mean: x - mean, x, tft.mean(x))
+  x_centered = x - tft.mean(x)
   y_normalized = tft.scale_to_0_1(y)
   s_integerized = tft.string_to_int(s)
-  x_centered_times_y_normalized = tft.map(lambda x, y: x * y,
-                                          x_centered, y_normalized)
+  x_centered_times_y_normalized = (x_centered * y_normalized)
   return {
       'x_centered': x_centered,
       'y_normalized': y_normalized,

diff --git a/getting_started.md b/getting_started.md
@@ -11,35 +11,27 @@ aspects of the usage of tf.Transform.
 ## Defining a Preprocessing Function
 
 The most important concept of tf.Transform is the "preprocessing function". This
-is a logical description of a transformation of a dataset. The dataset is
-conceptualized as a dictionary of columns, and the preprocessing function is
-defined by two basic mechanisms:
-
-1) Applying `tft.map`, which takes a user-defined function that accepts and
-returns tensors. Such a function can use any TensorFlow operation to construct
-the output tensors from the inputs. The remaining arguments of `tft.map` are the
-columns that the function should be applied to. The number of columns provided
-should equal the number of arguments to the user-defined function. Like the
-Python `map` function, `tft.map` applies the user-provided function to the
-elements in the columns specified. Each row is treated independently, and the
-output is a column containing the results (but see the note on batching at the
-end of this section).
-
-2) Applying any of the tf.Transform provided "analyzers". Analyzers are
-functions that accept one or more `Column`s and return some summary statistic
-for the input column or columns. A statistic is like a column except that it
-only has a single value. An example of an analyzer is `tft.min` which computes
-the minimum of a column. Currently tf.Transform provides a fixed set of
-analyzers, but this will be extensible in future versions.
-
-In fact, `tft.map` can also accept statistics, which is how statistics are
-incorporated into the user-defined pipeline. By combining analyzers and
-`tft.map`, users can flexibly create pipelines for transforming their data. In
-particular, users should define a "preprocessing function" which accepts and
-returns columns.
-
-The following preprocessing function transforms each of three columns in
-different ways, and combines two of the columns.
+is a logical description of a transformation of a dataset.  The preprocessing
+function accepts and returns a dictionary of tensors (in this guide, "tensors"
+generally means `Tensor`s or `SparseTensor`s).  There are two kinds of functions
+that can be used to define the preprocessing function:
+
+1) Any function that accepts and returns tensors.  These will add TensorFlow
+operations to the graph that transforms raw data into transformed data.
+
+2) Any of the tf.Transform provided "analyzers". Analyzers also accept and return
+tensors, but unlike typical TensorFlow functions they don't add TF Operations
+to the graph.  Instead, they cause tf.Transform to compute a full pass operation
+outside of TensorFlow, using the input tensor values over the full dataset to
+generate a constant tensor that gets returned as the output.  For example
+`tft.min` computes the minimum of a tensor over the whole dataset. Currently
+tf.Transform provides a fixed set of analyzers, but this will be extensible in
+future versions.
+
+By combining analyzers and regular TensorFlow functions, users can flexibly
+create pipelines for transforming their data.  The following preprocessing
+function transforms each of three features in different ways, and combines two
+of the features.
 
 ```
 import tensorflow as tf
@@ -49,11 +41,10 @@ def preprocessing_fn(inputs):
   x = inputs['x']
   y = inputs['y']
   s = inputs['s']
-  x_centered = tft.map(lambda x, mean: x - mean, x, tft.mean(x))
+  x_centered = x - tft.mean(x)
   y_normalized = tft.scale_to_0_1(y)
   s_integerized = tft.string_to_int(s)
-  x_centered_times_y_normalized = tft.map(lambda x, y: x * y,
-                                          x_centered, y_normalized)
+  x_centered_times_y_normalized = x_centered * y_normalized
   return {
       'x_centered': x_centered,
       'y_normalized': y_normalized,
@@ -62,32 +53,29 @@ def preprocessing_fn(inputs):
   }
 ```
 
-`x`, `y` and `s` are local variables that represent input columns, that are
-declared for code brevity. The first new column to be constructed, `x_centered`,
-is constructed by composing `tft.map` and `tft.mean`. `tft.mean(x)` returns a
-statistic representing the mean of the column `x`. The lambda passed to
-`tft.map` is simply subtraction, where the first argument is the column `x` and
-the second is the statistic `tft.mean(x)`. Thus `x_centered` is the column `x`
+`x`, `y` and `s` are `Tensor`s that represent input features. The first new
+tensor to be constructed, `x_centered`, is constructed by applying `tft.mean`
+to `x` and subtracting this from `x`. `tft.mean(x)` returns a tensor
+representing the mean of the tensor `x`. Thus `x_centered` is the tensor `x`
 with the mean subtracted.
 
-The second new column is `y_normalized`, created in a similar manner but using
+The second new tensor is `y_normalized`, created in a similar manner but using
 the convenience method `tft.scale_to_0_1`. This method does something similar
 under the hood to what is done to compute `x_centered`, namely computing a max
 and min and using these to scale `y`.
 
-The column `s_integerized` shows an example of string manipulation. In this
+The tensor `s_integerized` shows an example of string manipulation. In this
 simple case we take a string and map it to an integer. This too uses a
-convenience function, where the analyzer that is applied computes the unique
-values taken by the column, and the map uses these values as a dictionary to
-convert to an integer.
+convenience function, `tft.string_to_int`.   This function uses an analyzer to
+compute the unique values taken by the input strings, and then uses TensorFlow
+ops to convert the input strings to indices in the table of unique values.
 
-The final column shows that it is possible to use `tft.map` not only to
-manipulate a single column but also to combine columns.
+The final tensor shows that it is possible to use TensorFlow operations to
+create new features by combining tensors.
 
-Note that `Column`s are not themselves wrappers around data. Rather they are
-placeholders used to construct a definition of the user's logical pipeline. In
-order to apply such a pipeline to data, we rely on a concrete implementation of
-the tf.Transform API. The Apache Beam implementation provides `PTransform`s that
+The preprocessing function defines a pipeline of operations on a dataset.  In
+order to apply such a pipeline, we rely on a concrete implementation of the
+tf.Transform API. The Apache Beam implementation provides `PTransform`s that
 apply a user's preprocessing function to data. The typical workflow of a
 tf.Transform user will be to construct a preprocessing function, and then
 incorporate this into a larger Beam pipeline, ultimately materializing the data
@@ -100,13 +88,14 @@ tf.Transform is to provide the TensorFlow graph for preprocessing that can be
 incorporated into the serving graph (and optionally the training graph),
 batching is also an important concept in tf.Transform.
 
-While it is not obvious from the example above, the user defined function passed
-to `tft.map` will be passed tensors representing *batches*, not individual
-instances, just as will happen during training and serving with TensorFlow. This
-is only the case for inputs that are `Column`s, not `Statistic`s. Thus the
-actual tensors used in the `tft.map` for `x_centered` are 1) a rank 1 tensor,
-representing a batch of values from the column `x`, whose first dimension is the
-batch dimension; and 2) a rank 0 tensor representing the mean of that column.
+While it is not obvious from the example above, the user defined preprocessing
+function will be passed tensors representing *batches*, not individual
+instances, just as will happen during training and serving with TensorFlow.  On
+the other hand, analyzers perform a computation over the whole dataset and
+return a single value, not a batch of values.  Thus `x` is a `Tensor` of shape
+`(batch_size,)` while `tft.mean(x)` is a `Tensor` of shape `()`.  The
+subtraction `x - tft.mean(x)` involves broadcasting where the value of
+`tft.mean(x)` is subtracted from every element of the batch represented by `x`.
 
 ## The Canonical Beam Implementation
 
@@ -322,17 +311,20 @@ def preprocessing_fn(inputs):
   def convert_label(label):
     table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
     return table.lookup(label)
-  outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])
+  outputs[LABEL_COLUMN] = tft.apply_function(
+      convert_label, inputs[LABEL_COLUMN])
 
   return outputs
 ```
 
-One difference from the previous example is that we convert the outputs from
-scalars to single element vectors. This allows the data to be correctly read
-during training. Also for the label column, we manually specify the mapping from
-string to index so that ">50K" gets mapped to 0 and "<=50K" gets mapped to 1.
-This is useful so that we know which index in the trained model corresponds to
-which label.
+One difference from the previous example is that for the label column, we
+manually specify the mapping from string to index so that ">50K" gets mapped to
+0 and "<=50K" gets mapped to 1. This is useful so that we know which index in
+the trained model corresponds to which label.  We cannot apply the function
+`convert_label` directly to its arguments because `tf.Transform` needs to know
+about the `Table` defined in `convert_label`.  That is, `convert_label` is not
+a pure function but involves table initialization.  For such functions, we use
+`tft.apply_function` to wrap the function application.
 
 The `raw_data` variable represents a `PCollection` containing data in the same
 format as the list `raw_data` from the previous example, and the use of the

diff --git a/setup.py b/setup.py
@@ -17,14 +17,12 @@
 from setuptools import setup
 
 # Tensorflow transform version.
-__version__ = '0.1.9'
+__version__ = '0.1.10'
 
 
 def _make_required_install_packages():
   return [
-      # Using >= for better integration tests. During release this is
-      # automatically changed to a ==.
-      'apache-beam[gcp] == 0.6.0',
+      'apache-beam[gcp]>=2,<3',
   ]
 
 

diff --git a/tensorflow_transform/analyzers.py b/tensorflow_transform/analyzers.py
@@ -45,20 +45,20 @@ class Analyzer(object):
 
   Args:
     inputs: The inputs to the analyzer.
-    output_shapes_and_dtype: List of pairs of (shape, dtype) for each output.
+    output_dtypes_and_shapes: List of pairs of (dtype, shape) for each output.
     spec: A description of the computation to be done.
 
   Raises:
     ValueError: If the inputs are not all `Tensor`s.
   """
 
-  def __init__(self, inputs, output_shapes_and_dtypes, spec):
+  def __init__(self, inputs, output_dtypes_and_shapes, spec):
     for tensor in inputs:
       if not isinstance(tensor, tf.Tensor):
         raise ValueError('Analyzers can only accept `Tensor`s as inputs')
     self._inputs = inputs
-    self._outputs = [tf.placeholder(shape, dtype)
-                     for shape, dtype in output_shapes_and_dtypes]
+    self._outputs = [tf.placeholder(dtype, shape)
+                     for dtype, shape in output_dtypes_and_shapes]
     self._spec = spec
     tf.add_to_collection(ANALYZER_COLLECTION, self)
 
@@ -131,7 +131,7 @@ def min(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
         dimension and outputs a `Tensor` of the same shape as the input.
 
   Returns:
-    A `Tensor`.
+    A `Tensor`. Has the same type as `x`.
   """
   return _numeric_combine(x, NumericCombineSpec.MIN, reduce_instance_dims)
 
@@ -146,7 +146,7 @@ def max(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
         dimension and outputs a vector of the same shape as the input.
 
   Returns:
-    A `Tensor`.
+    A `Tensor`. Has the same type as `x`.
   """
   return _numeric_combine(x, NumericCombineSpec.MAX, reduce_instance_dims)
 
@@ -161,7 +161,7 @@ def sum(x, reduce_instance_dims=True):  # pylint: disable=redefined-builtin
         dimension and outputs a vector of the same shape as the input.
 
   Returns:
-    A `Tensor`.
+    A `Tensor`. Has the same type as `x`.
   """
   return _numeric_combine(x, NumericCombineSpec.SUM, reduce_instance_dims)
 
@@ -176,7 +176,7 @@ def size(x, reduce_instance_dims=True):
         dimension and outputs a vector of the same shape as the input.
 
   Returns:
-    A `Tensor`.
+    A `Tensor`. Has the same type as `x`.
   """
   with tf.name_scope('size'):
     # Note: Calling `sum` defined in this module, not the builtin.
@@ -193,14 +193,44 @@ def mean(x, reduce_instance_dims=True):
         dimension and outputs a vector of the same shape as the input.
 
   Returns:
-    A `Tensor` containing the mean.
+    A `Tensor` containing the mean. If `x` is floating point, the mean will
+    have the same type as `x`. If `x` is integral, the output is cast to float32
+    for int8 and int16 and float64 for int32 and int64 (similar to the behavior
+    of tf.truediv).
   """
   with tf.name_scope('mean'):
     # Note: Calling `sum` defined in this module, not the builtin.
     return tf.divide(
         sum(x, reduce_instance_dims), size(x, reduce_instance_dims))
 
 
+def var(x, reduce_instance_dims=True):
+  """Computes the variance of the values of a `Tensor` over the whole dataset.
+
+  Uses the biased variance (0 delta degrees of freedom), as given by
+  sum((x - mean(x))**2) / length(x).
+
+  Args:
+    x: A `Tensor`.
+    reduce_instance_dims: By default collapses the batch and instance dimensions
+        to arrive at a single scalar output. If False, only collapses the batch
+        dimension and outputs a vector of the same shape as the input.
+
+  Returns:
+    A `Tensor` containing the variance. If `x` is floating point, the variance
+    will have the same type as `x`. If `x` is integral, the output is cast to
+    float32 for int8 and int16 and float64 for int32 and int64 (similar to the
+    behavior of tf.truediv).
+  """
+  with tf.name_scope('var'):
+    # Note: Calling `mean`, `sum`, and `size` as defined in this module, not the
+    # builtins.
+    x_mean = mean(x, reduce_instance_dims)
+    # x_mean will be float32 or float64, depending on type of x.
+    squared_deviations = tf.square(tf.cast(x, x_mean.dtype) - x_mean)
+    return mean(squared_deviations, reduce_instance_dims)
+
+
 class UniquesSpec(object):
   """Operation to compute unique values."""