From 52b65c9f4a8615eab680b9c8b92c7f6aee28e1ba Mon Sep 17 00:00:00 2001 From: galderic Date: Fri, 20 Apr 2018 13:17:17 +0200 Subject: [PATCH 1/3] added new regression example --- examples/mpg_example.py | 298 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 examples/mpg_example.py diff --git a/examples/mpg_example.py b/examples/mpg_example.py new file mode 100644 index 00000000..ce9e4641 --- /dev/null +++ b/examples/mpg_example.py @@ -0,0 +1,298 @@ +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example using auto-mpg data from UCI repository.""" + +# pylint: disable=g-bad-import-order +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import tempfile + +import tensorflow as tf +import tensorflow_transform as tft +from apache_beam.io import textio +from tensorflow.contrib.learn.python.learn.utils import input_fn_utils + +from tensorflow_transform.beam import impl as beam_impl +from tensorflow_transform.beam.tft_beam_io import transform_fn_io +from tensorflow_transform.coders import csv_coder +from tensorflow_transform.saved import saved_transform_io +from tensorflow_transform.tf_metadata import dataset_metadata +from tensorflow_transform.tf_metadata import dataset_schema + +import apache_beam as beam + +# to download and prepare the data: +# curl https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data|grep -v "?"|sed -E -e 's/[[:blank:]]{2,}/,/g'|sed -E -e $'s/\t/,/g' | head -n340 > auto-mpg.csv +# curl https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data|grep -v "?"|sed -E -e 's/[[:blank:]]{2,}/,/g'|sed -E -e $'s/\t/,/g' | tail -n50 > auto-mpg-test.csv + +ordered_columns = [ + 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name' +] + +CATEGORICAL_FEATURE_KEYS = [ + 'cylinders', 'year', 'name', 'origin' +] + +NUMERIC_FEATURE_KEYS = [ + 'displacement', 'horsepower', 'weight', 'acceleration' +] + +LABEL_KEY = 'mpg' + + +def _create_raw_metadata(): + """Create a DatasetMetadata for the raw data.""" + column_schemas = { + key: dataset_schema.ColumnSchema( + tf.string, [], dataset_schema.FixedColumnRepresentation()) + for key in CATEGORICAL_FEATURE_KEYS + } + column_schemas.update({ + key: dataset_schema.ColumnSchema( + tf.float32, [], dataset_schema.FixedColumnRepresentation()) + for key in NUMERIC_FEATURE_KEYS + }) + column_schemas[LABEL_KEY] = dataset_schema.ColumnSchema( + tf.float32, [], dataset_schema.FixedColumnRepresentation()) + raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema( + column_schemas)) + return raw_data_metadata + + +RAW_DATA_METADATA = _create_raw_metadata() + +# Constants used for training. Note that the number of instances will be +# computed by tf.Transform in future versions, in which case it can be read from +# the metadata. 
Similarly BUCKET_SIZES will not be needed as this information +# will be stored in the metadata for each of the columns. The bucket size +# includes all listed categories in the dataset description as well as one extra +# for "?" which represents unknown. +BATCH_SIZE = 5 +TRAIN_NUM_EPOCHS = 20 +NUM_TRAIN_INSTANCES = 340 +NUM_TEST_INSTANCES = 50 +BUCKET_SIZES = [5, 12, 1024, 3] + +EXPORTED_MODEL_DIR = 'exported_model_dir' + + +def create_transform_fn(train_data_file, working_dir): + """Create a transform function that can be run on-the-fly while training + + Read in the data using the CSV reader, and transform it using a + preprocessing pipeline that scales numeric data and converts categorical data + from strings to int64 values indices, by creating a vocabulary for each + category. + + Args: + train_data_file: File containing training data + working_dir: Directory to write transformed data and metadata to + """ + + def preprocessing_fn(inputs): + """Preprocess input columns into transformed columns.""" + outputs = {} + + # Scale numeric columns to have range [0, 1]. + for key in NUMERIC_FEATURE_KEYS: + outputs[key] = tft.scale_to_0_1(inputs[key]) + + # For all categorical columns except the label column, we use + # tft.string_to_int which computes the set of unique values and uses this + # to convert the strings to indices. + for key in CATEGORICAL_FEATURE_KEYS: + outputs[key] = tft.string_to_int(inputs[key]) + + # For the label column we provide the mapping from string to index. + outputs[LABEL_KEY] = inputs[LABEL_KEY] + + return outputs + + # The "with" block will create a pipeline, and run that pipeline at the exit + # of the block. + with beam.Pipeline() as pipeline: + with beam_impl.Context(temp_dir=tempfile.mkdtemp()): + # Create a coder to read the mpg data with the schema. To do this we + # need to list all columns in order since the schema doesn't specify the + # order of columns in the csv. + converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema) + + # Read in raw data and convert using CSV converter. Note that we apply + # some Beam transformations here, which will not be encoded in the TF + # graph since we don't do the from within tf.Transform's methods + # (AnalyzeDataset, TransformDataset etc.). These transformations are just + # to get data into a format that the CSV converter can read, in particular + # removing empty lines and removing spaces after commas. + raw_data = ( + pipeline + | 'ReadTrainData' >> textio.ReadFromText(train_data_file) + | 'FilterTrainData' >> beam.Filter(lambda line: line) + | 'FixCommasTrainData' >> beam.Map( + lambda line: line.replace(', ', ',')) + | 'DecodeTrainData' >> beam.Map(converter.decode)) + + # Combine data and schema into a dataset tuple. Note that we already used + # the schema to read the CSV data, but we also need it to interpret + # raw_data. + raw_dataset = (raw_data, RAW_DATA_METADATA) + transformed_dataset, transform_fn = ( + raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) + transformed_data, transformed_metadata = transformed_dataset + + # Will write a SavedModel and metadata to two subdirectories of + # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and + # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. 
+ _ = ( + transform_fn + | 'WriteTransformFn' >> + transform_fn_io.WriteTransformFn(working_dir)) + + +def file_decode_csv(line): + columns_default_values = [[0.0], ["4"], [0.0], [0.0], [0.0], [0.0], ["70"], ["1"], [""]] + + parsed_line = tf.decode_csv(line, columns_default_values) + features = parsed_line + + d = dict(zip(ordered_columns, features)) + + label = d[LABEL_KEY] + del d[LABEL_KEY] + + return d, label + + +def _make_training_input_fn(working_dir, csv_file, batch_size): + dataset = (tf.data.TextLineDataset(csv_file, buffer_size=8 * 1048576)) + + dataset = dataset.shuffle(NUM_TRAIN_INSTANCES) + dataset = dataset.apply(tf.contrib.data.map_and_batch(file_decode_csv, batch_size, num_parallel_batches=4)) + dataset = dataset.prefetch(4) + + raw_features, raw_label = dataset.make_one_shot_iterator().get_next() + + _, transformed_features = saved_transform_io.partially_apply_saved_transform( + os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), raw_features) + return transformed_features, raw_label + + +def _make_serving_input_fn(working_dir): + """Creates an input function reading from raw data. + + Args: + working_dir: Directory to read transformed metadata from. + + Returns: + The serving input function. + """ + raw_feature_spec = RAW_DATA_METADATA.schema.as_feature_spec() + # Remove label since it is not available during serving. + raw_feature_spec.pop(LABEL_KEY) + + def serving_input_fn(): + """Input function for serving.""" + # Get raw features by generating the basic serving input_fn and calling it. + # Here we generate an input_fn that expects a parsed Example proto to be fed + # to the model at serving time. See also + # input_fn_utils.build_default_serving_input_fn. + raw_input_fn = input_fn_utils.build_parsing_serving_input_fn( + raw_feature_spec) + raw_features, _, default_inputs = raw_input_fn() + + # Apply the transform function that was used to generate the materialized + # data. + _, transformed_features = ( + saved_transform_io.partially_apply_saved_transform( + os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), + raw_features)) + + return tf.estimator.export.ServingInputReceiver(transformed_features, default_inputs) + + return serving_input_fn + + +def train_and_evaluate(working_dir, num_train_instances=NUM_TRAIN_INSTANCES, + num_test_instances=NUM_TEST_INSTANCES): + """Train the model on training data and evaluate on eval data. + + Args: + working_dir: Directory to read transformed data and metadata from and to + write exported model to. 
+ num_train_instances: Number of instances in train set + num_test_instances: Number of instances in test set + + Returns: + """ + + one_hot_columns = [ + tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_identity(key=key, num_buckets=num_buckets)) + for key, num_buckets in zip(CATEGORICAL_FEATURE_KEYS, BUCKET_SIZES)] + + real_valued_columns = [tf.feature_column.numeric_column(key, shape=()) + for key in NUMERIC_FEATURE_KEYS] + + estimator = tf.estimator.DNNRegressor( + feature_columns=real_valued_columns + one_hot_columns, + model_dir=os.path.join(working_dir, "logs_directory"), + optimizer=tf.train.AdamOptimizer(), + hidden_units=[10, 5]) + + train_spec = tf.estimator.TrainSpec( + input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg.csv", BATCH_SIZE), + max_steps=TRAIN_NUM_EPOCHS * num_train_instances / BATCH_SIZE) + + eval_spec = tf.estimator.EvalSpec( + input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg-test.csv", BATCH_SIZE), + throttle_secs=10, steps=num_test_instances / BATCH_SIZE) + + tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) + + # Export the model. + serving_input_fn = _make_serving_input_fn(working_dir) + exported_model_dir = os.path.join(working_dir, EXPORTED_MODEL_DIR) + estimator.export_savedmodel(exported_model_dir, serving_input_fn) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_data_dir', + help='path to directory containing input data') + parser.add_argument( + '--working_dir', + help='optional, path to directory to hold transformed data') + args = parser.parse_args() + + if args.working_dir: + working_dir = args.working_dir + else: + working_dir = tempfile.mkdtemp(dir=args.input_data_dir) + + train_data_file = os.path.join(args.input_data_dir, 'auto-mpg.csv') + + # Will write a SavedModel and metadata to two subdirectories of + # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and + # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. 
+ create_transform_fn(train_data_file, working_dir) + + # will transform features on the fly using the transform_fn created above + train_and_evaluate(working_dir) + +if __name__ == '__main__': + main() From 029394e156c6fb973da592afcdecc4a01513ee7d Mon Sep 17 00:00:00 2001 From: galderic Date: Tue, 24 Apr 2018 11:40:32 +0200 Subject: [PATCH 2/3] fixed indent and line wrap --- examples/mpg_example.py | 371 ++++++++++++++++++++-------------------- 1 file changed, 190 insertions(+), 181 deletions(-) diff --git a/examples/mpg_example.py b/examples/mpg_example.py index ce9e4641..846c9643 100644 --- a/examples/mpg_example.py +++ b/examples/mpg_example.py @@ -41,7 +41,8 @@ # curl https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data|grep -v "?"|sed -E -e 's/[[:blank:]]{2,}/,/g'|sed -E -e $'s/\t/,/g' | tail -n50 > auto-mpg-test.csv ordered_columns = [ - 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name' + 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', + 'year', 'origin', 'name' ] CATEGORICAL_FEATURE_KEYS = [ @@ -56,22 +57,22 @@ def _create_raw_metadata(): - """Create a DatasetMetadata for the raw data.""" - column_schemas = { - key: dataset_schema.ColumnSchema( - tf.string, [], dataset_schema.FixedColumnRepresentation()) - for key in CATEGORICAL_FEATURE_KEYS - } - column_schemas.update({ - key: dataset_schema.ColumnSchema( - tf.float32, [], dataset_schema.FixedColumnRepresentation()) - for key in NUMERIC_FEATURE_KEYS - }) - column_schemas[LABEL_KEY] = dataset_schema.ColumnSchema( - tf.float32, [], dataset_schema.FixedColumnRepresentation()) - raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema( - column_schemas)) - return raw_data_metadata + """Create a DatasetMetadata for the raw data.""" + column_schemas = { + key: dataset_schema.ColumnSchema( + tf.string, [], dataset_schema.FixedColumnRepresentation()) + for key in CATEGORICAL_FEATURE_KEYS + } + column_schemas.update({ + key: dataset_schema.ColumnSchema( + tf.float32, [], dataset_schema.FixedColumnRepresentation()) + for key in NUMERIC_FEATURE_KEYS + }) + column_schemas[LABEL_KEY] = dataset_schema.ColumnSchema( + tf.float32, [], dataset_schema.FixedColumnRepresentation()) + raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema( + column_schemas)) + return raw_data_metadata RAW_DATA_METADATA = _create_raw_metadata() @@ -92,207 +93,215 @@ def _create_raw_metadata(): def create_transform_fn(train_data_file, working_dir): - """Create a transform function that can be run on-the-fly while training - - Read in the data using the CSV reader, and transform it using a - preprocessing pipeline that scales numeric data and converts categorical data - from strings to int64 values indices, by creating a vocabulary for each - category. - - Args: - train_data_file: File containing training data - working_dir: Directory to write transformed data and metadata to - """ - - def preprocessing_fn(inputs): - """Preprocess input columns into transformed columns.""" - outputs = {} - - # Scale numeric columns to have range [0, 1]. - for key in NUMERIC_FEATURE_KEYS: - outputs[key] = tft.scale_to_0_1(inputs[key]) - - # For all categorical columns except the label column, we use - # tft.string_to_int which computes the set of unique values and uses this - # to convert the strings to indices. 
- for key in CATEGORICAL_FEATURE_KEYS: - outputs[key] = tft.string_to_int(inputs[key]) - - # For the label column we provide the mapping from string to index. - outputs[LABEL_KEY] = inputs[LABEL_KEY] - - return outputs - - # The "with" block will create a pipeline, and run that pipeline at the exit - # of the block. - with beam.Pipeline() as pipeline: - with beam_impl.Context(temp_dir=tempfile.mkdtemp()): - # Create a coder to read the mpg data with the schema. To do this we - # need to list all columns in order since the schema doesn't specify the - # order of columns in the csv. - converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema) - - # Read in raw data and convert using CSV converter. Note that we apply - # some Beam transformations here, which will not be encoded in the TF - # graph since we don't do the from within tf.Transform's methods - # (AnalyzeDataset, TransformDataset etc.). These transformations are just - # to get data into a format that the CSV converter can read, in particular - # removing empty lines and removing spaces after commas. - raw_data = ( - pipeline - | 'ReadTrainData' >> textio.ReadFromText(train_data_file) - | 'FilterTrainData' >> beam.Filter(lambda line: line) - | 'FixCommasTrainData' >> beam.Map( - lambda line: line.replace(', ', ',')) - | 'DecodeTrainData' >> beam.Map(converter.decode)) - - # Combine data and schema into a dataset tuple. Note that we already used - # the schema to read the CSV data, but we also need it to interpret - # raw_data. - raw_dataset = (raw_data, RAW_DATA_METADATA) - transformed_dataset, transform_fn = ( - raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) - transformed_data, transformed_metadata = transformed_dataset - - # Will write a SavedModel and metadata to two subdirectories of - # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and - # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. - _ = ( - transform_fn - | 'WriteTransformFn' >> - transform_fn_io.WriteTransformFn(working_dir)) + """Create a transform function that can be run on-the-fly while training + + Read in the data using the CSV reader, and transform it using a + preprocessing pipeline that scales numeric data and converts categorical data + from strings to int64 values indices, by creating a vocabulary for each + category. + + Args: + train_data_file: File containing training data + working_dir: Directory to write transformed data and metadata to + """ + + def preprocessing_fn(inputs): + """Preprocess input columns into transformed columns.""" + outputs = {} + + # Scale numeric columns to have range [0, 1]. + for key in NUMERIC_FEATURE_KEYS: + outputs[key] = tft.scale_to_0_1(inputs[key]) + + # For all categorical columns except the label column, we use + # tft.string_to_int which computes the set of unique values and uses this + # to convert the strings to indices. + for key in CATEGORICAL_FEATURE_KEYS: + outputs[key] = tft.string_to_int(inputs[key]) + + # For the label column we provide the mapping from string to index. + outputs[LABEL_KEY] = inputs[LABEL_KEY] + + return outputs + + # The "with" block will create a pipeline, and run that pipeline at the exit + # of the block. + with beam.Pipeline() as pipeline: + with beam_impl.Context(temp_dir=tempfile.mkdtemp()): + # Create a coder to read the mpg data with the schema. To do this we + # need to list all columns in order since the schema doesn't specify the + # order of columns in the csv. 
+ converter = csv_coder.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema) + + # Read in raw data and convert using CSV converter. Note that we apply + # some Beam transformations here, which will not be encoded in the TF + # graph since we don't do the from within tf.Transform's methods + # (AnalyzeDataset, TransformDataset etc.). These transformations are just + # to get data into a format that the CSV converter can read, in particular + # removing empty lines and removing spaces after commas. + raw_data = ( + pipeline + | 'ReadTrainData' >> textio.ReadFromText(train_data_file) + | 'FilterTrainData' >> beam.Filter(lambda line: line) + | 'FixCommasTrainData' >> beam.Map( + lambda line: line.replace(', ', ',')) + | 'DecodeTrainData' >> beam.Map(converter.decode)) + + # Combine data and schema into a dataset tuple. Note that we already used + # the schema to read the CSV data, but we also need it to interpret + # raw_data. + raw_dataset = (raw_data, RAW_DATA_METADATA) + transformed_dataset, transform_fn = ( + raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) + transformed_data, transformed_metadata = transformed_dataset + + # Will write a SavedModel and metadata to two subdirectories of + # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and + # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. + _ = ( + transform_fn + | 'WriteTransformFn' >> + transform_fn_io.WriteTransformFn(working_dir)) def file_decode_csv(line): - columns_default_values = [[0.0], ["4"], [0.0], [0.0], [0.0], [0.0], ["70"], ["1"], [""]] + columns_default_values = [[0.0], ["4"], [0.0], [0.0], [0.0], [0.0], ["70"], + ["1"], [""]] - parsed_line = tf.decode_csv(line, columns_default_values) - features = parsed_line + parsed_line = tf.decode_csv(line, columns_default_values) + features = parsed_line - d = dict(zip(ordered_columns, features)) + d = dict(zip(ordered_columns, features)) - label = d[LABEL_KEY] - del d[LABEL_KEY] + label = d[LABEL_KEY] + del d[LABEL_KEY] - return d, label + return d, label def _make_training_input_fn(working_dir, csv_file, batch_size): - dataset = (tf.data.TextLineDataset(csv_file, buffer_size=8 * 1048576)) + dataset = (tf.data.TextLineDataset(csv_file, buffer_size=8 * 1048576)) - dataset = dataset.shuffle(NUM_TRAIN_INSTANCES) - dataset = dataset.apply(tf.contrib.data.map_and_batch(file_decode_csv, batch_size, num_parallel_batches=4)) - dataset = dataset.prefetch(4) + dataset = dataset.shuffle(NUM_TRAIN_INSTANCES) + dataset = dataset.apply( + tf.contrib.data.map_and_batch(file_decode_csv, batch_size, + num_parallel_batches=4)) + dataset = dataset.prefetch(4) - raw_features, raw_label = dataset.make_one_shot_iterator().get_next() + raw_features, raw_label = dataset.make_one_shot_iterator().get_next() - _, transformed_features = saved_transform_io.partially_apply_saved_transform( - os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), raw_features) - return transformed_features, raw_label + _, transformed_features = saved_transform_io.partially_apply_saved_transform( + os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), raw_features) + return transformed_features, raw_label def _make_serving_input_fn(working_dir): - """Creates an input function reading from raw data. + """Creates an input function reading from raw data. - Args: - working_dir: Directory to read transformed metadata from. + Args: + working_dir: Directory to read transformed metadata from. - Returns: - The serving input function. 
- """ - raw_feature_spec = RAW_DATA_METADATA.schema.as_feature_spec() - # Remove label since it is not available during serving. - raw_feature_spec.pop(LABEL_KEY) + Returns: + The serving input function. + """ + raw_feature_spec = RAW_DATA_METADATA.schema.as_feature_spec() + # Remove label since it is not available during serving. + raw_feature_spec.pop(LABEL_KEY) - def serving_input_fn(): - """Input function for serving.""" - # Get raw features by generating the basic serving input_fn and calling it. - # Here we generate an input_fn that expects a parsed Example proto to be fed - # to the model at serving time. See also - # input_fn_utils.build_default_serving_input_fn. - raw_input_fn = input_fn_utils.build_parsing_serving_input_fn( - raw_feature_spec) - raw_features, _, default_inputs = raw_input_fn() + def serving_input_fn(): + """Input function for serving.""" + # Get raw features by generating the basic serving input_fn and calling it. + # Here we generate an input_fn that expects a parsed Example proto to be fed + # to the model at serving time. See also + # input_fn_utils.build_default_serving_input_fn. + raw_input_fn = input_fn_utils.build_parsing_serving_input_fn( + raw_feature_spec) + raw_features, _, default_inputs = raw_input_fn() - # Apply the transform function that was used to generate the materialized - # data. - _, transformed_features = ( - saved_transform_io.partially_apply_saved_transform( - os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), - raw_features)) + # Apply the transform function that was used to generate the materialized + # data. + _, transformed_features = ( + saved_transform_io.partially_apply_saved_transform( + os.path.join(working_dir, transform_fn_io.TRANSFORM_FN_DIR), + raw_features)) - return tf.estimator.export.ServingInputReceiver(transformed_features, default_inputs) + return tf.estimator.export.ServingInputReceiver(transformed_features, + default_inputs) - return serving_input_fn + return serving_input_fn def train_and_evaluate(working_dir, num_train_instances=NUM_TRAIN_INSTANCES, num_test_instances=NUM_TEST_INSTANCES): - """Train the model on training data and evaluate on eval data. + """Train the model on training data and evaluate on eval data. - Args: - working_dir: Directory to read transformed data and metadata from and to - write exported model to. - num_train_instances: Number of instances in train set - num_test_instances: Number of instances in test set + Args: + working_dir: Directory to read transformed data and metadata from and to + write exported model to. 
+ num_train_instances: Number of instances in train set + num_test_instances: Number of instances in test set - Returns: - """ + Returns: + """ - one_hot_columns = [ - tf.feature_column.indicator_column( - tf.feature_column.categorical_column_with_identity(key=key, num_buckets=num_buckets)) - for key, num_buckets in zip(CATEGORICAL_FEATURE_KEYS, BUCKET_SIZES)] + one_hot_columns = [ + tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_identity(key=key, + num_buckets=num_buckets)) + for key, num_buckets in zip(CATEGORICAL_FEATURE_KEYS, BUCKET_SIZES)] - real_valued_columns = [tf.feature_column.numeric_column(key, shape=()) - for key in NUMERIC_FEATURE_KEYS] + real_valued_columns = [tf.feature_column.numeric_column(key, shape=()) + for key in NUMERIC_FEATURE_KEYS] - estimator = tf.estimator.DNNRegressor( - feature_columns=real_valued_columns + one_hot_columns, - model_dir=os.path.join(working_dir, "logs_directory"), - optimizer=tf.train.AdamOptimizer(), - hidden_units=[10, 5]) + estimator = tf.estimator.DNNRegressor( + feature_columns=real_valued_columns + one_hot_columns, + model_dir=os.path.join(working_dir, "logs_directory"), + optimizer=tf.train.AdamOptimizer(), + hidden_units=[10, 5]) - train_spec = tf.estimator.TrainSpec( - input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg.csv", BATCH_SIZE), - max_steps=TRAIN_NUM_EPOCHS * num_train_instances / BATCH_SIZE) + train_spec = tf.estimator.TrainSpec( + input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg.csv", + BATCH_SIZE), + max_steps=TRAIN_NUM_EPOCHS * num_train_instances / BATCH_SIZE) - eval_spec = tf.estimator.EvalSpec( - input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg-test.csv", BATCH_SIZE), - throttle_secs=10, steps=num_test_instances / BATCH_SIZE) + eval_spec = tf.estimator.EvalSpec( + input_fn=lambda: _make_training_input_fn(working_dir, "auto-mpg-test.csv", + BATCH_SIZE), + throttle_secs=10, steps=num_test_instances / BATCH_SIZE) - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) + tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - # Export the model. - serving_input_fn = _make_serving_input_fn(working_dir) - exported_model_dir = os.path.join(working_dir, EXPORTED_MODEL_DIR) - estimator.export_savedmodel(exported_model_dir, serving_input_fn) + # Export the model. + serving_input_fn = _make_serving_input_fn(working_dir) + exported_model_dir = os.path.join(working_dir, EXPORTED_MODEL_DIR) + estimator.export_savedmodel(exported_model_dir, serving_input_fn) def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - 'input_data_dir', - help='path to directory containing input data') - parser.add_argument( - '--working_dir', - help='optional, path to directory to hold transformed data') - args = parser.parse_args() - - if args.working_dir: - working_dir = args.working_dir - else: - working_dir = tempfile.mkdtemp(dir=args.input_data_dir) - - train_data_file = os.path.join(args.input_data_dir, 'auto-mpg.csv') - - # Will write a SavedModel and metadata to two subdirectories of - # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and - # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. 
- create_transform_fn(train_data_file, working_dir) - - # will transform features on the fly using the transform_fn created above - train_and_evaluate(working_dir) + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_data_dir', + help='path to directory containing input data') + parser.add_argument( + '--working_dir', + help='optional, path to directory to hold transformed data') + args = parser.parse_args() + + if args.working_dir: + working_dir = args.working_dir + else: + working_dir = tempfile.mkdtemp(dir=args.input_data_dir) + + train_data_file = os.path.join(args.input_data_dir, 'auto-mpg.csv') + + # Will write a SavedModel and metadata to two subdirectories of + # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and + # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. + create_transform_fn(train_data_file, working_dir) + + # will transform features on the fly using the transform_fn created above + train_and_evaluate(working_dir) + if __name__ == '__main__': - main() + main() From 93d635282c4dd05dd3ef1c74cb217305e41f82ac Mon Sep 17 00:00:00 2001 From: galderic Date: Tue, 24 Apr 2018 11:41:44 +0200 Subject: [PATCH 3/3] automatic defaults creation --- examples/mpg_example.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/mpg_example.py b/examples/mpg_example.py index 846c9643..500ecf32 100644 --- a/examples/mpg_example.py +++ b/examples/mpg_example.py @@ -165,8 +165,9 @@ def preprocessing_fn(inputs): def file_decode_csv(line): - columns_default_values = [[0.0], ["4"], [0.0], [0.0], [0.0], [0.0], ["70"], - ["1"], [""]] + columns_default_values = [ + [0.0] if key in NUMERIC_FEATURE_KEYS or key == LABEL_KEY else [''] for key + in ordered_columns] parsed_line = tf.decode_csv(line, columns_default_values) features = parsed_line
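
A few standalone notes on the example introduced by this series follow; the sketches are not part of the patches themselves.

First, the data preparation. The curl/sed pipeline quoted near the top of mpg_example.py downloads auto-mpg.data, drops the rows with missing values (marked "?"), turns the blank- and tab-separated fields into a comma-separated file, and splits the result into 340 training rows and 50 test rows. The sketch below mirrors those commands in plain Python, assuming the same URL, file names and 340/50 split:

import re

try:
  from urllib.request import urlopen  # Python 3
except ImportError:
  from urllib2 import urlopen  # Python 2

DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'auto-mpg/auto-mpg.data')


def prepare_data(train_path='auto-mpg.csv', test_path='auto-mpg-test.csv'):
  # Standalone sketch, not part of mpg_example.py: reproduce the curl/sed
  # preparation quoted in the example's header comment.
  lines = urlopen(DATA_URL).read().decode('utf-8').splitlines()
  rows = []
  for line in lines:
    if not line or '?' in line:
      continue  # skip blank lines and rows with missing values
    # As in the sed commands: runs of two or more blanks become a comma, then
    # the remaining tab before the quoted car name becomes a comma as well.
    rows.append(re.sub(r'[ \t]{2,}', ',', line).replace('\t', ','))
  with open(train_path, 'w') as f:
    f.write('\n'.join(rows[:340]) + '\n')  # head -n340
  with open(test_path, 'w') as f:
    f.write('\n'.join(rows[-50:]) + '\n')  # tail -n50


if __name__ == '__main__':
  prepare_data()

Note that main() reads the training CSV from input_data_dir for the Beam pass, while _make_training_input_fn opens 'auto-mpg.csv' and 'auto-mpg-test.csv' relative to the current working directory, so the simplest setup is to run "python mpg_example.py ." from the directory that holds both files.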
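
Second, the PATCH 3 change. It replaces the hard-coded tf.decode_csv defaults with a comprehension driven by the column lists, which also swaps the earlier non-empty categorical defaults ("4", "70" and "1") for empty strings. Two details are worth spelling out: tf.decode_csv pairs defaults with CSV fields purely by position, so the comprehension has to iterate over ordered_columns, and the defaults also fix the dtypes the fields are parsed to, keeping them consistent with RAW_DATA_METADATA (tf.string for the categorical columns, tf.float32 for the label and numeric columns). The following sketch, runnable without TensorFlow, shows what the comprehension evaluates to:

# Standalone sketch of the PATCH 3 comprehension, evaluated without TensorFlow.
ordered_columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
    'year', 'origin', 'name'
]
NUMERIC_FEATURE_KEYS = ['displacement', 'horsepower', 'weight', 'acceleration']
LABEL_KEY = 'mpg'

columns_default_values = [
    [0.0] if key in NUMERIC_FEATURE_KEYS or key == LABEL_KEY else ['']
    for key in ordered_columns
]

# Prints [[0.0], [''], [0.0], [0.0], [0.0], [0.0], [''], [''], ['']]:
# float defaults for mpg and the numeric features, string defaults for
# cylinders, year, origin and name, in CSV column order.
print(columns_default_values)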
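
Third, the bucket sizes. BUCKET_SIZES pairs with CATEGORICAL_FEATURE_KEYS in order (cylinders, year, name, origin), and the constants comment notes that each size covers the listed categories plus one extra bucket for unknown values. Since tft.string_to_int assigns each column integer indices starting at 0 and categorical_column_with_identity expects ids in [0, num_buckets), a quick count of the distinct values in the prepared CSV makes it easy to verify the sizes when adapting the example to other data. The helper below is a hypothetical sketch for that check, not part of the patches:

import csv

ordered_columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
    'year', 'origin', 'name'
]
CATEGORICAL_FEATURE_KEYS = ['cylinders', 'year', 'name', 'origin']
BUCKET_SIZES = [5, 12, 1024, 3]


def report_vocab_sizes(csv_path='auto-mpg.csv'):
  # Count distinct values per categorical column and print them next to the
  # bucket size the example would use for that column.
  distinct = {key: set() for key in CATEGORICAL_FEATURE_KEYS}
  with open(csv_path) as f:
    for row in csv.DictReader(f, fieldnames=ordered_columns):
      for key in CATEGORICAL_FEATURE_KEYS:
        distinct[key].add(row[key].strip())
  for key, bucket_size in zip(CATEGORICAL_FEATURE_KEYS, BUCKET_SIZES):
    print('%s: %d distinct values, bucket size %d'
          % (key, len(distinct[key]), bucket_size))


if __name__ == '__main__':
  report_vocab_sizes()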
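
Finally, the training schedule. Because the module enables true division, the step counts handed to TrainSpec and EvalSpec (TRAIN_NUM_EPOCHS * num_train_instances / BATCH_SIZE and num_test_instances / BATCH_SIZE) evaluate to floats; floor division keeps them as plain integers. Note also that the tf.data pipeline in _make_training_input_fn is never repeated, so the one-shot iterator is exhausted after a single pass over the data and training stops there even though max_steps allows for more; if the intent is to train for TRAIN_NUM_EPOCHS passes, the usual pattern is to add dataset.repeat(TRAIN_NUM_EPOCHS) before the map_and_batch step. The step arithmetic itself, as a small sketch:

from __future__ import division

BATCH_SIZE = 5
TRAIN_NUM_EPOCHS = 20
NUM_TRAIN_INSTANCES = 340
NUM_TEST_INSTANCES = 50

# With true division the example's expressions give 1360.0 and 10.0; floor
# division keeps the step counts integral.
max_steps = TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES // BATCH_SIZE  # 1360
eval_steps = NUM_TEST_INSTANCES // BATCH_SIZE  # 10
print(max_steps, eval_steps)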