fixed learn_runner return bug, cleaned up ipynb
yufengg committed Apr 25, 2017
1 parent 4aade28 commit 6422111
Showing 2 changed files with 76 additions and 39 deletions.
5 changes: 1 addition & 4 deletions trainer/task.py
@@ -250,10 +250,7 @@ def train_and_eval(job_dir=None):
experiment_fn = generate_experiment(
model_dir, train_file, test_file, model_type)

metrics, output_folder = learn_runner.run(experiment_fn, model_dir)

print('Accuracy: {}'.format(metrics['accuracy']))
print('Model exported to {}'.format(output_folder))
learn_runner.run(experiment_fn, model_dir)


def version_is_less_than(a, b):
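The bug here: the old code assumed `learn_runner.run` returns a `(metrics, output_folder)` pair, which does not hold for this contrib API, so the unpacking failed at runtime. If the accuracy printout is still wanted, a minimal sketch under that assumption (`m`, `generate_input_fn`, `test_file`, and `eval_steps` are stand-ins for names defined elsewhere in this repo, not in the excerpt above):

    # Sketch only, not part of this commit: evaluate the estimator
    # directly rather than unpacking learn_runner.run()'s return value.
    learn_runner.run(experiment_fn, model_dir)
    results = m.evaluate(input_fn=generate_input_fn(test_file),
                         steps=eval_steps)
    print('Accuracy: {}'.format(results['accuracy']))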
110 changes: 75 additions & 35 deletions wnd_criteo.ipynb
@@ -54,7 +54,7 @@
"\n",
"tf.logging.set_verbosity(tf.logging.INFO) # Set to INFO for tracking training, default is WARN. ERROR for least messages\n",
"\n",
"print(\"Using TensorFlow version %s\" % (tf.__version__))\n",
"print(\"Using TensorFlow version %s\\n\" % (tf.__version__))\n",
"\n",
"\n",
"CONTINUOUS_COLUMNS = [\"I\"+str(i) for i in range(1,14)] # 1-13 inclusive\n",
@@ -66,15 +66,15 @@
"\n",
"FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS\n",
"\n",
"print(FEATURE_COLUMNS)\n",
"print('Feature columns are: ', FEATURE_COLUMNS, '\\n')\n",
"\n",
" # label is 1\n",
"sample = [ 0 , 2, 11, 5, 10262, 34, 2, 4, 5,0 , 1,0 , 5, \"be589b51\", \"287130e0\", \"cd7a7a22\", \"fb7334df\", \"25c83c98\",\"0\" , \"6cdb3998\", \"361384ce\", \"a73ee510\", \"3ff10fb2\", \"5874c9c9\", \"976cbd4c\", \"740c210d\", \"1adce6ef\", \"310d155b\", \"07eb8110\", \"07c540c4\", \"891589e7\", \"18259a83\", \"a458ea53\", \"a0ab60ca\",\"0\" , \"32c7478e\", \"a052b1ed\", \"9b3e8820\", \"8967c0d2\"]\n",
"\n",
"# label is 1\n",
"sample = [ 0, 127, 1, 3, 1683, 19, 26, 17, 475, 0, 9, 0, 3, \"05db9164\", \"8947f767\", \"11c9d79e\", \"52a787c8\", \"4cf72387\", \"fbad5c96\", \"18671b18\", \"0b153874\", \"a73ee510\", \"ceb10289\", \"77212bd7\", \"79507c6b\", \"7203f04e\", \"07d13a8f\", \"2c14c412\", \"49013ffe\", \"8efede7f\", \"bd17c3da\", \"f6a3e43b\", \"a458ea53\", \"35cd95c9\", \"ad3062eb\", \"c7dc6720\", \"3fdb382b\", \"010f6491\", \"49d68486\"]\n",
"\n",
"print(dict(zip(FEATURE_COLUMNS, sample)))"
"print('Columns and data as a dict: ', dict(zip(FEATURE_COLUMNS, sample)), '\\n')"
]
},
{
@@ -118,6 +118,23 @@
"You can try different values for `BATCH_SIZE` to see how they impact your results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### High-level structure of input functions for CSV-style data\n",
"1. Queue file(s)\n",
"2. Read a batch of data from the next file\n",
"3. Create record defaults, generally 0 for continuous values, and \"\" for categorical. You can use named types if you prefer\n",
"4. Decode the CSV and restructure it to be appropriate for the graph's input format\n",
" * `zip()` column headers with the data\n",
" * `pop()` off the label column(s)\n",
" * Remove/pop any unneeded column(s)\n",
" * Run `tf.expand_dims()` on categorical columns\n",
" 5. Return the pair: `(feature_dict, label_array)`\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -135,25 +152,25 @@
" # Reads out batch_size number of lines\n",
" key, value = reader.read_up_to(filename_queue, num_records=batch_size)\n",
" \n",
"# tf.Print(value)\n",
" \n",
" # 1 int label, 13 ints, 26 strings\n",
" cont_defaults = [ [0] for i in range(1,14) ]\n",
" cate_defaults = [ [\" \"] for i in range(1,27) ]\n",
" label_defaults = [ [0] ]\n",
" column_headers = TRAIN_DATA_COLUMNS\n",
" # The label is the first column of the data.\n",
" record_defaults = label_defaults + cont_defaults + cate_defaults\n",
"\n",
" # Decode CSV data that was just read out. \n",
" # Note that this does NOT return a dict, \n",
" # so we will need to zip it up with our headers\n",
" columns = tf.decode_csv(\n",
" value, record_defaults=record_defaults)\n",
" \n",
" # features is a dictionary that maps from column names to tensors of the data.\n",
" # income_bracket is the last column of the data. Note that this is NOT a dict.\n",
" # all_columns is a dictionary that maps from column names to tensors of the data.\n",
" all_columns = dict(zip(column_headers, columns))\n",
" \n",
" # Save the income_bracket column as our labels \n",
" # dict.pop() returns the popped array of income_bracket values\n",
" # Pop and save our labels \n",
" # dict.pop() returns the popped array of values; exactly what we need!\n",
" labels = all_columns.pop(LABEL_COLUMN[0])\n",
" \n",
" # the remaining columns are our features\n",
@@ -198,7 +215,8 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
@@ -212,9 +230,10 @@
"\n",
"wide_columns = []\n",
"for name in CATEGORICAL_COLUMNS:\n",
" wide_columns.append(tf.contrib.layers.sparse_column_with_hash_bucket(name, hash_bucket_size=1000))\n",
" wide_columns.append(tf.contrib.layers.sparse_column_with_hash_bucket(\n",
" name, hash_bucket_size=1000))\n",
"\n",
"print('Sparse columns configured')"
"print('Wide/Sparse columns configured')"
]
},
{
@@ -245,7 +264,7 @@
"for name in CONTINUOUS_COLUMNS:\n",
" deep_columns.append(tf.contrib.layers.real_valued_column(name))\n",
"\n",
"print('continuous columns configured')"
"print('deep/continuous columns configured')"
]
},
{
@@ -281,7 +300,8 @@
},
"outputs": [],
"source": [
"# No known Transformations. Can add some if desired. Examples from other dataset below.\n",
"# No known Transformations. Can add some if desired. \n",
"# Examples from other datasets are shown below.\n",
"\n",
"# age_buckets = tf.contrib.layers.bucketized_column(age,\n",
"# boundaries=[ 18, 25, 30, 35, 40, 45, 50, 55, 60, 65 ])\n",
@@ -292,7 +312,7 @@
"# country_occupation = tf.contrib.layers.crossed_column([native_country, occupation], \n",
"# hash_bucket_size=int(1e4))\n",
"\n",
"# print('Transformations complete')"
"print('Transformations complete')"
]
},
{
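If a transformation were wanted for this dataset, a hypothetical example in the same style (the column choices, boundaries, and bucket sizes below are illustrative, not tuned):

    # Hypothetical: bucketize one integer column and cross it with a
    # categorical column, then add both to the wide columns.
    I1 = tf.contrib.layers.real_valued_column("I1")
    I1_buckets = tf.contrib.layers.bucketized_column(
        I1, boundaries=[0, 1, 2, 5, 10, 50, 100])
    C1 = tf.contrib.layers.sparse_column_with_hash_bucket(
        "C1", hash_bucket_size=1000)
    I1_x_C1 = tf.contrib.layers.crossed_column(
        [I1_buckets, C1], hash_bucket_size=int(1e4))
    wide_columns += [I1_buckets, I1_x_C1]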
@@ -313,7 +333,8 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
@@ -342,7 +363,8 @@
"\n",
"# Embeddings for wide columns into deep columns\n",
"for col in wide_columns:\n",
" deep_columns.append(tf.contrib.layers.embedding_column(col, dimension=8))\n",
" deep_columns.append(tf.contrib.layers.embedding_column(col, \n",
" dimension=8))\n",
"\n",
"print('wide and deep columns configured')"
]
@@ -371,17 +393,20 @@
"outputs": [],
"source": [
"def create_model_dir(model_type):\n",
" # Returns something like models/model_WIDE_AND_DEEP_1493043407\n",
" return 'models/model_' + model_type + '_' + str(int(time.time()))\n",
"\n",
"# If new_model=False, pass in the desired model_dir \n",
"def get_model(model_type, new_model=False, model_dir=None):\n",
" if new_model or model_dir is None:\n",
" model_dir = create_model_dir(model_type) # Comment out this line to continue training a existing model\n",
"# Specify the desired model_dir \n",
"def get_model(model_type, model_dir):\n",
" print(\"Model directory = %s\" % model_dir)\n",
" \n",
" # There are more options here than shown here. \n",
" # We are using this to show additional checkpointing for illustrative purposes.\n",
" # In a real system with far more samples, you would \n",
" # likely choose to save checkpoints less frequently.\n",
" runconfig = tf.contrib.learn.RunConfig(\n",
" save_checkpoints_secs=120,\n",
" save_checkpoints_steps = None,\n",
" save_checkpoints_secs=None,\n",
" save_checkpoints_steps = 100,\n",
" )\n",
" \n",
" m = None\n",
@@ -397,7 +422,7 @@
" m = tf.contrib.learn.DNNClassifier(\n",
" model_dir=model_dir,\n",
" feature_columns=deep_columns,\n",
" hidden_units=[100, 50])\n",
" hidden_units=[100, 50, 25])\n",
"\n",
" # Combined Linear and Deep Classifier\n",
" if model_type == 'WIDE_AND_DEEP':\n",
@@ -410,12 +435,12 @@
" \n",
" print('estimator built')\n",
" \n",
" return m, model_dir\n",
" return m\n",
" \n",
"\n",
"MODEL_TYPE = 'WIDE_AND_DEEP'\n",
"model_dir = create_model_dir(model_type=MODEL_TYPE)\n",
"m, model_dir = get_model(model_type = 'WIDE_AND_DEEP', model_dir=model_dir)"
"m = get_model(model_type=MODEL_TYPE, model_dir=model_dir)"
]
},
{
@@ -426,6 +451,8 @@
},
"outputs": [],
"source": [
"# Showing that canned estimators return an instance of 'Evaluable'\n",
"\n",
"from tensorflow.contrib.learn.python.learn import evaluable\n",
"isinstance(m, evaluable.Evaluable)\n"
]
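`Experiment` wants an estimator that is both evaluable and trainable; a parallel check for the latter is sketched below (assuming this contrib version also exposes a `trainable` module alongside `evaluable`):

    # Sketch, mirroring the Evaluable check above.
    from tensorflow.contrib.learn.python.learn import trainable
    print(isinstance(m, trainable.Trainable))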
Expand All @@ -445,7 +472,12 @@
"If you don't want to download a new copy of the dataset each time your script runs, you can download it locally using \n",
"\n",
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/train.csv .\n",
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv ."
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv .\n",
" \n",
"If you want to download it manually, use\n",
"\n",
"- http://storageapis.google.com/dataset-uploader/criteo-kaggle/medium_version/eval.csv\n",
"- http://storageapis.google.com/dataset-uploader/criteo-kaggle/medium_version/train.csv"
]
},
{
@@ -456,11 +488,15 @@
},
"outputs": [],
"source": [
"# Use the cloud or local depending on your preference\n",
"\n",
"# CLOUD\n",
"train_file = \"gs://dataset-uploader/criteo-kaggle/medium_version/train.csv\"\n",
"eval_file = \"gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv\"\n",
"\n",
"# train_file = \"data_files/medium_version/train.csv\" # \"gs://dataset-uploader/criteo-kaggle/medium_version/train.csv\"\n",
"# eval_file = \"data_files/medium_version/eval.csv\" # \"gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv\"\n"
"# LOCAL. Update these paths as appropriate\n",
"train_file = \"data_files/medium_version/train.csv\"\n",
"eval_file = \"data_files/medium_version/eval.csv\"\n"
]
},
{
@@ -474,6 +510,8 @@
"source": [
"%%time\n",
"\n",
"# This can be found with\n",
"# wc -l train.csv\n",
"train_sample_size = 800000\n",
"train_steps = train_sample_size/BATCH_SIZE # 8000/40 = 200\n",
"\n",
@@ -501,10 +539,11 @@
"source": [
"%%time\n",
"\n",
"eval_sample_size = 200000 # this can be found with a 'wc -l'\n",
"eval_sample_size = 200000 # this can be found with a 'wc -l eval.csv'\n",
"eval_steps = eval_sample_size/BATCH_SIZE # 2000/40 = 50\n",
"\n",
"results = m.evaluate(input_fn=generate_input_fn(eval_file), steps=eval_steps)\n",
"results = m.evaluate(input_fn=generate_input_fn(eval_file), \n",
" steps=eval_steps)\n",
"print('evaluate done')\n",
"\n",
"print('Accuracy: %s' % results['accuracy'])\n",
@@ -519,7 +558,6 @@
},
"outputs": [],
"source": [
"\n",
"def pred_fn():\n",
" sample = [ 0, 127, 1, 3, 1683, 19, 26, 17, 475, 0, 9, 0, 3, \"05db9164\", \"8947f767\", \"11c9d79e\", \"52a787c8\", \"4cf72387\", \"fbad5c96\", \"18671b18\", \"0b153874\", \"a73ee510\", \"ceb10289\", \"77212bd7\", \"79507c6b\", \"7203f04e\", \"07d13a8f\", \"2c14c412\", \"49013ffe\", \"8efede7f\", \"bd17c3da\", \"f6a3e43b\", \"a458ea53\", \"35cd95c9\", \"ad3062eb\", \"c7dc6720\", \"3fdb382b\", \"010f6491\", \"49d68486\"]\n",
" sample_dict = dict(zip(FEATURE_COLUMNS, sample))\n",
@@ -592,6 +630,7 @@
},
"outputs": [],
"source": [
"# Manually export\n",
"export_folder = m.export_savedmodel(\n",
" export_dir_base = model_dir + '/export',\n",
" input_fn=serving_input_fn\n",
@@ -631,7 +670,7 @@
" \n",
" train_input_fn = generate_input_fn(train_file, BATCH_SIZE)\n",
" eval_input_fn = generate_input_fn(eval_file)\n",
" my_model, model_dir = get_model(model_type=MODEL_TYPE, \n",
" my_model = get_model(model_type=MODEL_TYPE, \n",
" model_dir=output_dir)\n",
"\n",
" experiment = tf.contrib.learn.Experiment(\n",
@@ -658,6 +697,7 @@
},
"outputs": [],
"source": [
"# manually train and eval\n",
"%%time\n",
"\n",
"exp = experiment_fn(model_dir)\n",
@@ -689,11 +729,11 @@
"source": [
"# Conclusions\n",
"\n",
"In this Juypter notebook, we have configured, created, and evaluated a Wide & Deep machine learning model, that combines the powers of a Linear Classifier with a Deep Neural Network, using TensorFlow's tf.learn module.\n",
"In this Juypter notebook, we have configured, created, and evaluated a Wide & Deep machine learning model, that combines the powers of a Linear Classifier with a Deep Neural Network, using TensorFlow's Estimator and Experiment classes.\n",
"\n",
"With this working example in your toolbelt, you are ready to explore the wide (and deep) world of machine learning with TensorFlow! Some ideas to help you get going:\n",
"* Change the features we used today. Which columns do you think are correlated and should be crossed? Which ones do you think are just adding noise and could be removed to clean up the model?\n",
"* Swap in an entirely new dataset! There are many dataset available on the web, or use a dataset you possess! Check out https://archive.ics.uci.edu/ml to find your own dataset. "
"* Swap in an entirely new dataset! There are many datasets available on the web, or use a dataset you possess! Check out https://archive.ics.uci.edu/ml to find your own dataset. "
]
}
],