fixed learn_runner return bug, cleaned up ipynb
yufengg committed Apr 25, 2017
1 parent 4aade28 commit 6422111
Showing 2 changed files with 76 additions and 39 deletions.
5 changes: 1 addition & 4 deletions trainer/task.py
@@ -250,10 +250,7 @@ def train_and_eval(job_dir=None):
experiment_fn = generate_experiment(
model_dir, train_file, test_file, model_type)

metrics, output_folder = learn_runner.run(experiment_fn, model_dir)

print('Accuracy: {}'.format(metrics['accuracy']))
print('Model exported to {}'.format(output_folder))
learn_runner.run(experiment_fn, model_dir)


def version_is_less_than(a, b):
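The bug here: the old code assumed `learn_runner.run` returns a `(metrics, output_folder)` pair, which does not hold for this contrib API, so the unpacking failed at runtime. If the accuracy printout is still wanted, a minimal sketch under that assumption (`m`, `generate_input_fn`, `test_file`, and `eval_steps` are stand-ins for names defined elsewhere in this repo, not in the excerpt above):

    # Sketch only, not part of this commit: evaluate the estimator
    # directly rather than unpacking learn_runner.run()'s return value.
    learn_runner.run(experiment_fn, model_dir)
    results = m.evaluate(input_fn=generate_input_fn(test_file),
                         steps=eval_steps)
    print('Accuracy: {}'.format(results['accuracy']))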
110 changes: 75 additions & 35 deletions wnd_criteo.ipynb
@@ -54,7 +54,7 @@
"\n",
"tf.logging.set_verbosity(tf.logging.INFO) # Set to INFO for tracking training, default is WARN. ERROR for least messages\n",
"\n",
"print(\"Using TensorFlow version %s\" % (tf.__version__))\n",
"print(\"Using TensorFlow version %s\\n\" % (tf.__version__))\n",
"\n",
"\n",
"CONTINUOUS_COLUMNS = [\"I\"+str(i) for i in range(1,14)] # 1-13 inclusive\n",
@@ -66,15 +66,15 @@
"\n",
"FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS\n",
"\n",
"print(FEATURE_COLUMNS)\n",
"print('Feature columns are: ', FEATURE_COLUMNS, '\\n')\n",
"\n",
" # label is 1\n",
"sample = [ 0 , 2, 11, 5, 10262, 34, 2, 4, 5,0 , 1,0 , 5, \"be589b51\", \"287130e0\", \"cd7a7a22\", \"fb7334df\", \"25c83c98\",\"0\" , \"6cdb3998\", \"361384ce\", \"a73ee510\", \"3ff10fb2\", \"5874c9c9\", \"976cbd4c\", \"740c210d\", \"1adce6ef\", \"310d155b\", \"07eb8110\", \"07c540c4\", \"891589e7\", \"18259a83\", \"a458ea53\", \"a0ab60ca\",\"0\" , \"32c7478e\", \"a052b1ed\", \"9b3e8820\", \"8967c0d2\"]\n",
"\n",
"# label is 1\n",
"sample = [ 0, 127, 1, 3, 1683, 19, 26, 17, 475, 0, 9, 0, 3, \"05db9164\", \"8947f767\", \"11c9d79e\", \"52a787c8\", \"4cf72387\", \"fbad5c96\", \"18671b18\", \"0b153874\", \"a73ee510\", \"ceb10289\", \"77212bd7\", \"79507c6b\", \"7203f04e\", \"07d13a8f\", \"2c14c412\", \"49013ffe\", \"8efede7f\", \"bd17c3da\", \"f6a3e43b\", \"a458ea53\", \"35cd95c9\", \"ad3062eb\", \"c7dc6720\", \"3fdb382b\", \"010f6491\", \"49d68486\"]\n",
"\n",
"print(dict(zip(FEATURE_COLUMNS, sample)))"
"print('Columns and data as a dict: ', dict(zip(FEATURE_COLUMNS, sample)), '\\n')"
]
},
{
@@ -118,6 +118,23 @@
"You can try different values for `BATCH_SIZE` to see how they impact your results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### High-level structure of input functions for CSV-style data\n",
"1. Queue file(s)\n",
"2. Read a batch of data from the next file\n",
"3. Create record defaults, generally 0 for continuous values, and \"\" for categorical. You can use named types if you prefer\n",
"4. Decode the CSV and restructure it to be appropriate for the graph's input format\n",
" * `zip()` column headers with the data\n",
" * `pop()` off the label column(s)\n",
" * Remove/pop any unneeded column(s)\n",
" * Run `tf.expand_dims()` on categorical columns\n",
" 5. Return the pair: `(feature_dict, label_array)`\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -135,25 +152,25 @@
" # Reads out batch_size number of lines\n",
" key, value = reader.read_up_to(filename_queue, num_records=batch_size)\n",
" \n",
"# tf.Print(value)\n",
" \n",
" # 1 int label, 13 ints, 26 strings\n",
" cont_defaults = [ [0] for i in range(1,14) ]\n",
" cate_defaults = [ [\" \"] for i in range(1,27) ]\n",
" label_defaults = [ [0] ]\n",
" column_headers = TRAIN_DATA_COLUMNS\n",
" # The label is the first column of the data.\n",
" record_defaults = label_defaults + cont_defaults + cate_defaults\n",
"\n",
" # Decode CSV data that was just read out. \n",
" # Note that this does NOT return a dict, \n",
" # so we will need to zip it up with our headers\n",
" columns = tf.decode_csv(\n",
" value, record_defaults=record_defaults)\n",
" \n",
" # features is a dictionary that maps from column names to tensors of the data.\n",
" # income_bracket is the last column of the data. Note that this is NOT a dict.\n",
" # all_columns is a dictionary that maps from column names to tensors of the data.\n",
" all_columns = dict(zip(column_headers, columns))\n",
" \n",
" # Save the income_bracket column as our labels \n",
" # dict.pop() returns the popped array of income_bracket values\n",
" # Pop and save our labels \n",
" # dict.pop() returns the popped array of values; exactly what we need!\n",
" labels = all_columns.pop(LABEL_COLUMN[0])\n",
" \n",
" # the remaining columns are our features\n",
@@ -198,7 +215,8 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
@@ -212,9 +230,10 @@
"\n",
"wide_columns = []\n",
"for name in CATEGORICAL_COLUMNS:\n",
" wide_columns.append(tf.contrib.layers.sparse_column_with_hash_bucket(name, hash_bucket_size=1000))\n",
" wide_columns.append(tf.contrib.layers.sparse_column_with_hash_bucket(\n",
" name, hash_bucket_size=1000))\n",
"\n",
"print('Sparse columns configured')"
"print('Wide/Sparse columns configured')"
]
},
{
@@ -245,7 +264,7 @@
"for name in CONTINUOUS_COLUMNS:\n",
" deep_columns.append(tf.contrib.layers.real_valued_column(name))\n",
"\n",
"print('continuous columns configured')"
"print('deep/continuous columns configured')"
]
},
{
@@ -281,7 +300,8 @@
},
"outputs": [],
"source": [
"# No known Transformations. Can add some if desired. Examples from other dataset below.\n",
"# No known Transformations. Can add some if desired. \n",
"# Examples from other datasets are shown below.\n",
"\n",
"# age_buckets = tf.contrib.layers.bucketized_column(age,\n",
"# boundaries=[ 18, 25, 30, 35, 40, 45, 50, 55, 60, 65 ])\n",
@@ -292,7 +312,7 @@
"# country_occupation = tf.contrib.layers.crossed_column([native_country, occupation], \n",
"# hash_bucket_size=int(1e4))\n",
"\n",
"# print('Transformations complete')"
"print('Transformations complete')"
]
},
{
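If a transformation were wanted for this dataset, a hypothetical example in the same style (the column choices, boundaries, and bucket sizes below are illustrative, not tuned):

    # Hypothetical: bucketize one integer column and cross it with a
    # categorical column, then add both to the wide columns.
    I1 = tf.contrib.layers.real_valued_column("I1")
    I1_buckets = tf.contrib.layers.bucketized_column(
        I1, boundaries=[0, 1, 2, 5, 10, 50, 100])
    C1 = tf.contrib.layers.sparse_column_with_hash_bucket(
        "C1", hash_bucket_size=1000)
    I1_x_C1 = tf.contrib.layers.crossed_column(
        [I1_buckets, C1], hash_bucket_size=int(1e4))
    wide_columns += [I1_buckets, I1_x_C1]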
@@ -313,7 +333,8 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
@@ -342,7 +363,8 @@
"\n",
"# Embeddings for wide columns into deep columns\n",
"for col in wide_columns:\n",
" deep_columns.append(tf.contrib.layers.embedding_column(col, dimension=8))\n",
" deep_columns.append(tf.contrib.layers.embedding_column(col, \n",
" dimension=8))\n",
"\n",
"print('wide and deep columns configured')"
]
@@ -371,17 +393,20 @@
"outputs": [],
"source": [
"def create_model_dir(model_type):\n",
" # Returns something like models/model_WIDE_AND_DEEP_1493043407\n",
" return 'models/model_' + model_type + '_' + str(int(time.time()))\n",
"\n",
"# If new_model=False, pass in the desired model_dir \n",
"def get_model(model_type, new_model=False, model_dir=None):\n",
" if new_model or model_dir is None:\n",
" model_dir = create_model_dir(model_type) # Comment out this line to continue training a existing model\n",
"# Specify the desired model_dir \n",
"def get_model(model_type, model_dir):\n",
" print(\"Model directory = %s\" % model_dir)\n",
" \n",
" # There are more options here than shown here. \n",
" # We are using this to show additional checkpointing for illustrative purposes.\n",
" # In a real system with far more samples, you would \n",
" # likely choose to save checkpoints less frequently.\n",
" runconfig = tf.contrib.learn.RunConfig(\n",
" save_checkpoints_secs=120,\n",
" save_checkpoints_steps = None,\n",
" save_checkpoints_secs=None,\n",
" save_checkpoints_steps = 100,\n",
" )\n",
" \n",
" m = None\n",
@@ -397,7 +422,7 @@
" m = tf.contrib.learn.DNNClassifier(\n",
" model_dir=model_dir,\n",
" feature_columns=deep_columns,\n",
" hidden_units=[100, 50])\n",
" hidden_units=[100, 50, 25])\n",
"\n",
" # Combined Linear and Deep Classifier\n",
" if model_type == 'WIDE_AND_DEEP':\n",
@@ -410,12 +435,12 @@
" \n",
" print('estimator built')\n",
" \n",
" return m, model_dir\n",
" return m\n",
" \n",
"\n",
"MODEL_TYPE = 'WIDE_AND_DEEP'\n",
"model_dir = create_model_dir(model_type=MODEL_TYPE)\n",
"m, model_dir = get_model(model_type = 'WIDE_AND_DEEP', model_dir=model_dir)"
"m = get_model(model_type=MODEL_TYPE, model_dir=model_dir)"
]
},
{
@@ -426,6 +451,8 @@
},
"outputs": [],
"source": [
"# Showing that canned estimators return an instance of 'Evaluable'\n",
"\n",
"from tensorflow.contrib.learn.python.learn import evaluable\n",
"isinstance(m, evaluable.Evaluable)\n"
]
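`Experiment` wants an estimator that is both evaluable and trainable; a parallel check for the latter is sketched below (assuming this contrib version also exposes a `trainable` module alongside `evaluable`):

    # Sketch, mirroring the Evaluable check above.
    from tensorflow.contrib.learn.python.learn import trainable
    print(isinstance(m, trainable.Trainable))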
Expand All @@ -445,7 +472,12 @@
"If you don't want to download a new copy of the dataset each time your script runs, you can download it locally using \n",
"\n",
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/train.csv .\n",
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv ."
" gsutil cp gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv .\n",
" \n",
"If you want to download it manually, use\n",
"\n",
"- http://storageapis.google.com/dataset-uploader/criteo-kaggle/medium_version/eval.csv\n",
"- http://storageapis.google.com/dataset-uploader/criteo-kaggle/medium_version/train.csv"
]
},
{
@@ -456,11 +488,15 @@
},
"outputs": [],
"source": [
"# Use the cloud or local depending on your preference\n",
"\n",
"# CLOUD\n",
"train_file = \"gs://dataset-uploader/criteo-kaggle/medium_version/train.csv\"\n",
"eval_file = \"gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv\"\n",
"\n",
"# train_file = \"data_files/medium_version/train.csv\" # \"gs://dataset-uploader/criteo-kaggle/medium_version/train.csv\"\n",
"# eval_file = \"data_files/medium_version/eval.csv\" # \"gs://dataset-uploader/criteo-kaggle/medium_version/eval.csv\"\n"
"# LOCAL. Update these paths as appropriate\n",
"train_file = \"data_files/medium_version/train.csv\"\n",
"eval_file = \"data_files/medium_version/eval.csv\"\n"
]
},
{
@@ -474,6 +510,8 @@
"source": [
"%%time\n",
"\n",
"# This can be found with\n",
"# wc -l train.csv\n",
"train_sample_size = 800000\n",
"train_steps = train_sample_size/BATCH_SIZE # 8000/40 = 200\n",
"\n",
@@ -501,10 +539,11 @@
"source": [
"%%time\n",
"\n",
"eval_sample_size = 200000 # this can be found with a 'wc -l'\n",
"eval_sample_size = 200000 # this can be found with a 'wc -l eval.csv'\n",
"eval_steps = eval_sample_size/BATCH_SIZE # 2000/40 = 50\n",
"\n",
"results = m.evaluate(input_fn=generate_input_fn(eval_file), steps=eval_steps)\n",
"results = m.evaluate(input_fn=generate_input_fn(eval_file), \n",
" steps=eval_steps)\n",
"print('evaluate done')\n",
"\n",
"print('Accuracy: %s' % results['accuracy'])\n",
@@ -519,7 +558,6 @@
},
"outputs": [],
"source": [
"\n",
"def pred_fn():\n",
" sample = [ 0, 127, 1, 3, 1683, 19, 26, 17, 475, 0, 9, 0, 3, \"05db9164\", \"8947f767\", \"11c9d79e\", \"52a787c8\", \"4cf72387\", \"fbad5c96\", \"18671b18\", \"0b153874\", \"a73ee510\", \"ceb10289\", \"77212bd7\", \"79507c6b\", \"7203f04e\", \"07d13a8f\", \"2c14c412\", \"49013ffe\", \"8efede7f\", \"bd17c3da\", \"f6a3e43b\", \"a458ea53\", \"35cd95c9\", \"ad3062eb\", \"c7dc6720\", \"3fdb382b\", \"010f6491\", \"49d68486\"]\n",
" sample_dict = dict(zip(FEATURE_COLUMNS, sample))\n",
@@ -592,6 +630,7 @@
},
"outputs": [],
"source": [
"# Manually export\n",
"export_folder = m.export_savedmodel(\n",
" export_dir_base = model_dir + '/export',\n",
" input_fn=serving_input_fn\n",
@@ -631,7 +670,7 @@
" \n",
" train_input_fn = generate_input_fn(train_file, BATCH_SIZE)\n",
" eval_input_fn = generate_input_fn(eval_file)\n",
" my_model, model_dir = get_model(model_type=MODEL_TYPE, \n",
" my_model = get_model(model_type=MODEL_TYPE, \n",
" model_dir=output_dir)\n",
"\n",
" experiment = tf.contrib.learn.Experiment(\n",
@@ -658,6 +697,7 @@
},
"outputs": [],
"source": [
"# manually train and eval\n",
"%%time\n",
"\n",
"exp = experiment_fn(model_dir)\n",
@@ -689,11 +729,11 @@
"source": [
"# Conclusions\n",
"\n",
"In this Juypter notebook, we have configured, created, and evaluated a Wide & Deep machine learning model, that combines the powers of a Linear Classifier with a Deep Neural Network, using TensorFlow's tf.learn module.\n",
"In this Juypter notebook, we have configured, created, and evaluated a Wide & Deep machine learning model, that combines the powers of a Linear Classifier with a Deep Neural Network, using TensorFlow's Estimator and Experiment classes.\n",
"\n",
"With this working example in your toolbelt, you are ready to explore the wide (and deep) world of machine learning with TensorFlow! Some ideas to help you get going:\n",
"* Change the features we used today. Which columns do you think are correlated and should be crossed? Which ones do you think are just adding noise and could be removed to clean up the model?\n",
"* Swap in an entirely new dataset! There are many dataset available on the web, or use a dataset you possess! Check out https://archive.ics.uci.edu/ml to find your own dataset. "
"* Swap in an entirely new dataset! There are many datasets available on the web, or use a dataset you possess! Check out https://archive.ics.uci.edu/ml to find your own dataset. "
]
}
],