From 8560e7afb93ed991b68697e8f523fd1e257bca85 Mon Sep 17 00:00:00 2001
From: Andre Araujo
Date: Mon, 11 Nov 2019 22:00:37 -0800
Subject: [PATCH] Convert file format from DOS to Unix

---
 cdsw.iot_exp.py | 206 ++++++++++++++++++++++++------------------------
 1 file changed, 103 insertions(+), 103 deletions(-)

diff --git a/cdsw.iot_exp.py b/cdsw.iot_exp.py
index ad3a539..2eff660 100644
--- a/cdsw.iot_exp.py
+++ b/cdsw.iot_exp.py
@@ -1,103 +1,103 @@
-from pyspark.sql import SparkSession
-from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
-from pyspark.ml.feature import StringIndexer
-from pyspark.ml import Pipeline
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import roc_auc_score, average_precision_score
-import numpy as np
-import pandas as pd
-import pickle
-import cdsw
-import os
-import sys
-import time
-
-spark = SparkSession.builder \
-    .appName("Predictive Maintenance") \
-    .getOrCreate()
-
-# Read the 13-column historical IoT data file from HDFS
-schemaData = StructType([StructField("0", DoubleType(), True),
-                         StructField("1", DoubleType(), True),
-                         StructField("2", DoubleType(), True),
-                         StructField("3", DoubleType(), True),
-                         StructField("4", DoubleType(), True),
-                         StructField("5", DoubleType(), True),
-                         StructField("6", DoubleType(), True),
-                         StructField("7", DoubleType(), True),
-                         StructField("8", DoubleType(), True),
-                         StructField("9", DoubleType(), True),
-                         StructField("10", DoubleType(), True),
-                         StructField("11", DoubleType(), True),
-                         StructField("12", IntegerType(), True)])
-
-iot_data = spark.read.schema(schemaData).csv('/user/'
-                                             + os.environ['HADOOP_USER_NAME']
-                                             + '/historical_iot.txt')
-
-
-# Create Pipeline: index the categorical plan column and the label column
-label_indexer = StringIndexer(inputCol='12', outputCol='label')
-plan_indexer = StringIndexer(inputCol='1', outputCol='1_indexed')
-pipeline = Pipeline(stages=[plan_indexer, label_indexer])
-indexed_data = pipeline.fit(iot_data).transform(iot_data)
-(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])
-
-pdTrain = train_data.toPandas()
-pdTest = test_data.toPandas()
-
-# 12 features
-features = ["1_indexed",
-            "0",
-            "2",
-            "3",
-            "4",
-            "5",
-            "6",
-            "7",
-            "8",
-            "9",
-            "10",
-            "11"]
-
-param_numTrees = int(sys.argv[1])
-param_maxDepth = int(sys.argv[2])
-param_impurity = 'gini'
-
-randF = RandomForestClassifier(n_jobs=10,
-                               n_estimators=param_numTrees,
-                               max_depth=param_maxDepth,
-                               criterion=param_impurity,
-                               random_state=0)
-
-cdsw.track_metric("numTrees", param_numTrees)
-cdsw.track_metric("maxDepth", param_maxDepth)
-cdsw.track_metric("impurity", param_impurity)
-
-# Fit and Predict
-randF.fit(pdTrain[features], pdTrain['label'])
-predictions = randF.predict(pdTest[features])
-
-#temp = randF.predict_proba(pdTest[features])
-
-print(pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']))
-
-print(list(zip(pdTrain[features], randF.feature_importances_)))
-
-
-y_true = pdTest['label']
-y_scores = predictions
-auroc = roc_auc_score(y_true, y_scores)
-ap = average_precision_score(y_true, y_scores)
-print(auroc, ap)
-
-cdsw.track_metric("auroc", auroc)
-cdsw.track_metric("ap", ap)
-
-with open("iot_model.pkl", "wb") as pkl_file:
-    pickle.dump(randF, pkl_file)
-
-cdsw.track_file("iot_model.pkl")
-
-time.sleep(15)
-print("Slept for 15 seconds.")
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
+from pyspark.ml.feature import StringIndexer
+from pyspark.ml import Pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import roc_auc_score, average_precision_score
+import numpy as np
+import pandas as pd
+import pickle
+import cdsw
+import os
+import sys
+import time
+
+spark = SparkSession.builder \
+    .appName("Predictive Maintenance") \
+    .getOrCreate()
+
+# Read the 13-column historical IoT data file from HDFS
+schemaData = StructType([StructField("0", DoubleType(), True),
+                         StructField("1", DoubleType(), True),
+                         StructField("2", DoubleType(), True),
+                         StructField("3", DoubleType(), True),
+                         StructField("4", DoubleType(), True),
+                         StructField("5", DoubleType(), True),
+                         StructField("6", DoubleType(), True),
+                         StructField("7", DoubleType(), True),
+                         StructField("8", DoubleType(), True),
+                         StructField("9", DoubleType(), True),
+                         StructField("10", DoubleType(), True),
+                         StructField("11", DoubleType(), True),
+                         StructField("12", IntegerType(), True)])
+
+iot_data = spark.read.schema(schemaData).csv('/user/'
+                                             + os.environ['HADOOP_USER_NAME']
+                                             + '/historical_iot.txt')
+
+
+# Create Pipeline: index the categorical plan column and the label column
+label_indexer = StringIndexer(inputCol='12', outputCol='label')
+plan_indexer = StringIndexer(inputCol='1', outputCol='1_indexed')
+pipeline = Pipeline(stages=[plan_indexer, label_indexer])
+indexed_data = pipeline.fit(iot_data).transform(iot_data)
+(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])
+
+pdTrain = train_data.toPandas()
+pdTest = test_data.toPandas()
+
+# 12 features
+features = ["1_indexed",
+            "0",
+            "2",
+            "3",
+            "4",
+            "5",
+            "6",
+            "7",
+            "8",
+            "9",
+            "10",
+            "11"]
+
+param_numTrees = int(sys.argv[1])
+param_maxDepth = int(sys.argv[2])
+param_impurity = 'gini'
+
+randF = RandomForestClassifier(n_jobs=10,
+                               n_estimators=param_numTrees,
+                               max_depth=param_maxDepth,
+                               criterion=param_impurity,
+                               random_state=0)
+
+cdsw.track_metric("numTrees", param_numTrees)
+cdsw.track_metric("maxDepth", param_maxDepth)
+cdsw.track_metric("impurity", param_impurity)
+
+# Fit and Predict
+randF.fit(pdTrain[features], pdTrain['label'])
+predictions = randF.predict(pdTest[features])
+
+#temp = randF.predict_proba(pdTest[features])
+
+print(pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']))
+
+print(list(zip(pdTrain[features], randF.feature_importances_)))
+
+
+y_true = pdTest['label']
+y_scores = predictions
+auroc = roc_auc_score(y_true, y_scores)
+ap = average_precision_score(y_true, y_scores)
+print(auroc, ap)
+
+cdsw.track_metric("auroc", auroc)
+cdsw.track_metric("ap", ap)
+
+with open("iot_model.pkl", "wb") as pkl_file:
+    pickle.dump(randF, pkl_file)
+
+cdsw.track_file("iot_model.pkl")
+
+time.sleep(15)
+print("Slept for 15 seconds.")
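
Note (after the diff, so ignored by git am): the script reads its two
hyperparameters from positional arguments (number of trees, then max depth),
so an experiment run must supply both, e.g. "20 20" in the Arguments field.
Below is a minimal sketch of loading the tracked model file after a run;
the input row is a fabricated, purely illustrative 12-feature reading
ordered like the `features` list (indexed plan column first, then the
remaining sensor columns).

    import pickle

    # Load the random forest saved by cdsw.iot_exp.py (assumes the
    # experiment has already written iot_model.pkl locally).
    with open("iot_model.pkl", "rb") as pkl_file:
        model = pickle.load(pkl_file)

    # One hypothetical sensor reading (illustrative values only).
    sample = [[0.0, 12.5, 0.3, 48.0, 7.2, 3.1, 65.0, 6.6, 0.5, 20.0, 1.8, 0.1]]
    print(model.predict(sample))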