From 8560e7afb93ed991b68697e8f523fd1e257bca85 Mon Sep 17 00:00:00 2001
From: Andre Araujo
Date: Mon, 11 Nov 2019 22:00:37 -0800
Subject: [PATCH] Convert file format from DOS to Unix

---
 cdsw.iot_exp.py | 206 ++++++++++++++++++++++++------------------------
 1 file changed, 103 insertions(+), 103 deletions(-)

diff --git a/cdsw.iot_exp.py b/cdsw.iot_exp.py
index ad3a539..2eff660 100644
--- a/cdsw.iot_exp.py
+++ b/cdsw.iot_exp.py
@@ -1,103 +1,103 @@
-from pyspark.sql import SparkSession
-from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
-from pyspark.ml.feature import StringIndexer
-from pyspark.ml import Pipeline
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import roc_auc_score, average_precision_score
-import numpy as np
-import pandas as pd
-import pickle
-import cdsw
-import os
-import sys
-import time
-
-spark = SparkSession.builder \
-    .appName("Predictive Maintenance") \
-    .getOrCreate()
-
-# Read the 13-column historical IoT data file from HDFS
-schemaData = StructType([StructField("0", DoubleType(), True),
-                         StructField("1", DoubleType(), True),
-                         StructField("2", DoubleType(), True),
-                         StructField("3", DoubleType(), True),
-                         StructField("4", DoubleType(), True),
-                         StructField("5", DoubleType(), True),
-                         StructField("6", DoubleType(), True),
-                         StructField("7", DoubleType(), True),
-                         StructField("8", DoubleType(), True),
-                         StructField("9", DoubleType(), True),
-                         StructField("10", DoubleType(), True),
-                         StructField("11", DoubleType(), True),
-                         StructField("12", IntegerType(), True)])
-
-iot_data = spark.read.schema(schemaData).csv('/user/'
-                                             + os.environ['HADOOP_USER_NAME']
-                                             + '/historical_iot.txt')
-
-
-# Create Pipeline: index the categorical plan column and the label column
-label_indexer = StringIndexer(inputCol='12', outputCol='label')
-plan_indexer = StringIndexer(inputCol='1', outputCol='1_indexed')
-pipeline = Pipeline(stages=[plan_indexer, label_indexer])
-indexed_data = pipeline.fit(iot_data).transform(iot_data)
-(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])
-
-pdTrain = train_data.toPandas()
-pdTest = test_data.toPandas()
-
-# 12 features
-features = ["1_indexed",
-            "0",
-            "2",
-            "3",
-            "4",
-            "5",
-            "6",
-            "7",
-            "8",
-            "9",
-            "10",
-            "11"]
-
-param_numTrees = int(sys.argv[1])
-param_maxDepth = int(sys.argv[2])
-param_impurity = 'gini'
-
-randF = RandomForestClassifier(n_jobs=10,
-                               n_estimators=param_numTrees,
-                               max_depth=param_maxDepth,
-                               criterion=param_impurity,
-                               random_state=0)
-
-cdsw.track_metric("numTrees", param_numTrees)
-cdsw.track_metric("maxDepth", param_maxDepth)
-cdsw.track_metric("impurity", param_impurity)
-
-# Fit and Predict
-randF.fit(pdTrain[features], pdTrain['label'])
-predictions = randF.predict(pdTest[features])
-
-#temp = randF.predict_proba(pdTest[features])
-
-print(pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']))
-
-print(list(zip(pdTrain[features], randF.feature_importances_)))
-
-
-y_true = pdTest['label']
-y_scores = predictions
-auroc = roc_auc_score(y_true, y_scores)
-ap = average_precision_score(y_true, y_scores)
-print(auroc, ap)
-
-cdsw.track_metric("auroc", auroc)
-cdsw.track_metric("ap", ap)
-
-with open("iot_model.pkl", "wb") as pkl_file:
-    pickle.dump(randF, pkl_file)
-
-cdsw.track_file("iot_model.pkl")
-
-time.sleep(15)
-print("Slept for 15 seconds.")
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
+from pyspark.ml.feature import StringIndexer
+from pyspark.ml import Pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import roc_auc_score, average_precision_score
+import numpy as np
+import pandas as pd
+import pickle
+import cdsw
+import os
+import sys
+import time
+
+spark = SparkSession.builder \
+    .appName("Predictive Maintenance") \
+    .getOrCreate()
+
+# Read the 13-column historical IoT data file from HDFS
+schemaData = StructType([StructField("0", DoubleType(), True),
+                         StructField("1", DoubleType(), True),
+                         StructField("2", DoubleType(), True),
+                         StructField("3", DoubleType(), True),
+                         StructField("4", DoubleType(), True),
+                         StructField("5", DoubleType(), True),
+                         StructField("6", DoubleType(), True),
+                         StructField("7", DoubleType(), True),
+                         StructField("8", DoubleType(), True),
+                         StructField("9", DoubleType(), True),
+                         StructField("10", DoubleType(), True),
+                         StructField("11", DoubleType(), True),
+                         StructField("12", IntegerType(), True)])
+
+iot_data = spark.read.schema(schemaData).csv('/user/'
+                                             + os.environ['HADOOP_USER_NAME']
+                                             + '/historical_iot.txt')
+
+
+# Create Pipeline: index the categorical plan column and the label column
+label_indexer = StringIndexer(inputCol='12', outputCol='label')
+plan_indexer = StringIndexer(inputCol='1', outputCol='1_indexed')
+pipeline = Pipeline(stages=[plan_indexer, label_indexer])
+indexed_data = pipeline.fit(iot_data).transform(iot_data)
+(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])
+
+pdTrain = train_data.toPandas()
+pdTest = test_data.toPandas()
+
+# 12 features
+features = ["1_indexed",
+            "0",
+            "2",
+            "3",
+            "4",
+            "5",
+            "6",
+            "7",
+            "8",
+            "9",
+            "10",
+            "11"]
+
+param_numTrees = int(sys.argv[1])
+param_maxDepth = int(sys.argv[2])
+param_impurity = 'gini'
+
+randF = RandomForestClassifier(n_jobs=10,
+                               n_estimators=param_numTrees,
+                               max_depth=param_maxDepth,
+                               criterion=param_impurity,
+                               random_state=0)
+
+cdsw.track_metric("numTrees", param_numTrees)
+cdsw.track_metric("maxDepth", param_maxDepth)
+cdsw.track_metric("impurity", param_impurity)
+
+# Fit and Predict
+randF.fit(pdTrain[features], pdTrain['label'])
+predictions = randF.predict(pdTest[features])
+
+#temp = randF.predict_proba(pdTest[features])
+
+print(pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction']))
+
+print(list(zip(pdTrain[features], randF.feature_importances_)))
+
+
+y_true = pdTest['label']
+y_scores = predictions
+auroc = roc_auc_score(y_true, y_scores)
+ap = average_precision_score(y_true, y_scores)
+print(auroc, ap)
+
+cdsw.track_metric("auroc", auroc)
+cdsw.track_metric("ap", ap)
+
+with open("iot_model.pkl", "wb") as pkl_file:
+    pickle.dump(randF, pkl_file)
+
+cdsw.track_file("iot_model.pkl")
+
+time.sleep(15)
+print("Slept for 15 seconds.")
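
Note (after the diff, so ignored by git am): the script reads its two
hyperparameters from positional arguments (number of trees, then max depth),
so an experiment run must supply both, e.g. "20 20" in the Arguments field.
Below is a minimal sketch of loading the tracked model file after a run;
the input row is a fabricated, purely illustrative 12-feature reading
ordered like the `features` list (indexed plan column first, then the
remaining sensor columns).

    import pickle

    # Load the random forest saved by cdsw.iot_exp.py (assumes the
    # experiment has already written iot_model.pkl locally).
    with open("iot_model.pkl", "rb") as pkl_file:
        model = pickle.load(pkl_file)

    # One hypothetical sensor reading (illustrative values only).
    sample = [[0.0, 12.5, 0.3, 48.0, 7.2, 3.1, 65.0, 6.6, 0.5, 20.0, 1.8, 0.1]]
    print(model.predict(sample))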