Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
asdaraujo committed Aug 15, 2019
0 parents commit 1a46a88
Show file tree
Hide file tree
Showing 81 changed files with 7,952 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea/
542 changes: 542 additions & 0 deletions README.adoc

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions cdsw.iot_exp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
import pandas as pd
import pickle
import cdsw
import os
import time

spark = SparkSession.builder \
.appName("Predictive Maintenance") \
.getOrCreate()

# read 21 colunms large file from HDFS
schemaData = StructType([StructField("0", DoubleType(), True),
StructField("1", DoubleType(), True),
StructField("2", DoubleType(), True),
StructField("3", DoubleType(), True),
StructField("4", DoubleType(), True),
StructField("5", DoubleType(), True),
StructField("6", DoubleType(), True),
StructField("7", DoubleType(), True),
StructField("8", DoubleType(), True),
StructField("9", DoubleType(), True),
StructField("10", DoubleType(), True),
StructField("11", DoubleType(), True),
StructField("12", IntegerType(), True)])

iot_data = spark.read.schema(schemaData).csv('/user/'
+ os.environ['HADOOP_USER_NAME']
+ '/historical_iot.txt')


# Create Pipeline
label_indexer = StringIndexer(inputCol = '12', outputCol = 'label')
plan_indexer = StringIndexer(inputCol = '1', outputCol = '1_indexed')
pipeline = Pipeline(stages=[plan_indexer, label_indexer])
indexed_data = pipeline.fit(iot_data).transform(iot_data)
(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])

pdTrain = train_data.toPandas()
pdTest = test_data.toPandas()

# 12 features
features = ["1_indexed",
"0",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11"]

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = 'gini'

randF=RandomForestClassifier(n_jobs=10,
n_estimators=param_numTrees,
max_depth=param_maxDepth,
criterion = param_impurity,
random_state=0)

cdsw.track_metric("numTrees",param_numTrees)
cdsw.track_metric("maxDepth",param_maxDepth)
cdsw.track_metric("impurity",param_impurity)

# Fit and Predict
randF.fit(pdTrain[features], pdTrain['label'])
predictions=randF.predict(pdTest[features])

#temp = randF.predict_proba(pdTest[features])

pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction'])

list(zip(pdTrain[features], randF.feature_importances_))


y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score (y_true, y_scores)
print(auroc, ap)

cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

pickle.dump(randF, open("iot_model.pkl","wb"))

cdsw.track_file("iot_model.pkl")

time.sleep(15)
print("Slept for 15 seconds.")
9 changes: 9 additions & 0 deletions cdsw.iot_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import pickle
import numpy as np

model = pickle.load(open('iot_model.pkl', 'rb'))

def predict(args):
account=np.array(args["feature"].split(",")).reshape(1,-1)
return {"result" : model.predict(account)[0]}

Loading

0 comments on commit 1a46a88

Please sign in to comment.