forked from JakeKandell/NBA-Predict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateModel.py
203 lines (132 loc) · 7.88 KB
/
createModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# createModel.py - Used to train, test, and create the model
# Call createModel() to generate a new model
# May need to edit which lines are commented out based on what range of game data you would like to use
from standardizeStats import basicOrAdvancedStatZScore, basicOrAdvancedStatStandardDeviation, basicOrAdvancedStatMean
from getDailyMatchups import dailyMatchupsPast
from getStats import getStatsForTeam
from availableStats import availableStats
from configureCWD import setCurrentWorkingDirectory
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import pickle
from datetime import timedelta, date
# Calculates the zScore differential between two teams for a specified stat
def zScoreDifferential(observedStatHome, observedStatAway, mean, standardDeviation):
homeTeamZScore = basicOrAdvancedStatZScore(observedStatHome, mean, standardDeviation)
awayTeamZScore = basicOrAdvancedStatZScore(observedStatAway, mean, standardDeviation)
differenceInZScore = homeTeamZScore - awayTeamZScore
return differenceInZScore
# Used to combine and format all the data to be put into a pandas dataframe
# dailyGames should be list where index 0 is a dictionary holding the games and index 1 is a list holding the results
def infoToDataFrame(dailyGames, meanDict, standardDeviationDict, startDate, endDate, season):
fullDataFrame = []
gameNumber = 0 # Counter to match the result of the game with the correct game
dailyResults = dailyGames[1] # List of results for the games
for homeTeam,awayTeam in dailyGames[0].items():
homeTeamStats = getStatsForTeam(homeTeam, startDate, endDate, season)
awayTeamStats = getStatsForTeam(awayTeam, startDate, endDate, season)
currentGame = [homeTeam,awayTeam]
for stat,statType in availableStats.items(): # Finds Z Score Dif for stats listed above and adds them to list
zScoreDif = zScoreDifferential(homeTeamStats[stat], awayTeamStats[stat], meanDict[stat], standardDeviationDict[stat])
currentGame.append(zScoreDif)
if dailyResults[gameNumber] == 'W': # Sets result to 1 if a win
result = 1
else: # Sets result to 0 if loss
result = 0
currentGame.append(result)
gameNumber += 1
print(currentGame)
fullDataFrame.append(currentGame) # Adds this list to list of all games on specified date
return(fullDataFrame)
# Function that allows iterating through specified start date to end date
def daterange(startDate, endDate):
for n in range(int ((endDate - startDate).days)):
yield startDate + timedelta(n)
# Returns a list. Index 0 is a dict holding mean for each stat. Index 1 is a dict holding standard deviation for each stat.
def createMeanStandardDeviationDicts(startDate, endDate, season):
meanDict = {}
standardDeviationDict = {}
# Loops through and inputs standard deviation and mean for each stat into dict
for stat, statType in availableStats.items():
statMean = basicOrAdvancedStatMean(startDate, endDate, stat, statType, season)
meanDict.update({stat: statMean})
statStandardDeviation = basicOrAdvancedStatStandardDeviation(startDate, endDate, stat, statType, season)
standardDeviationDict.update({stat: statStandardDeviation})
bothDicts = []
bothDicts.append(meanDict)
bothDicts.append(standardDeviationDict)
return bothDicts
# Loops through every date between start and end and appends each game to a singular list to be returned
# season should be in format 'yyyy-yy' and startOfSeason should be in format 'mm/dd/yyyy'
def getTrainingSet(startYear, startMonth, startDay, endYear, endMonth, endDay, season, startOfSeason):
startDate = date(startYear, startMonth, startDay)
endDate = date(endYear, endMonth, endDay)
startDateFormatted = startDate.strftime("%m/%d/%Y") # Formats start date in mm/dd/yyyy
allGames = []
for singleDate in daterange(startDate, endDate):
currentDate = singleDate.strftime("%m/%d/%Y") # Formats current date in mm/dd/yyyy
print(currentDate)
previousDay = singleDate - timedelta(days=1)
previousDayFormatted = previousDay.strftime("%m/%d/%Y")
meanAndStandardDeviationDicts = createMeanStandardDeviationDicts(startOfSeason, previousDayFormatted, season)
meanDict = meanAndStandardDeviationDicts[0] # Dict in format {stat:statMean}
standardDeviationDict = meanAndStandardDeviationDicts[1] # Dict in format {stat:statStDev}
currentDayGames = dailyMatchupsPast(currentDate, season) # Finds games on current date in loop
currentDayGamesAndStatsList = infoToDataFrame(currentDayGames, meanDict, standardDeviationDict, startOfSeason, previousDayFormatted, season) # Formats Z Score difs for games on current date in loop
for game in currentDayGamesAndStatsList: # Adds game with stats to list of all games
game.append(currentDate)
allGames.append(game)
print(allGames)
return(allGames)
# Returns a dataframe from list of games with z score differentials
def createDataFrame(listOfGames):
games = pd.DataFrame(
listOfGames,
columns=['Home', 'Away', 'W_PCT', 'REB', 'TOV', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'TS_PCT', 'Result', 'Date']
)
print(games)
return(games)
# Creates the logistic regression model and tests accuracy
def performLogReg(dataframe):
# Update if new stats are added
featureColumns = ['W_PCT', 'REB', 'TOV', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'TS_PCT']
X = dataframe[featureColumns] # Features
Y = dataframe.Result # Target Variable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)
logreg = LogisticRegression()
logreg.fit(X_train, Y_train) # Fits model with data
Y_pred = logreg.predict(X_test)
confusionMatrix = metrics.confusion_matrix(Y_test, Y_pred) # Diagonals tell you correct predictions
# Code below prints model accuracy information
print('Coefficient Information:')
for i in range(len(featureColumns)): # Prints each feature next to its corresponding coefficient in the model
logregCoefficients = logreg.coef_
currentFeature = featureColumns[i]
currentCoefficient = logregCoefficients[0][i]
print(currentFeature + ': ' + str(currentCoefficient))
print('----------------------------------')
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
print("Precision:", metrics.precision_score(Y_test, Y_pred))
print("Recall:", metrics.recall_score(Y_test, Y_pred))
print('----------------------------------')
print('Confusion Matrix:')
print(confusionMatrix)
return logreg
# Saves the model in folder to be used in future
# filename should be end in '.pkl'
def saveModel(model, filename):
# Change to where you want to save the model
setCurrentWorkingDirectory('SavedModels')
with open(filename, 'wb') as file:
pickle.dump(model, file)
# Used to generate new logistic regression models
# Can import the statistics and predictions for each game from a csv file or can be created on their own
def createModel(startYear=None, startMonth=None, startDay=None, endYear=None, endMonth=None, endDay=None, season='2018-19', startOfSeason = '10/16/2018', filename='model.pkl'):
# allGames = getTrainingSet(startYear, startMonth, startDay, endYear, endMonth, endDay, season, startOfSeason) # Unnecessary if using data from CSV file
# allGamesDataframe = createDataFrame(allGames) # Unnecessary if using data from CSV file
setCurrentWorkingDirectory('Data')
allGamesDataframe = pd.read_csv('COMBINEDgamesWithInfo2016-19.csv') # Should be commented out if needing to obtain data on different range of games
logRegModel = performLogReg(allGamesDataframe)
saveModel(logRegModel, filename)