-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetting_started_emily.py
123 lines (78 loc) · 3.04 KB
/
getting_started_emily.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'''
Playing with SF Crime dataset from Kaggle
Feb 19 2016
Group Work Session with DIG peeps
'''
import pandas as pd
import numpy as np
train_df = pd.read_csv('./data/train.csv')
train_df['Dates'] = pd.to_datetime(train_df['Dates'])
# how many arrest categories?
# 39
len(train_df.Category.unique())
# how many types of resolutions
len(train_df.Resolution.unique())
# 17
train_df.Category.value_counts()
'''
The variable that we want to predict is category
'''
# 10 different Police Districts
len(train_df.PdDistrict.unique())
# which districts have the crime per type?
train_df['PdDistrict'] = train_df['PdDistrict'].astype('category')
train_df['Category'] = train_df['Category'].astype('category')
crime_by_district = train_df.groupby(['PdDistrict'])['Category'].value_counts()
crime_by_day_of_week = train_df.groupby(['DayOfWeek'])['Category'].value_counts()
# in the training set we have 878K observations of 9 variables
# maybe try making classifier based on district and day of week just to see how it does
'''
do some feature engineering and get text labeled data into integers
ex - need category to correspond to number
'''
from sklearn import preprocessing
# use label encoder to transform non numerical labels into numerical labels
le = preprocessing.LabelEncoder()
le.fit(train_df['Category'])
train_df['cat_trans'] = le.transform(train_df.Category)
le.fit(train_df['PdDistrict'])
train_df['district_trans'] = le.transform(train_df.PdDistrict)
le.fit(train_df['Address'])
train_df['address_trans'] = le.transform(train_df.Address)
le.fit(train_df['DayOfWeek'])
train_df['day_week_trans'] = le.transform(train_df.DayOfWeek)
train_df['Month'] = train_df.Dates.map(lambda x: x.month)
tr_df1 = train_df[::2]
tr_df2 = train_df[1::2]
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
features_train = tr_df1[['district_trans', 'address_trans', 'day_week_trans']]
labels_train = tr_df1['cat_trans']
clf = SGDClassifier()
clf.fit(features_train, labels_train)
features_test = tr_df2[['district_trans', 'address_trans', 'day_week_trans']]
labels_test = tr_df2['cat_trans']
pred = clf.predict(features_test)
accuracy_score(pred, labels_test)
# 0.03 accuracy! TERRIBLE!
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators= 100, max_depth= 30, min_samples_split=30)
clf2.fit(features_train, labels_train)
pred2 = clf2.predict(features_test)
accuracy_score(pred2, labels_test)
# 0.24 so already better - with no parameters
# clf2 = RandomForestClassifier(n_estimators= 100, max_depth= 30, min_samples_split=30)
# gave accuracy of 0.26
'''
let's try getting better with features
'''
# make time of day feature
train_df['hour'] = train_df.Dates.map(lambda x: x.hour)
crime_by_hour = train_df.groupby(['hour'])['Category'].value_counts()
# 8AM-5PM = daytime
# 5PM - 10PM = evening
# 10PM - 8AM = late night
pd.cut(ages, bins=[0, 18, 35, 70])
train_df['time_day'] = pd.cut(train_df.hour, bins=[0, 8, 17, 22])
pd.cut(train_df.hour, bins=[0, 8, 17, 22, 24])
# this isn't working - sadface