-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStats Project .R
214 lines (181 loc) · 6.46 KB
/
Stats Project .R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
library(tidyverse)
library(ggplot2)
library(dplyr)
library(lubridate)
library(patchwork)
library(stargazer)
# Load the by-hour bike sharing data.
# The file is addressed through an explicit path instead of setwd(), so running
# the script does not change the working directory of the R session.
dataDir <- "~/Documents/Documents/MSBA/Stats"
bikeshareData <- read.csv(
  file.path(dataDir, "Capital Bike Sharing data by hour.csv")
)
# Question 1: What is the trend in overall bike demand over the months of the
# year?

# Parse the `dteday` strings into Date values so month/year can be extracted.
bikeshareData <- bikeshareData %>%
  mutate(ShareDate = as.Date(dteday, format = "%Y-%m-%d"))

# Quick sanity checks: column types, presence of missing values, and the
# distribution of the rental count.
str(bikeshareData)
anyNA(bikeshareData)
hist(bikeshareData$cnt)

# Total rentals for every (month, year) pair, ordered chronologically.
# `.groups = "drop"` returns an ungrouped table and silences summarise()'s
# regrouping message.
bikeshareQ1 <- bikeshareData %>%
  mutate(month = month(ShareDate), year = year(ShareDate)) %>%
  group_by(month, year) %>%
  summarise(total = sum(cnt), .groups = "drop") %>%
  arrange(year, month)

# One point per month, coloured by year.
bikeshareQ1 %>%
  ggplot(aes(x = month, y = total, color = factor(year))) +
  geom_point() +
  # Explicit breaks guarantee one tick per month; `n.breaks` is only a hint.
  scale_x_continuous(breaks = 1:12) +
  scale_y_continuous(labels = scales::comma) +
  # Breaks are given as strings because factor(year) has character levels.
  scale_color_discrete(
    name = "Year",
    breaks = c("2011", "2012"),
    labels = c("2011", "2012")
  ) +
  # No `fill` label: only the colour aesthetic is mapped in this plot.
  labs(
    x = "Month",
    y = "Total Bike Rentals",
    title = "Total Bike Rentals per Month",
    subtitle = "for 2011 and 2012"
  )
# Question 2: The data science group at Capital bike share hypothesize that
# a. There must be high demand during office timings. Early morning and late
#    evening can have different trends (cyclist) and low demand from 10:00pm to
#    4:00am. Do you agree?
# b. Registered users demand more bikes on weekdays compared to the weekend or
#    holiday. Do you agree?

# a
# Total rentals for each hour of the day, split by the `workingday` flag.
# Rows with workingday != 1 are labelled "Weekend/Holiday" rather than
# "Weekend": the question above contrasts weekdays with "weekend or holiday".
# NOTE(review): assumes workingday is 0 for holidays as well as weekends -
# confirm against the dataset description.
bikeshareQ2a <- bikeshareData %>%
  group_by(hr, workingday) %>%
  summarise(total = sum(cnt), .groups = "drop") %>%
  mutate(
    WorkingDayText = ifelse(workingday == 1, "Working Day", "Weekend/Holiday")
  )

# One panel per type of day.
# NOTE(review): these are totals over all days of each type, so panel heights
# also reflect how many days of each type exist - consider means if the two
# panels are to be compared directly.
bikeshareQ2a %>%
  ggplot(aes(x = hr, y = total, color = WorkingDayText)) +
  geom_point() +
  facet_wrap(~WorkingDayText, scales = "free_x") +
  scale_y_continuous(labels = scales::comma) +
  scale_color_discrete(name = "Type of Day") +
  labs(
    x = "Hour of Day",
    y = "Total Bike Rentals",
    title = "Total Bike Rentals per Hour",
    subtitle = "for Weekends/Holidays and Working Days"
  )
# b
# Mean of `registered` over the by-hour rows, split by the `workingday` flag.
# NOTE(review): assumes workingday is 0 for holidays as well as weekends -
# confirm against the dataset description.
bikeshareQ2b <- bikeshareData %>%
  group_by(workingday) %>%
  summarise(avgReg = mean(registered)) %>%
  mutate(
    WorkingDayText = ifelse(workingday == 1, "Working Day", "Weekend/Holiday")
  )

# Bar chart of the two means. The labels describe what is actually plotted
# (an average of registered rentals, not a total), and the x-axis already shows
# text labels, so no 1/0 legend is needed in the subtitle.
bikeshareQ2b %>%
  ggplot(aes(x = WorkingDayText, y = avgReg)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Type of Day",
    y = "Average Registered Rentals per Hour",
    title = "Average Registered Bike Rentals",
    subtitle = "Working Days vs Weekends/Holidays"
  )
# Question 3: Is there any relationship between season and bike rental?
# Create a visualization displaying the relationship.

# Mean rental count per season code. The season name is derived from the code
# itself (not from row position), and stored as an ordered-level factor so the
# bars appear in Winter, Spring, Summer, Fall order instead of alphabetically.
# NOTE(review): assumes `season` takes the values 1-4; any other value would
# become NA here - confirm against the dataset description.
bikeshareQ3 <- bikeshareData %>%
  group_by(season) %>%
  summarise(avg = mean(cnt)) %>%
  mutate(
    SeasonName = factor(
      season,
      levels = 1:4,
      labels = c("Winter", "Spring", "Summer", "Fall")
    )
  )

# One bar per season; the legend is hidden because the x-axis already names
# each bar.
bikeshareQ3 %>%
  ggplot(aes(x = SeasonName, y = avg, fill = SeasonName)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Season Name",
    y = "Average Bike Rentals",
    fill = "",
    title = "Average Bike Rentals",
    subtitle = "per season"
  ) +
  theme(legend.position = "none")
# Question 4: What type of relationship do you see between weather and bike
# rental? Is the relationship the same for registered vs. casual users?

# Temp vs Total
# Mean rental count at each distinct normalized temperature value.
tempAvgRentals <- bikeshareData %>%
  group_by(temp) %>%
  summarise(avg = mean(cnt))

# Fit the trend line here, on the plotted data, so this section runs on its
# own. The Question 5 model object is only created further down the script,
# and referencing it from here fails on a top-to-bottom run.
tempTrend <- lm(avg ~ temp, data = tempAvgRentals)

# Scatter of the averages with the fitted line drawn across the panel.
tempAvgRentals %>%
  ggplot(aes(x = temp, y = avg)) +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  geom_abline(
    slope = coef(tempTrend)[["temp"]],
    intercept = coef(tempTrend)[["(Intercept)"]],
    color = "blue",
    linewidth = 1 # `size` is deprecated for lines in current ggplot2
  ) +
  labs(
    x = "Normalized Temperature",
    y = "Average Bike Rentals",
    title = "Average Bike Rentals",
    subtitle = "by Normalized Temperature"
  )
# Feeling Temp vs Total
# Mean rental count at each distinct normalized feeling-temperature value.
atempAvgRentals <- bikeshareData %>%
  group_by(atemp) %>%
  summarise(avg = mean(cnt))

# Fit the trend line here, on the plotted data, so this section runs on its
# own. The Question 6 model object is only created further down the script,
# and referencing it from here fails on a top-to-bottom run.
atempTrend <- lm(avg ~ atemp, data = atempAvgRentals)

# Scatter of the averages with the fitted line drawn across the panel.
atempAvgRentals %>%
  ggplot(aes(x = atemp, y = avg)) +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  geom_abline(
    slope = coef(atempTrend)[["atemp"]],
    intercept = coef(atempTrend)[["(Intercept)"]],
    color = "blue",
    linewidth = 1 # `size` is deprecated for lines in current ggplot2
  ) +
  labs(
    x = "Normalized Feeling Temperature",
    y = "Average Bike Rentals",
    title = "Average Bike Rentals",
    subtitle = "by Normalized Feeling Temperature"
  )
# Casual users: mean of `casual` at each distinct normalized temperature.
p1 <- ggplot(
  data = summarise(group_by(bikeshareData, temp), avg = mean(casual)),
  mapping = aes(x = temp, y = avg)
) +
  geom_point(color = "#F8766D") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Normalized Temperature",
    y = "Avg Bike Rentals",
    fill = "",
    title = "Temp vs Avg Bike Rentals",
    subtitle = "for Casual Users"
  )

# Registered users: mean of `registered` at each distinct normalized
# temperature. The y-axis title is left blank in this panel.
p2 <- ggplot(
  data = summarise(group_by(bikeshareData, temp), avg = mean(registered)),
  mapping = aes(x = temp, y = avg)
) +
  geom_point(color = "#619CFF") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Normalized Temperature",
    y = "",
    fill = "",
    title = "Temp vs Avg Bike Rentals",
    subtitle = " for Registered Users"
  )

# patchwork: place the two panels side by side.
p1 + p2
# Question 5: Fit a linear model predicting the total bike rental demand from
# daily temperature. What kind of insights can you generate? (Make sure to
# write the linear model and interpret it in the context of the data.)

# Response: mean rental count at each distinct normalized temperature value.
bikeshareModel <- summarise(group_by(bikeshareData, temp), avg = mean(cnt))

# Simple linear regression of the averaged count on normalized temperature.
bikeShareTempModel <- lm(formula = avg ~ temp, data = bikeshareModel)

# Coefficients, full summary, the raw structure of the fit object, and the
# standard lm diagnostic plots.
bikeShareTempModel
summary(bikeShareTempModel)
str(bikeShareTempModel)
plot(bikeShareTempModel)
# Question 6: Fit another linear model predicting total daily bike rentals from
# daily feeling temperature. Write the linear model, interpret the slope, etc.
# Is the temperature or feeling temperature a better predictor of bike rentals?

# Response: mean rental count at each distinct normalized feeling-temperature
# value.
bikeshareModel2 <- bikeshareData %>%
  group_by(atemp) %>%
  summarise(avg = mean(cnt))

# Simple linear regression of the averaged count on feeling temperature.
bikeShareAtempModel <- lm(avg ~ atemp, data = bikeshareModel2)
bikeShareAtempModel
summary(bikeShareAtempModel)
str(bikeShareAtempModel)

# Side-by-side text table of the two fits: coefficients with significance
# stars, plus residual standard error, R-squared and adjusted R-squared.
# TRUE/FALSE are spelled out because the T/F shorthands are ordinary variables
# that can be reassigned.
stargazer(
  bikeShareTempModel, bikeShareAtempModel,
  type = "text",
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  report = "c*",
  df = FALSE,
  model.numbers = FALSE,
  keep.stat = c("ser", "rsq", "adj.rsq"),
  column.labels = c("Normalized Temp vs Avg", "Feeling Temp vs Avg")
)