-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstatistical-modelling-1a.R
155 lines (115 loc) · 5.11 KB
/
statistical-modelling-1a.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# library imports
library(ggplot2)
library(MCMCpack)
library(dplyr)
# dataset is taken from https://www.kaggle.com/zynicide/wine-reviews
# The winemag-data-130k-v2.csv dataset
# This dataset has the following colummns in this order
# country, description, designation, points, price, province, region_1, region_2,
# taster_name, taster_twitter_handle, title, variety, winery
wine_df = read.csv('data/winemag-data-130k-v2.csv')
# definitely unwanted variables for the modelling
wine_df = within(wine_df, remove('X', 'description', 'designation', 'taster_name', 'taster_twitter_handle', 'title'))
# these variables might be useful in the future, but ignorning them for now.
wine_df = within(wine_df, remove('province', 'region_1', 'region_2', 'winery'))
# currently the data frame contains the following columns "country, points, price and variety" in this order
# where X = {country, price and variety} and Y = points
View(wine_df)
mean(wine_df$points)
sd(wine_df$points)
# this filters the data as per Q1, Sauvignon Blanc Wine from South Africa, Priced at $15
wine_sa_sb_15 = filter(wine_df, variety=='Sauvignon Blanc' & country == 'South Africa' & price == 15)
View(wine_sa_sb_15)
## set up ggplot for histogram and density plots
ggplot(wine_sa_sb_15, aes(x = points, ..density..)) +
## add histogram
geom_histogram(bins = 5, aes(alpha = 0.5, colour = "blue"), show.legend = FALSE) +
geom_density(fill = "pink", aes(alpha = 0.5), show.legend = FALSE) +
xlab("Wine Points") + ylab("density") +
ggtitle("Distribution of Points for Sauvignon Blanc")
# this filters the data as per Q1a, Chardonnay Wine from Chile, Priced at $15
wine_ch_ch_15 = filter(wine_df, variety=='Chardonnay' & country == 'Chile' & price == 15)
View(wine_ch_ch_15)
## set up ggplot for histogram and density plots
ggplot(wine_ch_ch_15, aes(x = points, ..density..)) +
## add histogram
geom_histogram(bins = 5, aes(alpha = 0.5, colour = "blue"), show.legend = FALSE) +
geom_density(fill = "pink", aes(alpha = 0.5), show.legend = FALSE) +
xlab("Wine Points") + ylab("density") +
ggtitle("Distribution of Points for Chardonnay")
# combining the 2 data frames
comb_wine = rbind(wine_ch_ch_15, wine_sa_sb_15)
ggplot(comb_wine) + geom_boxplot(aes(variety, points, fill = variety)) +
geom_jitter(aes(variety, points, shape = variety))
# apllying the func "mean" to points sorted by variety
wine_means = tapply(comb_wine$points, comb_wine$variety, mean)
# removed the non-existent varities (don't know how they are still there)
wine_means[complete.cases(wine_means)]
# apllying the func "median" to points sorted by variety
wine_median = tapply(comb_wine$points, comb_wine$variety, median)
wine_median[complete.cases(wine_median)]
# apllying the func "sd" to points sorted by variety
wine_sd = tapply(comb_wine$points, comb_wine$variety, sd)
wine_sd[complete.cases(wine_sd)]
t.test(points ~ variety, data=comb_wine, var.equal = TRUE)
ts = replicate(1000,t.test(points ~ variety, data=comb_wine, var.equal = TRUE)$statistic)
range(ts)
pts = seq(-4.5,4.5,length=100)
plot(pts,dt(pts,df=18),col='red',type='l')
lines(density(ts))
compare_2_gibbs <- function(y, ind, mu0 = 88.44714, tau0 = 1/(3.03973^2), del0 = 0, gamma0 = 1/(3.03973^2),
a0 = 1, b0 = 50, maxiter = 10000)
{
y1 <- y[ind == 'Chardonnay']
y2 <- y[ind == 'Sauvignon Blanc']
n1 <- length(y1)
n2 <- length(y2)
##### starting values
mu <- (mean(y1) + mean(y2)) / 2
del <- (mean(y1) - mean(y2)) / 2
mat_store <- matrix(0, nrow = maxiter, ncol = 3)
#####
##### Gibbs sampler
an <- a0 + (n1 + n2)/2
for(s in 1 : maxiter)
{
##update tau
bn <- b0 + 0.5 * (sum((y1 - mu - del) ^ 2) + sum((y2 - mu + del) ^ 2))
tau <- rgamma(1, an, bn)
##
##update mu
taun <- tau0 + tau * (n1 + n2)
mun <- (tau0 * mu0 + tau * (sum(y1 - del) + sum(y2 + del))) / taun
mu <- rnorm(1, mun, sqrt(1/taun))
##
##update del
gamman <- tau0 + tau*(n1 + n2)
deln <- ( del0 * tau0 + tau * (sum(y1 - mu) - sum(y2 - mu))) / gamman
del<-rnorm(1, deln, sqrt(1/gamman))
##
## store parameter values
mat_store[s, ] <- c(mu, del, tau)
}
colnames(mat_store) <- c("mu", "del", "tau")
return(mat_store)
}
# contains 5000 rows of 3 columns mu, del and tau (5000 x 3 matrix)
fit1 <- compare_2_gibbs(comb_wine$points, as.factor(comb_wine$variety), maxiter = 20000)
fit <- fit1[ 4 * (1 : 5000), ]
plot(as.mcmc(fit))
acf(fit)
raftery.diag(as.mcmc(fit))
# applying the func "mean" on each columns of the resultant matrix in fit
apply(fit, 2, mean)
# applying the func "sd" on each columns of the resultant matrix in fit
apply(fit, 2, sd)
## easier to interpret standard deviation than precision
mean(1/sqrt(fit[, 3]))
sd(1/sqrt(fit[, 3]))
y1_sim <- rnorm(5000, mean = fit[, 1] + fit[, 2], sd = 1/sqrt(fit[, 3]))
y2_sim <- rnorm(5000, mean = fit[, 1] - fit[, 2], sd = 1/sqrt(fit[, 3]))
ggplot(data.frame(y_sim_diff = y1_sim - y2_sim)) + stat_bin(aes(y_sim_diff), binwidth = 1)
mean(y1_sim > y2_sim)
mean(y2_sim > y1_sim)
ggplot(data.frame(y1_sim, y2_sim)) + geom_point(aes(y1_sim, y2_sim), alpha = 0.3) +
geom_abline(slope = 1, intercept = 0)