-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathggplot_2.R
368 lines (284 loc) · 11.4 KB
/
ggplot_2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# Simulate a small data set for the first plotting example.
# NOTE: the script no longer calls rm(list = ls()). That call silently deletes
# every object in the caller's workspace, yet leaves loaded packages and
# options untouched; restart the R session when a clean slate is needed.
set.seed(123) # fixed seed so the simulated values are reproducible
my_data <- data.frame(
  height = rnorm(100),
  weight = rnorm(100),
  group = sample(c("A", "B"), 100, replace = TRUE)
)
# Print the simulated data
my_data
library(ggplot2)
# Scatter plot of weight against height with a linear fit per group,
# drawn in one panel per group.
ggplot(data = my_data,
       mapping = aes(x = height, y = weight, color = group)) +
  # The constant `color` given here overrides the `color = group` mapping for
  # the points; the smoothing lines still inherit the mapping.
  geom_point(color = "red", # colour of the dots
             shape = 8,     # shape of the dots
             alpha = 0.8,   # opacity of the dots (0 = transparent, 1 = opaque)
             size = 2) +    # size of the dots
  geom_smooth(method = "lm",
              se = TRUE,       # draw the shaded band around each line
              linetype = 1,    # type of the line (1 = solid)
              linewidth = 2) + # width of the line
  labs(title = "Relationship between height and weight",
       subtitle = "US Census Data 2013",
       caption = "source: https://gauaban.abahan.com",
       x = " height (cm)",
       y = "weight (g)",
       color = "group") +
  # Alternative: facet_wrap(~group)
  facet_wrap(~group,
             scales = "free",             # each panel gets its own axis scales
             strip.position = "bottom") + # group names go below the panels
  # The `+` above was missing in the original, so theme() was evaluated as a
  # separate statement and never applied to the plot.
  # Alternative complete themes: theme_minimal(), theme_classic(),
  # theme_light(), theme_dark(), theme_void()
  theme(
    axis.title.x = element_text(color = "blue"), # colour of the x-axis title
    axis.title.y = element_text(color = "red"),  # colour of the y-axis title
    plot.title = element_text(size = 15)
  )
# Simulate 100 standard-normal values, each tagged with a random category.
# The draws happen inside local() so that only `data` is left in the workspace.
set.seed(123)
data <- local({
  n_obs <- 100
  values <- rnorm(n_obs)
  category <- sample(c("A", "B", "C"), size = n_obs, replace = TRUE)
  data.frame(values = values, category = category)
})
# Load ggplot2 library
library(ggplot2)
# Dodged histogram of the simulated values, one set of bars per category.
ggplot(data, aes(values, fill = category)) +
  geom_histogram(
    binwidth = 0.5,
    position = "dodge",
    colour = "red",
    alpha = 0.9
  ) +
  ggtitle("Histogram") +
  xlab("Values") +
  ylab("Frequency") +
  theme_minimal() +
  # Tweaks must follow theme_minimal(), which replaces the whole theme.
  theme(plot.title = element_text(size = 15, hjust = 0.5))
# Line chart of the simulated values in row order, one line per category.
ggplot(data, aes(x = seq_along(values), y = values,
                 group = category, color = category)) +
  # alpha is an opacity and must lie in [0, 1]; the original value of 6 was
  # out of range, 0.6 is assumed to be what was intended.
  geom_line(alpha = 0.6,
            linetype = 4) + # line type 4 = dot-dash
  labs(title = "Line Chart", x = "Index", y = "Values") +
  theme_minimal()
# Download the insurance data set.
url <- "https://tinyurl.com/mtktm8e5"
insurance <- read.csv(file = url)
# Flag each row as "obese" when its bmi is 30 or more, "not obese" otherwise.
insurance[["obese"]] <- ifelse(
  insurance[["bmi"]] >= 30,
  "obese",
  "not obese"
)
# Step 1: data and aesthetic mapping only; no layer is added yet.
library(ggplot2)
ggplot(insurance, aes(age, expenses))
# Step 2: add a point layer.
ggplot(insurance, aes(age, expenses)) +
  geom_point()
# Step 3: blue, larger, semi-transparent points.
ggplot(insurance, aes(age, expenses)) +
  geom_point(
    size = 2,                  # size of the dots
    alpha = 0.8,               # opacity of the dots
    colour = "cornflowerblue"
  )
# Step 4: overlay a linear line of best fit.
ggplot(insurance, aes(age, expenses)) +
  geom_point(
    size = 2,
    alpha = .5,
    colour = "cornflowerblue"
  ) +
  geom_smooth(method = "lm")
# Indicate smoker status using color. The mapping is set in ggplot(), so both
# the points and the fitted lines are split by `smoker`.
ggplot(data = insurance,
       mapping = aes(x = age,
                     y = expenses,
                     color = smoker)) +
  geom_point(alpha = .5,
             size = 2) +
  geom_smooth(method = "lm",
              se = TRUE,       # include the shaded area around the lines
              linewidth = 0.8) # line width; `size` is deprecated for lines
# Modify the x and y axes and specify the colors to be used.
ggplot(data = insurance,
       mapping = aes(x = age,
                     y = expenses,
                     color = smoker)) +
  geom_point(alpha = .5,
             size = 2) +
  geom_smooth(method = "lm",
              se = FALSE,
              linewidth = 1.5) + # line width; `size` is deprecated for lines
  scale_x_continuous(breaks = seq(0, 70, 10)) +
  # The argument is spelled out as `labels`; the original `label` only worked
  # through partial argument matching.
  scale_y_continuous(breaks = seq(0, 60000, 20000),
                     labels = scales::dollar) +
  scale_color_manual(values = c("indianred3",
                                "cornflowerblue"))
# Reproduce the plot separately for obese and non-obese individuals:
# facet_wrap() draws one panel per value of `obese`.
ggplot(data = insurance,
       mapping = aes(x = age,
                     y = expenses,
                     color = smoker)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm",
              se = FALSE) +
  scale_x_continuous(breaks = seq(0, 70, 10)) +
  # The argument is spelled out as `labels`; the original `label` only worked
  # through partial argument matching.
  scale_y_continuous(breaks = seq(0, 60000, 20000),
                     labels = scales::dollar) +
  scale_color_manual(values = c("indianred3",
                                "cornflowerblue")) +
  facet_wrap(~obese)
# Add informative labels: title, subtitle, caption, axis titles and the
# legend title for the color scale.
ggplot(data = insurance,
       mapping = aes(x = age,
                     y = expenses,
                     color = smoker)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm",
              se = FALSE) +
  scale_x_continuous(breaks = seq(0, 70, 10)) +
  # The argument is spelled out as `labels`; the original `label` only worked
  # through partial argument matching.
  scale_y_continuous(breaks = seq(0, 60000, 20000),
                     labels = scales::dollar) +
  scale_color_manual(values = c("indianred3",
                                "cornflowerblue")) +
  facet_wrap(~obese) +
  labs(title = "Relationship between patient demographics and medical costs",
       subtitle = "US Census Bureau 2013",
       caption = "source: http://mosaic-web.org/",
       x = " Age (years)",
       y = "Annual expenses",
       color = "Smoker?")
# Use a minimalist theme on the fully labelled, faceted plot.
ggplot(data = insurance,
       mapping = aes(x = age,
                     y = expenses,
                     color = smoker)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm",
              se = FALSE) +
  scale_x_continuous(breaks = seq(0, 70, 10)) +
  # The argument is spelled out as `labels`; the original `label` only worked
  # through partial argument matching.
  scale_y_continuous(breaks = seq(0, 60000, 20000),
                     labels = scales::dollar) +
  scale_color_manual(values = c("indianred3",
                                "cornflowerblue")) +
  facet_wrap(~obese) +
  labs(title = "Relationship between age and medical expenses",
       subtitle = "US Census Data 2013",
       caption = "source: https://github.com/dataspelunking/MLwR",
       x = " Age (years)",
       y = "Medical Expenses",
       color = "Smoker?") +
  theme_minimal()
# Place the color mapping in geom_point() instead of ggplot(): only the points
# are colored by `smoker`, and geom_smooth() does not inherit that mapping.
ggplot(insurance,
       aes(x = age,
           y = expenses)) +
  geom_point(aes(color = smoker),
             alpha = .5,
             size = 2) +
  geom_smooth(method = "lm",
              se = FALSE,
              linewidth = 1.5) # line width; `size` is deprecated for lines
# Build a scatter plot and keep it in `myplot` so it can be reused.
myplot <- ggplot(insurance, aes(age, expenses)) +
  geom_point()
# Typing the object name at top level draws the plot.
myplot
# Add a second point layer (larger, blue) on top of the first one,
# save the result back into `myplot`, then draw it.
myplot <- myplot + geom_point(colour = "blue", size = 2)
myplot
# Draw a variant with a title and a line of best fit;
# the result is not assigned, so `myplot` itself is unchanged.
myplot +
  labs(title = "Mildly interesting graph") +
  geom_smooth(method = "lm")
# Draw a variant with the black-and-white theme; again not saved.
myplot + theme_bw()
# Second part of the script: explore the CIP data set.
# NOTE: the script no longer calls rm(list = ls()). That call silently deletes
# every object in the caller's workspace, yet leaves loaded packages and
# options untouched; restart the R session when a clean slate is needed.
# Load libraries
library(tidyverse)
# Load data; the string "NA" in the file is read as a missing value
CIP <- read_csv("./data/CIP.csv", na = "NA")
# Initial exploration of the dataset 'CIP'
glimpse(CIP)
# Display the contents of the 'CIP' dataset in a spreadsheet-like view
View(CIP)
# Per-plot averages, ignoring missing values:
# total root weight
mean(CIP[["trw"]], na.rm = TRUE)
# weight of non-commercial storage roots
mean(CIP[["ncrw"]], na.rm = TRUE)
# weight of commercial storage roots
mean(CIP[["crw"]], na.rm = TRUE)
# Attach the number of rows per 'trial' as column `n`, then show the distinct
# combinations of 'trial', 'harvest' and that count in the viewer.
CIP %>%
  add_count(trial) %>%
  select(trial, harvest, n) %>%
  distinct() %>%
  View()
# Turn the 'release' column into a factor.
CIP[["release"]] <- factor(CIP[["release"]])
# Summary statistics for every variable in 'CIP'.
summary(CIP)
# Scatter plot of vine weight vs. weight of commercial storage roots (ggplot2).
# NOTE(review): the original plotted `nocr` on the y axis although the comment,
# title and axis label all describe commercial root *weight*, which this script
# reads from `crw` when it computes the mean earlier. `crw` is used here so the
# data match the labels; confirm against the CIP data dictionary.
ggplot(CIP, aes(x = vw, y = crw)) +
  geom_point() +
  labs(title = "Scatter Plot of Vine Weight vs Commercial Root Weight",
       x = "Weight of vines per plot (kg)",
       y = "Weight of Commercial storage roots per plot (kg)")
# The same scatter plot using base R graphics.
plot(CIP$vw, CIP$crw,
     xlab = "Weight of vines per plot (kg)",
     ylab = "Weight of Commercial storage roots per plot (kg)",
     pch = 19, col = "blue")
# Commercial root weight (x) against total root weight (y), points only.
ggplot(CIP, aes(crw, trw)) +
  labs(
    title = "Scatter Plot of Commercial vs Total Root Weight",
    x = "Weight of Commercial roots (kg)",
    y = "Weight of total roots (kg)"
  ) +
  geom_point()
# The same scatter plot with a linear model fit drawn over the points.
ggplot(CIP, aes(crw, trw)) +
  labs(
    title = "Scatter Plot with Linear Fit: Commercial vs Total Root Weight",
    x = "Weight of Commercial roots (kg)",
    y = "Weight of total roots (kg)"
  ) +
  geom_point() +
  geom_smooth(method = "lm")
# Bar plot of genotype vs. mean total root weight per plot.
# `fun = "mean"` states the summary explicitly; without it the summary stat
# falls back to its default (mean_se()) and prints a message saying so.
ggplot(CIP, aes(x = geno, y = trw)) +
  geom_bar(stat = "summary", fun = "mean") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # slanted labels
  labs(title = "Mean Total Root Weight per Genotype",
       x = "Genotype", y = "Mean of total root weight per plot")
# Faceted bar plot of genotype vs. mean commercial root weight,
# one panel per trial.
ggplot(CIP, aes(x = geno, y = crw)) +
  facet_wrap(~trial) +
  geom_bar(stat = "summary", fun = "mean") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # slanted labels
  labs(title = "Mean Commercial Root Weight per Genotype Across Trials",
       x = "Genotype", y = "Mean of commercial root weight per plot")
# Histogram of total root weight per plot.
ggplot(CIP, aes(x = trw)) +
  geom_histogram(bins = 10, fill = "green", color = "black") +
  labs(title = "Histogram of Total Root Weight per Plot",
       x = "Total weight per plot")
# Faceted histogram of total root weight per plot, one panel per trial.
# The original mapped `crw` here although the comment, title and axis label
# all describe total root weight; `trw` matches the un-faceted plot above.
ggplot(CIP, aes(x = trw)) +
  facet_wrap(~trial) +
  geom_histogram(bins = 10, fill = "green", color = "black") +
  labs(title = "Faceted Histogram of Total Root Weight per Plot",
       x = "Total weight per plot")
# Boxplot of commercial root weight for each genotype.
ggplot(CIP, aes(geno, crw)) +
  geom_boxplot(fill = "orange") +
  labs(
    title = "Boxplot of Commercial Root Weight by Genotype",
    y = "Commercial root weight"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# The same boxplot split into one panel per trial.
ggplot(CIP, aes(geno, crw)) +
  geom_boxplot(fill = "orange") +
  facet_wrap(~trial) +
  labs(
    title = "Boxplot of Commercial Root Weight by Genotype",
    y = "Commercial root weight"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Exercises: fix the following code (there are errors)
# NOTE(review): two corrections are applied below.
#   1. The first plot used `nocr` under labels that describe commercial root
#      weight; `crw` is the column this script treats as that weight. Confirm
#      against the CIP data dictionary.
#   2. The bar plot now names its summary function instead of relying on the
#      default of stat = "summary".

# Scatter plot of vine weight vs. commercial root weight
ggplot(CIP, aes(x = vw, y = crw)) +
  geom_point() +
  labs(title = "Scatter Plot of Vine Weight vs Commercial Root Weight",
       x = "Weight of vines per plot (kg)",
       y = "Weight of Commercial storage roots per plot (kg)")

# Bar plot of mean total root weight per genotype
ggplot(CIP, aes(x = geno, y = trw)) +
  geom_bar(stat = "summary", fun = "mean") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Genotype", y = "Mean of total root weight per plot")

# Histogram of total root weight per plot
ggplot(CIP, aes(x = trw)) +
  geom_histogram(bins = 10, fill = "green", color = "black") +
  labs(title = "Histogram of trw", x = "Total weight per plot")