-
Notifications
You must be signed in to change notification settings - Fork 1
/
carabids_02_EDA.R
337 lines (287 loc) · 12.5 KB
/
carabids_02_EDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# This script explores the differences in species ED between the parataxonomist, expert taxonomist, and barcode datasets on NEON ground beetles, then visualizes the data
# Para vs expert comparison written by Max
library(neonUtilities)
library(dplyr)
library(stringr)
library(ggplot2)
library(forcats) #fct_reorder
library(arm) #display
# Load carabid data
load(file="data_raw/beetles/carabids_NIWO.Rdata")
list2env(carabids_NIWO, .GlobalEnv)
load(file="data_raw/beetles/carabids_barcode_NIWO.Rdata")
list2env(carabids_barcode_NIWO, .GlobalEnv)
### First compare barcode and expert taxonomist
# pull out barcode and expert taxonomist data
barcode <- bet_BOLDtaxonomy %>%
mutate(individualID = sampleID, bc_sciname = NA) %>%
as_tibble
expert <- bet_expertTaxonomistIDProcessed %>%
mutate(expert_sciname = paste(genus, specificEpithet)) %>%
as_tibble
# Some barcode species are subspecies. Remove subspecies, leaving genus/species
for (i in 1:nrow(barcode)) {
if (sapply(strsplit(barcode$species[i]," "),length) == 2) {
barcode$bc_sciname[i] <- barcode$species[i]
} else {
barcode$bc_sciname[i] <- paste(word(barcode$species[i], 1:2),collapse=" ")
}
}
# AIS couldn't get this to work in simple ifelse format: barcode %>% mutate(bc_sciname = ifelse( sapply(strsplit(species," "),length) == 2, species, paste(word(species, 1:2),collapse=" ")))
# Join barcode and expert data
bc_exp_taxon_df <- expert %>%
select(individualID, expert_sciname) %>%
left_join(distinct(barcode, individualID, bc_sciname))
# What percentage of barcode IDs match expert IDs
bc_exp_taxon_df %>%
filter(!is.na(bc_sciname)) %>% #samples that expert IDed and where para identified to species
mutate(same_species = expert_sciname == bc_sciname) %>%
group_by(expert_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
#100%!
### Now compare barcode/expert to parataxonomist
# pull out parataxonomist data
para <- bet_parataxonomistID %>%
mutate(para_sciname = scientificName) %>%
select(individualID, morphospeciesID, taxonRank, para_sciname) %>%
as_tibble
# Join para and expert/barcode data, and evaluate discrepancies in species ids
taxon_df <- left_join(para, bc_exp_taxon_df)
# para_sciname and expert_sciname don't always match
taxon_df %>%
filter(taxonRank == "species") %>%
summarize(mean(para_sciname == expert_sciname, na.rm = TRUE))
#0.97 match between specimen identified by expert taxonomist with parataxonomist ID
# Are any species perfectly identified?
taxon_df %>%
filter(taxonRank == "species", !is.na(expert_sciname)) %>% #samples that expert IDed and where para identified to species
mutate(same_species = para_sciname == expert_sciname) %>%
group_by(para_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
# In all but one case (Cymindis cribricollis), the para and expert IDs were either 0% or 100% matching
# look at some of the discrepancies
taxon_df %>%
filter(taxonRank == "species",
para_sciname != expert_sciname) %>%
select(individualID, para_sciname, expert_sciname) %>%
arrange(para_sciname) %>%
data.frame
# Three mismatches checked with taxize pckg had different taxon serial numbers
# Plot discrepancies for samples identified to the species level
taxon_df %>%
filter(taxonRank == "species", !is.na(expert_sciname)) %>%
count(para_sciname, expert_sciname) %>%
arrange(n) %>%
mutate(discrepancy = para_sciname != expert_sciname) %>%
ggplot(aes(x = para_sciname,
y = expert_sciname,
color = discrepancy)) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
xlab("Species ID from parataxonomist") +
ylab("Species ID from expert taxonomist") +
scale_color_manual(values = c("black", "red"))
# Plot morphospecies IDs with expert IDs:
# all but one morphospecies map to one expert species ID.
# the D13.2016.MorphBU morphospecies is a mixture of two species
taxon_df %>%
filter(!is.na(morphospeciesID), !is.na(expert_sciname)) %>% #samples where morphospeciesID and expert ID exists,
count(morphospeciesID, expert_sciname) %>%
ggplot(aes(x = fct_reorder(morphospeciesID, n),
y = fct_reorder(expert_sciname, n))) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Morphospecies from parataxonomist") +
ylab("Species ID from expert taxonomist")
# How many morphospecies occur with each parataxonomist ID?
# note: if the parataxonomist IDed to species, then no morphospecies was assigned
taxon_df %>%
group_by(para_sciname) %>%
summarize(morpho_n = n_distinct(morphospeciesID, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
# 3 para ID's had more than one morphospecies
# Harpalus sp. (para ID) had 6 morphospecies and Amara sp. (para ID) had 9
# Doesn't seem like using the morphospeciesID is helpful
# Plot discrepancies for all samples, excluding para ID == expert ID
taxon_df %>%
count(para_sciname, expert_sciname) %>%
arrange(n) %>%
mutate(discrepancy = para_sciname != expert_sciname) %>%
filter(discrepancy == TRUE) %>%
ggplot(aes(x = para_sciname,
y = expert_sciname)) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
xlab("Species ID from parataxonomist") +
ylab("Species ID from expert taxonomist") +
scale_color_manual(values = c("red"))
# Visualize those species with 100% match between para and expert IDs and with n>20
select_spp <- taxon_df %>%
mutate(same_species = para_sciname == expert_sciname) %>%
group_by(para_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE), #number of individuals for this species where para = expert where expert exists
n = n()) %>% #number of individuals with para_sciname
filter(pct_correct == 1, n > 20) %>%
pull(para_sciname)
# Plot abundance of selected Niwot species over time
taxon_df %>%
full_join(bet_parataxonomistID %>%
select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
ggplot() +
geom_bar(aes(x = collectDate, fill = para_sciname)) +
facet_wrap(. ~ para_sciname)
# Plot taxa by year (thanks SN)
taxon_df %>%
full_join(bet_parataxonomistID %>%
select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year) %>%
summarize(n=n()) %>%
ggplot() +
geom_line(aes(x = year, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = year, y = n, colour = para_sciname)) +
xlab("Collection Year")
# Pterostichus restrictus came out of nowhere!
# Plot taxa by month through the years
taxon_df %>%
full_join(bet_parataxonomistID %>%
select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year, month) %>%
summarize(n=n()) %>%
ggplot() +
geom_line(aes(x = month, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = month, y = n, colour = para_sciname)) +
xlab("Collection Month") +
facet_grid(year ~ .)
# July has the most abundance, but may have the most collection dates, too
# Plot taxa through time by collection day
taxon_df %>%
full_join(bet_parataxonomistID %>%
select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year, month, day) %>%
summarize(n=n()) %>%
ggplot() +
#geom_line(aes(x = day, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = day, y = n, colour = para_sciname)) +
xlab("Collection Day") +
facet_grid(year ~ month)
# Plot spatial arrangement of plots, labeled with habitat
# add lat and long poitns
bet_fielddata %>%
select(plotID, nlcdClass, decimalLatitude, decimalLongitude) %>%
ggplot() +
geom_point(aes(x = decimalLongitude, y = decimalLatitude, colour = nlcdClass))
# cool variables to look at: bet_fielddata$nativestatuscode
bet_parataxonomistID %>%
filter(scientificName %in% select_spp) %>%
select(scientificName, nativeStatusCode) %>%
summarize(mean(nativeStatusCode == "N") )
### Modeling EDA
# What variables might be related to carabid abundance?
carabid_df <- taxon_df %>%
full_join(bet_parataxonomistID %>%
select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
left_join(distinct(bet_fielddata %>%
select(plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation))) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
mutate_at(c("morphospeciesID", "taxonRank", "para_sciname", "expert_sciname", "bc_sciname", "plotID", "trapID", "nlcdClass"), funs(factor(.)))
# Group data by plot and species and summarize by species abundance
model_test_abund <- carabid_df %>%
group_by(plotID, trapID, para_sciname, nlcdClass) %>% #para_sciname
summarize(sp_abund = n() ) %>%
arrange(plotID) %>%
data.frame()
model_test_comp <- carabid_df %>%
group_by(plotID, nlcdClass) %>%
summarize(sp_comp = n_distinct(para_sciname, na.rm = TRUE) ) %>%
arrange(plotID) %>%
data.frame()
# AIS later, try grouping by collection date and trapID
# Is habitat type a good predictor for species abundance?
# Constant
fit_abund_0 <- glm(formula = sp_abund ~ 1,
family = poisson(link="log"),
data = model_test_abund)
# With habitat type
fit_abund_1 <- glm(formula = sp_abund ~ nlcdClass,
family = poisson(link="log"),
data = model_test_abund)
fit_abund_1 <- glmer(formula = sp_abund ~ nlcdClass + para_sciname + (1|plotID) ,
family = poisson(link="log"),
data = model_test_abund)
display(fit_abund_0)
display(fit_abund_1)
anova(fit_abund_0, fit_abund_1)
# Is habitat type a good predictor for species composition?
# Constant
fit_comp_0 <- glm(formula = sp_comp ~ 1,
family = poisson(link="log"),
data = model_test_comp)
# With habitat type
fit_comp_1 <- glm(formula = sp_comp ~ nlcdClass,
family = poisson(link="log"),
data = model_test_comp)
display(fit_comp_0)
display(fit_comp_1)
# Classwork exploring woody veg structure
carabid_plotIDs <- unique(carabid_df$plotID)
vst_shrubgroup %>% filter(plotID %in% carabid_plotIDs)
### SCRATCH ###
# How to decide on the final species ID to use?
# Class notes
# exclude species that are intermittent
# Pseudocode
# for individuals that have expertID,
# If paraID matches expertID
# then assign expertID to all individuals in para with that paraID (supported by 100% match of expert and barcode ID)
# else if expertID does NOT match paraID
# then still take expertID?
# for individuals that do not have expertID
# if their paraID was matched to an expertID elsewhere in the df, take the expertID
# if their paraID has no expert match, then assign NA to final_sciname and don't get analyzed?
matched_species <- taxon_df %>%
filter(!is.na(expert_sciname)) %>%
select(para_sciname, expert_sciname) %>%
distinct() %>%
arrange(expert_sciname) %>%
data.frame
taxon_df <- taxon_df %>%
mutate(same_species = para_sciname == expert_sciname,
final_sciname = NA)
# for each row where same_species is T, use expertID on all other rows with matching paraIDs
for (i in 1:nrow(taxon_df)) {
if (!is.na(taxon_df$expert_sciname[i])) {
taxon_df$final_sciname[i] <- taxon_df$expert_sciname[i]
} else {
if (taxon_df$para_sciname[i] %in% matched_species$para_sciname) {
# if their paraID was matched to an expertID elsewhere in the df, take the expertID. Here we risk the chance that the parataxonomist identified two different species as one (e.g. para ID for Amara sp. matches 6 expert IDs)
}
}
}
# AIS how to make this tidy?