-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_splitting_lemmatisation_tokenisation_dfm_creation.R
336 lines (258 loc) · 15.7 KB
/
04_splitting_lemmatisation_tokenisation_dfm_creation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# splitting-lemmatisation-tokenisation-data-feature-matrix-creation ----
# Splitting in segments (with Rainette)
corpus_segm_unlemm_full <- split_segments(corpus_lesson_disc, segment_size = 40)
## Process to create a lemmatized corpus ----
# instructions to create a python virtual environment for spacy
# not needed to run once the environment is created --- keeping it here for archive if needed
# choose the version of python you want
# version <- "3.11.7"
# do a python install with this version
# the function returns the path to the Python executable file
# python_exe <- reticulate::install_python(version)
# create a virtual environment with this python install
# reticulate::virtualenv_create("venvforspacy", python = python_exe)
# install spaCy in it
# unfortunately, using this function spacy_install(), I wasn’t able to specify the environment in which I wanted to install spaCy… so the function didn’t install spaCy in the environment I created just before, it created a new environment (!) with the default name "r-spacyr" and installed spaCy into it… good things: this environment took the python version I decided for the first environment (here 3.11.7); spaCy could be installed with the option [apple] that makes it use the capacities of the GPU of the Apple Silicon chip through Metal Performance Shaders; the language model is the one I wanted.
# spacy_install(version = "apple", lang_models = "fr_dep_news_trf", ask = interactive())
# Lancement de la lemmatisation avec le bon modèle
# new option with a python virtual environment venv, working
spacy_initialize(model = "fr_dep_news_trf", virtualenv = "r-spacyr")
# Création d’une data.table avec le résultat de la lemmatisation et division en mots du corpus
data_table_lemmatisee <- spacy_parse(corpus_segm_unlemm_full, pos = TRUE, tag = FALSE, lemma = TRUE, entity = TRUE, dependency = TRUE, nounphrase = FALSE, multithread = TRUE)
# Terminer l’environnement spaCy
spacy_finalize()
## correct some wrong pos in the lemmatised data
# In data_table_lemmatisee, replace the POS for the lemma "voilà" from "VERB" to "INTJ"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "voilà", "INTJ", pos))
# In data_table_lemmatisee, replace the POS for the lemma "ok" with "ADV"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "ok", "ADV", pos))
# same for "OK"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "OK", "ADV", pos))
# In data_table_lemmatisee, replace the POS for the lemma "ha" with "INTJ"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "ha", "INTJ", pos))
# same for "Ha"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "Ha", "INTJ", pos))
# same for "hm"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "hm", "INTJ", pos))
# same for "hein"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "hein", "INTJ", pos))
# same for "Hein"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "Hein", "INTJ", pos))
# In data_table_lemmatisee, replace the lemma for the token "plait" with "plaire"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "plait", "plaire", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma for the token "plante" with "planter"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "plante" & pos == "VERB", "planter", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma for the token "tourne" with "tourner"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "tourne" & pos == "VERB", "tourner", lemma))
# same for "Tourne"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "Tourne" & pos == "VERB", "tourner", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma for the token "ferme" with "fermer"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "ferme" & pos == "VERB", "fermer", lemma))
# same for "Ferme"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "Ferme" & pos == "VERB", "fermer", lemma))
# In data_table_lemmatisee, replace the lemma for the token "répèter" with "répéter"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "répèter", "répéter", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma "marche" with "marcher"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "marche" & pos == "VERB", "marcher", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma "donne" with "donner"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(token == "donne" & pos == "VERB", "donner", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma "retourne" with "retourner"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "retourne" & pos == "VERB", "retourner", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma "monte" with "monter"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "monte" & pos == "VERB", "monter", lemma))
# In data_table_lemmatisee, when the POS is "VERB", replace the lemma "cherche" with "chercher"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "cherche" & pos == "VERB", "chercher", lemma))
# same with "montre" and "montrer"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "montre" & pos == "VERB", "montrer", lemma))
# same with "joue" and "jouer"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "joue" & pos == "VERB", "jouer", lemma))
# same with "laisse" and "laisser"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(lemma = ifelse(lemma == "laisse" & pos == "VERB", "laisser", lemma))
# In data_table_lemmatisee, replace the POS for the token "-là" with "ADV"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(token == "-là", "ADV", pos))
# In data_table_lemmatisee, replace the POS for the lemma "oup" with "INTJ"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "oup", "INTJ", pos))
# In data_table_lemmatisee, replace the POS for the lemma "merci" with "INTJ"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "merci", "INTJ", pos))
# In data_table_lemmatisee, replace the POS for the lemma "ch" with "SYM"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "ch", "SYM", pos))
# In data_table_lemmatisee, replace the POS for the lemma "2A" with "SYM"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "2A", "SYM", pos))
# same for "2a"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "2a", "SYM", pos))
# same for "2A."
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "2A.", "SYM", pos))
# same for "2a."
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "2a.", "SYM", pos))
# In data_table_lemmatisee, replace the POS for the lemma "1A" with "SYM"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "1A", "SYM", pos))
# same for "1a"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "1a", "SYM", pos))
# same for "1A."
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "1A.", "SYM", pos))
# same for "1a."
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "1a.", "SYM", pos))
# same for "b."
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "b.", "SYM", pos))
# In data_table_lemmatisee, replace the POS for the lemma "nom_d_un_e_élève" with "PROPN"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "nom_d_un_e_élève", "PROPN", pos))
# In data_table_lemmatisee, replace the POS for the lemma "ci" with "ADV"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "ci", "ADV", pos))
# In data_table_lemmatisee, replace the POS for the lemma "cætera" with "CCONJ"
data_table_lemmatisee <- data_table_lemmatisee %>%
mutate(pos = ifelse(lemma == "cætera", "CCONJ", pos))
## tokenisation ----
## create a tokens object unlemmatised with POS for export
tok_segm_unlemm_w_pos_full <- as.tokens(
data_table_lemmatisee,
concatenator = "/",
include_pos = c("pos"),
use_lemma = FALSE
)
## create a tokens object lemmatised with POS for export
tok_segm_lemm_w_pos_full <- as.tokens(
data_table_lemmatisee,
concatenator = "/",
include_pos = c("pos"),
use_lemma = TRUE
)
## create a tokens object keeping from the preceeding one only the nouns, adjectives and verbs (makes it partial)
tok_segm_lemm_part <- tok_segm_lemm_w_pos_full |>
tokens_select(
c("/NOUN$", "/AJD$", "/VERB$"),
valuetype = "regex"
) |>
tokens_split("/") |>
tokens_remove(
c("NOUN", "ADJ", "VERB"),
valuetype = "fixed",
case_insensitive = FALSE
)
## option to take all the lemmatised tokens without POS
## create a tokens object for the lesson analysis (lemmatised without POS)
# tok_segm_lemm_part <- as.tokens(
# data_table_lemmatisee,
# concatenator = "/",
# include_pos = c("none"),
# use_lemma = TRUE
# )
# for the record, display the list of tokens for one document in the tokens object
# as.list(tok_segm_lemm_part)[1]
# splitter aux apostrophes, aux parenthèses, aux slash et aux tirets
# tok_segm_lemm_part <- tokens_split(tok_segm_lemm_part,"'")
# tok_segm_lemm_part <- tokens_split(tok_segm_lemm_part,"(")
# tok_segm_lemm_part <- tokens_split(tok_segm_lemm_part,"/")
# tok_segm_lemm_part <- tokens_split(tok_segm_lemm_part,"-")
# tok_segm_lemm_part <- tokens_split(tok_segm_lemm_part,"’")
# tokenize and remove punctuation, symbols, numbers and urls
tok_segm_lemm_part <- tokens(tok_segm_lemm_part, remove_punct = TRUE, remove_numbers = TRUE)
# create docvars for the full corpus ---
## add variables from classes as docvars
docvars(corpus_segm_unlemm_full, "class_id") <- this_class_id
docvars(corpus_segm_unlemm_full, "pupils_number") <- pupils_number
docvars(corpus_segm_unlemm_full, "school_level") <- school_level
# add variables from lessons as docvars
docvars(corpus_segm_unlemm_full, "lesson_id") <- this_lesson_id
docvars(corpus_segm_unlemm_full, "lesson_topic") <- lesson_topic
docvars(corpus_segm_unlemm_full, "programming_type") <- programming_type
docvars(corpus_segm_unlemm_full, "number_of_teachers") <- number_of_teachers
# add variables from discourses as docvars
docvars(corpus_segm_unlemm_full, "discourse_id") <- first_discourse_id
docvars(corpus_segm_unlemm_full, "nth_time_teacher_taught_lesson") <- nth_time_first_teacher_taught_lesson
## add variables from teachers as docvars ----
# role_in_project
docvars(corpus_segm_unlemm_full, "role_in_project") <- teachers %>% filter(teacher_id == first_teacher) %>% select(role_in_project) %>% as.character()
# gender
docvars(corpus_segm_unlemm_full, "gender") <- teachers %>% filter(teacher_id == first_teacher) %>% select(gender) %>% as.character()
# age_range
docvars(corpus_segm_unlemm_full, "age_range") <- teachers %>% filter(teacher_id == first_teacher) %>% select(age_range) %>% as.character()
# teaching_level
docvars(corpus_segm_unlemm_full, "teaching_level") <- teachers %>% filter(teacher_id == first_teacher) %>% select(teaching_level) %>% as.character()
# number_of_disciplines
docvars(corpus_segm_unlemm_full, "number_of_disciplines") <- teachers %>% filter(teacher_id == first_teacher) %>% select(number_of_disciplines) %>% as.character()
# disciplines
docvars(corpus_segm_unlemm_full, "disciplines") <- teachers %>% filter(teacher_id == first_teacher) %>% select(disciplines) %>% as.character()
# teaching_experience
docvars(corpus_segm_unlemm_full, "teaching_experience") <- teachers %>% filter(teacher_id == first_teacher) %>% select(teaching_experience) %>% as.numeric()
# teaching_experience_range
docvars(corpus_segm_unlemm_full, "teaching_experience_range") <- teachers %>% filter(teacher_id == first_teacher) %>% select(teaching_experience_range) %>% as.character()
# cs_teaching_experience
docvars(corpus_segm_unlemm_full, "cs_teaching_experience") <- teachers %>% filter(teacher_id == first_teacher) %>% select(cs_teaching_experience) %>% as.numeric()
# cs_teaching_experience_range
docvars(corpus_segm_unlemm_full, "cs_teaching_experience_range") <- teachers %>% filter(teacher_id == first_teacher) %>% select(cs_teaching_experience_range) %>% as.character()
# teacher_degree
docvars(corpus_segm_unlemm_full, "teacher_degree") <- teachers %>% filter(teacher_id == first_teacher) %>% select(teacher_degree) %>% as.character()
# pre_or_in_service
docvars(corpus_segm_unlemm_full, "pre_or_in_service") <- teachers %>% filter(teacher_id == first_teacher) %>% select(pre_or_in_service) %>% as.character()
# teacher_education_type
docvars(corpus_segm_unlemm_full, "teacher_education_type") <- teachers %>% filter(teacher_id == first_teacher) %>% select(teacher_education_type) %>% as.character()
# cs_education_institution
docvars(corpus_segm_unlemm_full, "cs_education_institution") <- teachers %>% filter(teacher_id == first_teacher) %>% select(cs_education_institution) %>% as.character()
# cs_education_type
docvars(corpus_segm_unlemm_full, "cs_education_type") <- teachers %>% filter(teacher_id == first_teacher) %>% select(cs_education_type) %>% as.character()
# highest_diploma
docvars(corpus_segm_unlemm_full, "highest_diploma") <- teachers %>% filter(teacher_id == first_teacher) %>% select(highest_diploma) %>% as.character()
# TPACK_score_didactical_competencies
docvars(corpus_segm_unlemm_full, "TPACK_score_didactical_competencies") <- teachers %>% filter(teacher_id == first_teacher) %>% select(TPACK_score_didactical_competencies) %>% as.character()
# TPACK_score_content_and_technology_foundations
docvars(corpus_segm_unlemm_full, "TPACK_score_content_and_technology_foundations") <- teachers %>% filter(teacher_id == first_teacher) %>% select(TPACK_score_content_and_technology_foundations) %>% as.character()
# make docnames unique for further export
docnames(corpus_segm_unlemm_full) <- docnames(corpus_segm_unlemm_full) %>% paste0(this_lesson_id, "_", .)
# make segment_source unique for further export
docvars(corpus_segm_unlemm_full)$segment_source <- docvars(corpus_segm_unlemm_full)$segment_source %>% paste0(this_lesson_id, "_", .)
# copy docvars to all tokens and corpus objects for export ----
# copy docvars from corpus_segm_unlemm_full to tok_segm_unlemm_w_pos_full
docvars(tok_segm_unlemm_w_pos_full) <- docvars(corpus_segm_unlemm_full)
# make docnames unique for further export
docnames(tok_segm_unlemm_w_pos_full) <- docnames(tok_segm_unlemm_w_pos_full) %>% paste0(this_lesson_id, "_", .)
# copy docvars from corpus_segm_unlemm_full to tok_segm_lemm_w_pos_full
docvars(tok_segm_lemm_w_pos_full) <- docvars(corpus_segm_unlemm_full)
# make docnames unique for further export
docnames(tok_segm_lemm_w_pos_full) <- docnames(tok_segm_lemm_w_pos_full) %>% paste0(this_lesson_id, "_", .)
# copy docvars from corpus_segm_unlemm_full to tok_segm_lemm_part
docvars(tok_segm_lemm_part) <- docvars(corpus_segm_unlemm_full)
# make docnames unique for further export
docnames(tok_segm_lemm_part) <- docnames(tok_segm_lemm_part) %>% paste0(this_lesson_id, "_", .)
# create data-feature matrix ----
# create a dfm with the lemmatised and partial tokenisation
dfm_segm_lemm_part <- dfm(tok_segm_lemm_part)
# for the record, display the full list of features in the dfm
# dfm_segm_lemm_part@Dimnames[["features"]]