-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07_alt_creation_and_ca.R
224 lines (175 loc) · 9.57 KB
/
07_alt_creation_and_ca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# aggregated-lexical-table-creation ----
## rename-some-factors ----
# address to
docvars(dfm_segm_lemm_part_wo_na) <- docvars(dfm_segm_lemm_part_wo_na) %>%
mutate(To = case_when(
recipient == "class" ~ "recipient_class",
recipient == "pupil" ~ "recipient_pupil",
recipient == "teacher" ~ "recipient_teacher"
)) %>% mutate(To = as.factor(recipient))
## group-documents-by-variables ----
# Create dfm by grouping around one variable — sums the number of segments containing each word for every modality of the variable
dfm_recipient <- dfm_group(dfm_segm_lemm_part_wo_na, docvars(dfm_segm_lemm_part_wo_na)$recipient)
# Convert to data.frame
df_recipient <- convert(dfm_recipient, "data.frame")
# Convert column doc_id into rownames
df_recipient <- df_recipient %>% remove_rownames %>% column_to_rownames(var="doc_id")
# Create dfm by grouping around one variable — sums the number of segments containing each word for every modality of the variable
dfm_time_slice <- dfm_group(dfm_segm_lemm_part_wo_na, docvars(dfm_segm_lemm_part_wo_na)$time_slice)
# Convert to data.frame
df_time_slice <- convert(dfm_time_slice, "data.frame")
# Convert column doc_id into rownames
df_time_slice <- df_time_slice %>% remove_rownames %>% column_to_rownames(var="doc_id")
# Create dfm by grouping around one variable — sums the number of segments containing each word for every modality of the variable
dfm_cluster <- dfm_group(dfm_segm_lemm_part_wo_na, docvars(dfm_segm_lemm_part_wo_na)$lesson_cluster_id)
# Convert to data.frame
df_cluster <- convert(dfm_cluster, "data.frame")
# Convert column doc_id into rownames
df_cluster <- df_cluster %>% remove_rownames %>% column_to_rownames(var="doc_id")
# group all dfm into one
tableau_lexical_questions <- rbind(df_recipient, df_time_slice)
tableau_lexical_questions <- rbind(tableau_lexical_questions, df_cluster)
# Order lexicon decreasing to view high and low values
tableau_lexical_questions <- tableau_lexical_questions[,order(colSums(tableau_lexical_questions), decreasing = TRUE)]
## add-clusters-to-alt-and-compute-mca ----
### group small clusters ----
# If I don’t want to keep a fixed number of clusters, but use a threshold value of number of segments in the cluster under which we group them, this is the way to do it
# How to decide the threshold? Clusters including more than 1% of segments of the corpus represent 12 clusters. We decide to regroup clusters under 1% of the corpus.
# cluster_threshold <- sum(cluster_nb_of_segments)/25
# cluster_nb_of_segments_under_threshold <- cluster_nb_of_segments[cluster_nb_of_segments < cluster_threshold]
# get the names of the small clusters to regroup
# clusters_to_regroup <- names(cluster_nb_of_segments_under_threshold)
# And this is the way to go if I decide to regroup all but the biggest clusters (fixed number of 10)
# create a vector with a list of all clusters numbers, depending on the number of clusters defined at the beginning
clusters_numbers_list <- 1:number_of_lesson_clusters
# create a vector for the small clusters, as a diff between the list of all clusters and the biggest clusters
smallest_clusters <- setdiff(clusters_numbers_list, biggest_clusters)
# group small clusters from ALT into a new row with the sum of their values
# Create a df with sum of all small clusters
sum_small_clusters <- tableau_lexical_questions %>% filter(rownames(tableau_lexical_questions) %in% smallest_clusters) %>% summarise(across(everything(), sum))
# rename the unique row of this dataframe
rownames(sum_small_clusters) <- "small_clusters"
# remove the rows corresponding to small clusters in the generalised aggregated lexical table
tableau_lexical_questions <- tableau_lexical_questions[!(row.names(tableau_lexical_questions) %in% smallest_clusters),]
# add the row with the sum of small clusters
tableau_lexical_questions <- rbind(tableau_lexical_questions, sum_small_clusters)
# rename clusters if they have been analysed
if (clusters_renamed_yes) {
# extract rownames into a column
tableau_lexical_questions <- rownames_to_column(tableau_lexical_questions)
# rename it for merge
colnames(tableau_lexical_questions)[1] <- "lesson_cluster_id"
# merge with the biggest clusters df into another df
tbl_lex_quest_merged <- merge(tableau_lexical_questions, biggest_clusters_df, by = "lesson_cluster_id")
# rename the column and drop the other
tbl_lex_quest_merged$lesson_cluster_id <- tbl_lex_quest_merged$name
tbl_lex_quest_merged$name <- NULL
# remove the biggest clusters from the generalised aggregated lexical table
tableau_lexical_questions <- tableau_lexical_questions[!tableau_lexical_questions$lesson_cluster_id %in% biggest_clusters, ]
# add the renamed clusters
tableau_lexical_questions <- rbind(tableau_lexical_questions, tbl_lex_quest_merged)
# transform first column into rownames
rownames(tableau_lexical_questions) <- tableau_lexical_questions[ , 1]
tableau_lexical_questions[ , 1] <- NULL
} else {
# extract rows corresponding to the biggest clusters
tbl_lex_quest_extract <- tableau_lexical_questions %>% filter(rownames(tableau_lexical_questions) %in% biggest_clusters)
# rename clusters with cluster_
rownames(tbl_lex_quest_extract) <- paste0("cluster_", rownames(tbl_lex_quest_extract))
# remove the biggest clusters from the generalised aggregated lexical table
tableau_lexical_questions <- tableau_lexical_questions[!rownames(tableau_lexical_questions) %in% biggest_clusters, ]
# add the renamed clusters
tableau_lexical_questions <- rbind(tableau_lexical_questions, tbl_lex_quest_extract)
}
# Remove columns with values under… (5?)
# tableau_lexical_questions_extract <- tableau_lexical_questions %>%
# select_if(~ !any(.x < 5))
# Limit lexicon to a fixed number of words… (70?)
# tableau_lexical_questions_extract <- tableau_lexical_questions[,1:70]
# transpose the aggregated lexical table
tableau_lexical_questions <- as.data.frame(t(tableau_lexical_questions))
# Correspondence Analysis (CA) ----
ca_lesson_disc <- CA(tableau_lexical_questions, graph = FALSE)
# explor(ca_lesson_disc)
# compute-plot-eigenvalues ----
# Eigenvalues
ca_alt_screeplot <- fviz_screeplot(ca_lesson_disc, addlabels = TRUE, ncp = 20, title = "", ylim = c(0, 50), theme = theme_minimal())
# compute-plot-ca ----
set.seed(42)
plot_ca_lesson_disc <- fviz_ca_biplot(
ca_lesson_disc,
axes = c(1, 2),
geom = c("point", "text"),
select.row = list(cos2 = 50),
# select.col = list(cos2 = 50),
labelsize = 6,
label = "all",
col.col = "#CC79A7",
col.row = "#0072B2",
shape.row = 0,
shape.col = 4,
pointsize = 3,
repel = TRUE,
title = ""
) + theme(text = element_text(size = 7.5),
axis.title = element_text(size = 18),
axis.text = element_text(size = 18),
plot.margin = margin(0, 10, 10, 10, unit = "pt"))
# ggsave(filename="mca.pdf", path = "output", scale = 1.4)
# contrib-cos2-coord-create-tables ----
## coord ----
# create a dataframe with coordinates for all dimensions of CA
# variables
ca_lesson_disc_var_coord <- as.data.frame(ca_lesson_disc[["col"]]$coord)
# lexicon
ca_lesson_disc_lexicon_coord <- as.data.frame(ca_lesson_disc[["row"]]$coord)
# rbind variables and lexicon
ca_lesson_disc_coord <- rbind(ca_lesson_disc_var_coord, ca_lesson_disc_lexicon_coord)
## contrib ----
# create a dataframe with contrib for all dimensions of CA
# variables
ca_lesson_disc_var_contrib <- as.data.frame(ca_lesson_disc[["col"]]$contrib)
# lexicon
ca_lesson_disc_lexicon_contrib <- as.data.frame(ca_lesson_disc[["row"]]$contrib)
# rbind variables and lexicon
ca_lesson_disc_contrib <- rbind(ca_lesson_disc_var_contrib, ca_lesson_disc_lexicon_contrib)
## cos2 ----
# create a dataframe with cos2 for all dimensions of CA
# variables
ca_lesson_disc_var_cos2 <- as.data.frame(ca_lesson_disc[["col"]]$cos2)
# lexicon
ca_lesson_disc_lexicon_cos2 <- as.data.frame(ca_lesson_disc[["row"]]$cos2)
# rbind variables and lexicon
ca_lesson_disc_cos2 <- rbind(ca_lesson_disc_var_cos2, ca_lesson_disc_lexicon_cos2)
# round values of the three tables to 3 digits
round_df <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
numeric_columns <- sapply(x, mode) == 'numeric'
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
ca_lesson_disc_coord <- round_df(ca_lesson_disc_coord, 3)
ca_lesson_disc_contrib <- round_df(ca_lesson_disc_contrib, 3)
ca_lesson_disc_cos2 <- round_df(ca_lesson_disc_cos2, 3)
## Isolate dimension 1 ----
# create a dataframe with coord, contrib and cos2 values for dimension 1
table_ca_lesson_disc_dim1 <- data.frame(ca_lesson_disc_coord[,1], ca_lesson_disc_contrib[,1], ca_lesson_disc_cos2[,1])
# rename rows and columns
rownames(table_ca_lesson_disc_dim1) <- rownames(ca_lesson_disc_coord)
colnames(table_ca_lesson_disc_dim1) <- c('coord', 'contrib', 'cos2')
# order columns in decreasing order according to cos2
table_ca_lesson_disc_dim1 <- table_ca_lesson_disc_dim1 %>% arrange(desc(table_ca_lesson_disc_dim1$cos2))
# keep 30 first rows
table_ca_lesson_disc_dim1_extract <- table_ca_lesson_disc_dim1[1:30, ]
## Isolate dimension 2 ----
# create a dataframe with coord, contrib and cos2 values for dimension 2
table_ca_lesson_disc_dim2 <- data.frame(ca_lesson_disc_coord[,2], ca_lesson_disc_contrib[,2], ca_lesson_disc_cos2[,2])
# rename rows and columns
rownames(table_ca_lesson_disc_dim2) <- rownames(ca_lesson_disc_coord)
colnames(table_ca_lesson_disc_dim2) <- c('coord', 'contrib', 'cos2')
# order columns in decreasing order according to cos2
table_ca_lesson_disc_dim2 <- table_ca_lesson_disc_dim2 %>% arrange(desc(table_ca_lesson_disc_dim2$cos2))
# keep 30 first rows
table_ca_lesson_disc_dim2_extract <- table_ca_lesson_disc_dim2[1:30, ]