---
title: "DuoLingo"
output: pdf_document
date: '2022-07-18'
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(tidyr)
library(stringr)
library(lme4)
library(dplyr)
library(ggplot2)
```
```{r load-data}
source("config.R")
data <- fread(path_duo)
# Cast identifier and language columns to factors.
data$lexeme_id = as.factor(data$lexeme_id)
data$user_id = as.factor(data$user_id)
data$ui_language = as.factor(data$ui_language)
data$learning_language = as.factor(data$learning_language)
dim(data)
colSums(is.na(data))  # check for missing values
```
```{r}
length(unique(data$lexeme_id))
length(unique(data$user_id))
```
```{r}
# Optional: clip p_recall into (0, 1) so that p_log_odds below stays finite.
#data$p_recall = sapply(data$p_recall, function(v){return(min(max(v,0.0001),0.9999))})
# Word root: the substring of lexeme_string between the first '/' and the first '<'.
data$word_root <- as.factor(substr(data$lexeme_string, str_locate(data$lexeme_string, '/')[,1]+1, str_locate(data$lexeme_string, '<')[,1]-1))
data$p_log_odds = log(data$p_recall/(1-data$p_recall))
# Restrict to items seen fewer than 50 times; counts above that are sparse.
data = data[data$history_seen < 50,]
```
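
The commented-out clipping line above matters: rows with `p_recall` of exactly 0 or 1 make `p_log_odds` infinite. A minimal vectorized sketch of the same clipping (the `eps`, `p_clipped`, and `log_odds` names and the `pmin`/`pmax` formulation are illustrative, not part of the original; kept local so the data above is unchanged):

```{r}
# Clip recall probabilities into (0, 1) before the logit transform so the
# log-odds are finite; eps mirrors the 0.0001/0.9999 bounds in the commented
# sapply version above.
eps <- 1e-4
p_clipped <- pmin(pmax(data$p_recall, eps), 1 - eps)
log_odds <- log(p_clipped / (1 - p_clipped))
range(log_odds)  # finite by construction
```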
```{r}
summary(data)
# Course directions: English UI learning de/es/fr/it/pt, and es/it/pt UI learning en.
table(data$ui_language, data$learning_language)
nrow(data[data$p_recall==1,])/nrow(data) # ~83% of rows are fully correct
```
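
With ~83% of rows at `p_recall` of exactly 1, the response is strongly ceiling-bound. A quick sketch to make the shape of the distribution visible:

```{r}
# Distribution of per-session recall; the spike at 1 is the ~83% noted above.
hist(data$p_recall, breaks = 20, xlab = "p_recall", main = "Distribution of p_recall")
```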
```{r}
duo_to_en = data[data$learning_language == 'en',]
length(unique(duo_to_en$word_root))
length(unique(duo_to_en$user_id))
duo_from_en = data[data$learning_language != 'en',]
length(unique(duo_from_en$word_root))
length(unique(duo_from_en$user_id))
# Join the per-language translation tables onto the English lexemes.
trans_es = fread("es_to_en.csv")
trans_es$en_word_root=tolower(trans_es$en_word_root)
duo_with_es = merge(duo_from_en[duo_from_en$learning_language=="es",], as.data.frame(trans_es[,c("es_word_root","en_word_root")]), by.x="word_root", by.y="es_word_root")
trans_fr = fread("fr-to-en.csv")
trans_fr$en_word_root=tolower(trans_fr$en_word_root)
duo_with_fr = merge(duo_from_en[duo_from_en$learning_language=="fr",], as.data.frame(trans_fr[,c("fr_word_root","en_word_root")]), by.x="word_root", by.y="fr_word_root")
trans_de = fread("de-to-en.csv")
# Manual corrections for a few bad rows in the German translation file.
trans_de$en_word_root[333] = "similar"
trans_de$en_word_root[817] = "change"
trans_de$en_word_root[1324] = "check"
trans_de$en_word_root[1390] = "skip"
trans_de$en_word_root=tolower(trans_de$en_word_root)
duo_with_ger = merge(duo_from_en[duo_from_en$learning_language=="de",], as.data.frame(trans_de[,c("de_word_root","en_word_root")]), by.x="word_root", by.y="de_word_root")
trans_it = fread("it_to_en.csv")
trans_it$en_word_root=tolower(trans_it$en_word_root)
duo_with_it = merge(duo_from_en[duo_from_en$learning_language=="it",], as.data.frame(trans_it[,c("it_word_root","en_word_root")]), by.x="word_root", by.y="it_word_root")
# The Portuguese file is ISO-8859-1; convert to UTF-8 while reading.
trans_pt = fread(cmd = "iconv -f ISO-8859-1 -t UTF-8 pt_to_en.csv")
trans_pt$en_word_root=tolower(trans_pt$en_word_root)
duo_with_pt = merge(duo_from_en[duo_from_en$learning_language=="pt",], as.data.frame(trans_pt[,c("pt_word_root","en_word_root")]), by.x="word_root", by.y="pt_word_root")
duo_trans = rbind(duo_with_es, duo_with_fr, duo_with_ger, duo_with_it, duo_with_pt)
duo_trans = duo_trans[duo_trans$word_root!="",]
dim(duo_trans)
# TODO: carefully select words, similar ones, dissimilar ones, false friends
# choose 5 random users, p_recall ~ opp * word
# hypothesis: similar words
# suggestion: LearnSphere
```
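
The five merge blocks above repeat one pattern; a small helper would cut the duplication. A sketch under the same assumptions as above (each translation CSV has a `<lang>_word_root` column plus `en_word_root`; `join_translations` and its arguments are hypothetical names):

```{r}
# Hypothetical helper generalizing the per-language merge pattern above.
join_translations <- function(duo, lang, path) {
  trans <- fread(path)
  trans$en_word_root <- tolower(trans$en_word_root)
  key <- paste0(lang, "_word_root")
  merge(duo[duo$learning_language == lang, ],
        as.data.frame(trans[, c(key, "en_word_root"), with = FALSE]),
        by.x = "word_root", by.y = key)
}
# Equivalent to the Spanish block above:
# duo_with_es = join_translations(duo_from_en, "es", "es_to_en.csv")
```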
```{r}
# Sample 5 French learners and, below, 20 French word roots; compare mean recall.
users = sample(duo_trans[duo_trans$learning_language=="fr",]$user_id, 5)
duo_users = duo_trans[duo_trans$user_id %in% users,]
p_recalls = aggregate(p_recall~word_root*user_id, duo_users, FUN=mean)
words = sample(duo_trans[duo_trans$learning_language=="fr",]$word_root, 20)
p_recalls[p_recalls$word_root %in% words,] %>% ggplot() +
geom_line(aes(x = word_root, y = p_recall, group = user_id, color=user_id))
```
```{r}
summary(duo_to_en)
```
```{r}
# For each English root, count how many learning languages it appears in.
num_langs = duo_trans %>% group_by(en_word_root) %>% summarise(n_langs = n_distinct(learning_language))
num_langs[order(-num_langs$n_langs),]
# Word frequencies per learning language, written out as one CSV per language.
word_counts = aggregate(duo_from_en$lexeme_id, by=list(word_root = duo_from_en$word_root, L1 = duo_from_en$learning_language), FUN=length)
for (l1 in unique(word_counts$L1)) {
  l1_counts = subset(word_counts, L1==l1)
  write.csv(l1_counts[order(-l1_counts$x),c("word_root")], paste0(l1,'.csv'))
}
```
```{r}
# Linear probability model of recall on prior exposures, to-English direction.
model = glm(p_recall~history_seen, data=duo_to_en)
#plot(seq(0,50), model$coefficients[1]+seq(0,50)*-7.160e-03, type="l")
summary(model)
# Same specification for the from-English direction.
model2 = glm(p_recall~history_seen, data=duo_from_en)
summary(model2)
```
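
`lme4` is loaded in the setup chunk but never used; given the crossed user × word structure, a mixed model on the logit scale is a natural extension of the `glm` above. A sketch, not a fitted result: the subsample size, clipping bounds, and random-effects structure are all assumptions, since the full data would be slow to fit.

```{r}
# Crossed random-effects sketch: a fixed practice effect plus random
# intercepts for users and word roots, fit on a subsample for speed.
set.seed(1)
sub = duo_to_en[sample(nrow(duo_to_en), min(50000, nrow(duo_to_en))),]
sub$user_id = droplevels(sub$user_id)
sub$word_root = droplevels(sub$word_root)
# Clip to keep the logit finite (see the clipping sketch earlier).
p_clip = pmin(pmax(sub$p_recall, 1e-4), 1 - 1e-4)
sub$logit_p = log(p_clip / (1 - p_clip))
mixed = lmer(logit_p ~ history_seen + (1 | user_id) + (1 | word_root), data = sub)
summary(mixed)
```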
```{r}
# Empirical learning curve (to-English): mean recall by prior exposures, with
# the observation count (rescaled) overlaid in red.
emp_learning = aggregate(duo_to_en$p_recall, by=list(duo_to_en$history_seen), FUN=mean)
num_obs = aggregate(duo_to_en$p_recall, by=list(duo_to_en$history_seen), FUN=length)
plot(emp_learning$Group.1, emp_learning$x, ylim=c(0,1))
lines(num_obs$Group.1, num_obs$x/max(num_obs$x), col="red")
print(merge(emp_learning, num_obs, by="Group.1"))
```
```{r}
# Same empirical curve for the from-English direction.
emp_learning = aggregate(duo_from_en$p_recall, by=list(duo_from_en$history_seen), FUN=mean)
num_obs = aggregate(duo_from_en$p_recall, by=list(duo_from_en$history_seen), FUN=length)
plot(emp_learning$Group.1, emp_learning$x, ylim=c(0,1))
lines(num_obs$Group.1, num_obs$x/max(num_obs$x), col="red")
print(merge(emp_learning, num_obs, by="Group.1"))
```
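
Both empirical curves invite a parametric fit. A sketch of an exponential learning curve $p(t) = 1 - (1 - p_0)e^{-rt}$ fit to the to-English means; the functional form and start values are assumptions, and the fit may not converge, hence the `try`:

```{r}
# Exponential learning curve fit to the empirical means for to-English learners.
emp_en = aggregate(duo_to_en$p_recall, by = list(t = duo_to_en$history_seen), FUN = mean)
fit = try(nls(x ~ 1 - (1 - p0) * exp(-r * t), data = emp_en,
              start = list(p0 = 0.8, r = 0.1)))
if (!inherits(fit, "try-error")) {
  plot(emp_en$t, emp_en$x, ylim = c(0, 1), xlab = "history_seen", ylab = "mean p_recall")
  lines(emp_en$t, predict(fit), col = "blue")
}
```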
```{r}
# Pick a few user ids to inspect individual histories.
sample(duo_trans$user_id, 10)
```
```{r}
duo_trans[duo_trans$user_id=="u:iLoW","lexeme_string"]
```
```{r}
# Learning curve for a single word root ("eat") among to-English learners.
kc_filter = duo_to_en$word_root=="eat"
emp_learning = aggregate(duo_to_en$p_recall[kc_filter], by=list(duo_to_en$history_seen[kc_filter]), FUN=mean)
plot(emp_learning$Group.1[1:150], emp_learning$x[1:150])
```
```{r}
# For each English root attested in all five learning languages, plot
# per-language learning curves over the first 25 exposures.
for (word in num_langs[num_langs$n_langs==5,]$en_word_root) {
  print(word)
  kc_filter = duo_trans$en_word_root==word
  emp_learning = aggregate(p_recall ~ history_seen * learning_language, data = duo_trans[kc_filter], mean)
  users = aggregate(p_recall ~ history_seen * learning_language, data = duo_trans[kc_filter], length)
  #emp_learning = merge(emp_learning, users, by="history_seen")
  print(emp_learning[emp_learning$history_seen<25,] %>% ggplot() +
    geom_line(aes(x = history_seen, y = p_recall, group = learning_language, color=learning_language)) +
    ggtitle(word) +
    ylim(0.5,1))
}
```
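
An alternative to the loop above: draw all of the shared words in one faceted figure. A sketch with the same filters (`history_seen < 25`, words present in all five languages); the facet count equals the number of shared roots, so this works best when that list is short:

```{r}
# One faceted plot instead of one plot per word.
shared = num_langs[num_langs$n_langs == 5,]$en_word_root
duo_trans %>%
  filter(en_word_root %in% shared, history_seen < 25) %>%
  group_by(en_word_root, learning_language, history_seen) %>%
  summarise(p_recall = mean(p_recall), .groups = "drop") %>%
  ggplot(aes(history_seen, p_recall, color = learning_language)) +
  geom_line() +
  facet_wrap(~ en_word_root) +
  ylim(0.5, 1)
```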
```{r}
# Same idea for the to-English direction: split each word's curve by the
# learner's UI language. The original iterator was left blank; looping over a
# small random sample of word roots (an assumption) keeps the output manageable.
for (word in sample(unique(duo_to_en$word_root), 5)) {
  kc_filter = duo_to_en$word_root==word
  emp_learning = aggregate(p_recall ~ history_seen * ui_language, data = duo_to_en[kc_filter], mean)
  users = aggregate(p_recall ~ history_seen * ui_language, data = duo_to_en[kc_filter], length)
  #emp_learning = merge(emp_learning, users, by=c("history_seen","ui_language"))
  print(emp_learning %>% ggplot() +
    geom_line(aes(x = history_seen, y = p_recall, group = ui_language, color=ui_language)) +
    ggtitle(word))
}
#duo_trans$lexeme_string
```
```{r}
# Recall vs. time (delta) since the word was last practiced.
plot(duo_trans$delta, duo_trans$p_recall, type="p")
```
```{r}
#unique(duo_trans$lexeme_string[str_detect(duo_trans$lexeme_string, "<det>")])
#model<-glm(p_recall~history_seen,data=data)
```