---
title: "DuoLingo"
output: pdf_document
date: '2022-07-18'
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(tidyr)
library(stringr)
library(lme4)
library(dplyr)
library(ggplot2)
```
```{r load-data}
source("config.R")
data <- fread(path_duo)
# Cast identifier and language columns to factors.
data$lexeme_id = as.factor(data$lexeme_id)
data$user_id = as.factor(data$user_id)
data$ui_language = as.factor(data$ui_language)
data$learning_language = as.factor(data$learning_language)
dim(data)
colSums(is.na(data))  # check for missing values
```
```{r}
length(unique(data$lexeme_id))
length(unique(data$user_id))
```
```{r}
# Optional: clip p_recall into (0, 1) so that p_log_odds below stays finite.
#data$p_recall = sapply(data$p_recall, function(v){return(min(max(v,0.0001),0.9999))})
# Word root: the substring of lexeme_string between the first '/' and the first '<'.
data$word_root <- as.factor(substr(data$lexeme_string, str_locate(data$lexeme_string, '/')[,1]+1, str_locate(data$lexeme_string, '<')[,1]-1))
data$p_log_odds = log(data$p_recall/(1-data$p_recall))
# Restrict to items seen fewer than 50 times; counts above that are sparse.
data = data[data$history_seen < 50,]
```
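
The commented-out clipping line above matters: rows with `p_recall` of exactly 0 or 1 make `p_log_odds` infinite. A minimal vectorized sketch of the same clipping (the `eps`, `p_clipped`, and `log_odds` names and the `pmin`/`pmax` formulation are illustrative, not part of the original; kept local so the data above is unchanged):

```{r}
# Clip recall probabilities into (0, 1) before the logit transform so the
# log-odds are finite; eps mirrors the 0.0001/0.9999 bounds in the commented
# sapply version above.
eps <- 1e-4
p_clipped <- pmin(pmax(data$p_recall, eps), 1 - eps)
log_odds <- log(p_clipped / (1 - p_clipped))
range(log_odds)  # finite by construction
```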
```{r}
summary(data)
# Course directions: English UI learning de/es/fr/it/pt, and es/it/pt UI learning en.
table(data$ui_language, data$learning_language)
nrow(data[data$p_recall==1,])/nrow(data) # ~83% of rows are fully correct
```
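
With ~83% of rows at `p_recall` of exactly 1, the response is strongly ceiling-bound. A quick sketch to make the shape of the distribution visible:

```{r}
# Distribution of per-session recall; the spike at 1 is the ~83% noted above.
hist(data$p_recall, breaks = 20, xlab = "p_recall", main = "Distribution of p_recall")
```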
```{r}
duo_to_en = data[data$learning_language == 'en',]
length(unique(duo_to_en$word_root))
length(unique(duo_to_en$user_id))
duo_from_en = data[data$learning_language != 'en',]
length(unique(duo_from_en$word_root))
length(unique(duo_from_en$user_id))
# Join the per-language translation tables onto the English lexemes.
trans_es = fread("es_to_en.csv")
trans_es$en_word_root=tolower(trans_es$en_word_root)
duo_with_es = merge(duo_from_en[duo_from_en$learning_language=="es",], as.data.frame(trans_es[,c("es_word_root","en_word_root")]), by.x="word_root", by.y="es_word_root")
trans_fr = fread("fr-to-en.csv")
trans_fr$en_word_root=tolower(trans_fr$en_word_root)
duo_with_fr = merge(duo_from_en[duo_from_en$learning_language=="fr",], as.data.frame(trans_fr[,c("fr_word_root","en_word_root")]), by.x="word_root", by.y="fr_word_root")
trans_de = fread("de-to-en.csv")
# Manual corrections for a few bad rows in the German translation file.
trans_de$en_word_root[333] = "similar"
trans_de$en_word_root[817] = "change"
trans_de$en_word_root[1324] = "check"
trans_de$en_word_root[1390] = "skip"
trans_de$en_word_root=tolower(trans_de$en_word_root)
duo_with_ger = merge(duo_from_en[duo_from_en$learning_language=="de",], as.data.frame(trans_de[,c("de_word_root","en_word_root")]), by.x="word_root", by.y="de_word_root")
trans_it = fread("it_to_en.csv")
trans_it$en_word_root=tolower(trans_it$en_word_root)
duo_with_it = merge(duo_from_en[duo_from_en$learning_language=="it",], as.data.frame(trans_it[,c("it_word_root","en_word_root")]), by.x="word_root", by.y="it_word_root")
# The Portuguese file is ISO-8859-1; convert to UTF-8 while reading.
trans_pt = fread(cmd = "iconv -f ISO-8859-1 -t UTF-8 pt_to_en.csv")
trans_pt$en_word_root=tolower(trans_pt$en_word_root)
duo_with_pt = merge(duo_from_en[duo_from_en$learning_language=="pt",], as.data.frame(trans_pt[,c("pt_word_root","en_word_root")]), by.x="word_root", by.y="pt_word_root")
duo_trans = rbind(duo_with_es, duo_with_fr, duo_with_ger, duo_with_it, duo_with_pt)
duo_trans = duo_trans[duo_trans$word_root!="",]
dim(duo_trans)
# TODO: carefully select words, similar ones, dissimilar ones, false friends
# choose 5 random users, p_recall ~ opp * word
# hypothesis: similar words
# suggestion: LearnSphere
```
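
The five merge blocks above repeat one pattern; a small helper would cut the duplication. A sketch under the same assumptions as above (each translation CSV has a `<lang>_word_root` column plus `en_word_root`; `join_translations` and its arguments are hypothetical names):

```{r}
# Hypothetical helper generalizing the per-language merge pattern above.
join_translations <- function(duo, lang, path) {
  trans <- fread(path)
  trans$en_word_root <- tolower(trans$en_word_root)
  key <- paste0(lang, "_word_root")
  merge(duo[duo$learning_language == lang, ],
        as.data.frame(trans[, c(key, "en_word_root"), with = FALSE]),
        by.x = "word_root", by.y = key)
}
# Equivalent to the Spanish block above:
# duo_with_es = join_translations(duo_from_en, "es", "es_to_en.csv")
```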
```{r}
# Sample 5 French learners and, below, 20 French word roots; compare mean recall.
users = sample(duo_trans[duo_trans$learning_language=="fr",]$user_id, 5)
duo_users = duo_trans[duo_trans$user_id %in% users,]
p_recalls = aggregate(p_recall~word_root*user_id, duo_users, FUN=mean)
words = sample(duo_trans[duo_trans$learning_language=="fr",]$word_root, 20)
p_recalls[p_recalls$word_root %in% words,] %>% ggplot() +
geom_line(aes(x = word_root, y = p_recall, group = user_id, color=user_id))
```
```{r}
summary(duo_to_en)
```
```{r}
# For each English root, count how many learning languages it appears in.
num_langs = duo_trans %>% group_by(en_word_root) %>% summarise(n_langs = n_distinct(learning_language))
num_langs[order(-num_langs$n_langs),]
# Word frequencies per learning language, written out as one CSV per language.
word_counts = aggregate(duo_from_en$lexeme_id, by=list(word_root = duo_from_en$word_root, L1 = duo_from_en$learning_language), FUN=length)
for (l1 in unique(word_counts$L1)) {
  l1_counts = subset(word_counts, L1==l1)
  write.csv(l1_counts[order(-l1_counts$x),c("word_root")], paste0(l1,'.csv'))
}
```
```{r}
# Linear probability model of recall on prior exposures, to-English direction.
model = glm(p_recall~history_seen, data=duo_to_en)
#plot(seq(0,50), model$coefficients[1]+seq(0,50)*-7.160e-03, type="l")
summary(model)
# Same specification for the from-English direction.
model2 = glm(p_recall~history_seen, data=duo_from_en)
summary(model2)
```
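
`lme4` is loaded in the setup chunk but never used; given the crossed user × word structure, a mixed model on the logit scale is a natural extension of the `glm` above. A sketch, not a fitted result: the subsample size, clipping bounds, and random-effects structure are all assumptions, since the full data would be slow to fit.

```{r}
# Crossed random-effects sketch: a fixed practice effect plus random
# intercepts for users and word roots, fit on a subsample for speed.
set.seed(1)
sub = duo_to_en[sample(nrow(duo_to_en), min(50000, nrow(duo_to_en))),]
sub$user_id = droplevels(sub$user_id)
sub$word_root = droplevels(sub$word_root)
# Clip to keep the logit finite (see the clipping sketch earlier).
p_clip = pmin(pmax(sub$p_recall, 1e-4), 1 - 1e-4)
sub$logit_p = log(p_clip / (1 - p_clip))
mixed = lmer(logit_p ~ history_seen + (1 | user_id) + (1 | word_root), data = sub)
summary(mixed)
```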
```{r}
# Empirical learning curve (to-English): mean recall by prior exposures, with
# the observation count (rescaled) overlaid in red.
emp_learning = aggregate(duo_to_en$p_recall, by=list(duo_to_en$history_seen), FUN=mean)
num_obs = aggregate(duo_to_en$p_recall, by=list(duo_to_en$history_seen), FUN=length)
plot(emp_learning$Group.1, emp_learning$x, ylim=c(0,1))
lines(num_obs$Group.1, num_obs$x/max(num_obs$x), col="red")
print(merge(emp_learning, num_obs, by="Group.1"))
```
```{r}
# Same empirical curve for the from-English direction.
emp_learning = aggregate(duo_from_en$p_recall, by=list(duo_from_en$history_seen), FUN=mean)
num_obs = aggregate(duo_from_en$p_recall, by=list(duo_from_en$history_seen), FUN=length)
plot(emp_learning$Group.1, emp_learning$x, ylim=c(0,1))
lines(num_obs$Group.1, num_obs$x/max(num_obs$x), col="red")
print(merge(emp_learning, num_obs, by="Group.1"))
```
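
Both empirical curves invite a parametric fit. A sketch of an exponential learning curve $p(t) = 1 - (1 - p_0)e^{-rt}$ fit to the to-English means; the functional form and start values are assumptions, and the fit may not converge, hence the `try`:

```{r}
# Exponential learning curve fit to the empirical means for to-English learners.
emp_en = aggregate(duo_to_en$p_recall, by = list(t = duo_to_en$history_seen), FUN = mean)
fit = try(nls(x ~ 1 - (1 - p0) * exp(-r * t), data = emp_en,
              start = list(p0 = 0.8, r = 0.1)))
if (!inherits(fit, "try-error")) {
  plot(emp_en$t, emp_en$x, ylim = c(0, 1), xlab = "history_seen", ylab = "mean p_recall")
  lines(emp_en$t, predict(fit), col = "blue")
}
```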
```{r}
# Pick a few user ids to inspect individual histories.
sample(duo_trans$user_id, 10)
```
```{r}
duo_trans[duo_trans$user_id=="u:iLoW","lexeme_string"]
```
```{r}
# Learning curve for a single word root ("eat") among to-English learners.
kc_filter = duo_to_en$word_root=="eat"
emp_learning = aggregate(duo_to_en$p_recall[kc_filter], by=list(duo_to_en$history_seen[kc_filter]), FUN=mean)
plot(emp_learning$Group.1[1:150], emp_learning$x[1:150])
```
```{r}
# For each English root attested in all five learning languages, plot
# per-language learning curves over the first 25 exposures.
for (word in num_langs[num_langs$n_langs==5,]$en_word_root) {
  print(word)
  kc_filter = duo_trans$en_word_root==word
  emp_learning = aggregate(p_recall ~ history_seen * learning_language, data = duo_trans[kc_filter], mean)
  users = aggregate(p_recall ~ history_seen * learning_language, data = duo_trans[kc_filter], length)
  #emp_learning = merge(emp_learning, users, by="history_seen")
  print(emp_learning[emp_learning$history_seen<25,] %>% ggplot() +
    geom_line(aes(x = history_seen, y = p_recall, group = learning_language, color=learning_language)) +
    ggtitle(word) +
    ylim(0.5,1))
}
```
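
An alternative to the loop above: draw all of the shared words in one faceted figure. A sketch with the same filters (`history_seen < 25`, words present in all five languages); the facet count equals the number of shared roots, so this works best when that list is short:

```{r}
# One faceted plot instead of one plot per word.
shared = num_langs[num_langs$n_langs == 5,]$en_word_root
duo_trans %>%
  filter(en_word_root %in% shared, history_seen < 25) %>%
  group_by(en_word_root, learning_language, history_seen) %>%
  summarise(p_recall = mean(p_recall), .groups = "drop") %>%
  ggplot(aes(history_seen, p_recall, color = learning_language)) +
  geom_line() +
  facet_wrap(~ en_word_root) +
  ylim(0.5, 1)
```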
```{r}
# Same idea for the to-English direction: split each word's curve by the
# learner's UI language. The original iterator was left blank; looping over a
# small random sample of word roots (an assumption) keeps the output manageable.
for (word in sample(unique(duo_to_en$word_root), 5)) {
  kc_filter = duo_to_en$word_root==word
  emp_learning = aggregate(p_recall ~ history_seen * ui_language, data = duo_to_en[kc_filter], mean)
  users = aggregate(p_recall ~ history_seen * ui_language, data = duo_to_en[kc_filter], length)
  #emp_learning = merge(emp_learning, users, by=c("history_seen","ui_language"))
  print(emp_learning %>% ggplot() +
    geom_line(aes(x = history_seen, y = p_recall, group = ui_language, color=ui_language)) +
    ggtitle(word))
}
#duo_trans$lexeme_string
```
```{r}
# Recall vs. time (delta) since the word was last practiced.
plot(duo_trans$delta, duo_trans$p_recall, type="p")
```
```{r}
#unique(duo_trans$lexeme_string[str_detect(duo_trans$lexeme_string, "<det>")])
#model<-glm(p_recall~history_seen,data=data)
```