-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdescribe_french_language_learning.Rmd
135 lines (104 loc) · 4.54 KB
/
describe_french_language_learning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title: "Describe French Language Learning"
output: pdf_document
date: '2022-07-17'
---
```{r setup, include=FALSE}
# Make echo = TRUE the default chunk option for the rest of the document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r echo=FALSE,include=FALSE}
# Packages this notebook relies on. A single vector drives both the install
# check and the attach loop so the two can no longer drift apart: the former
# install list contained the misspelled "tiidyr" and omitted "data.table"
# (which provides fread()), while only the attach list had them right.
# "tidyverse" used to be installed but was never attached, so it is not listed.
list.of.packages <- c(
  "rticles", "psych", "ggplot2", "dplyr", "DescTools", "lsmeans", "stats",
  "stats4", "xlsx", "lmtest", "MASS", "gmodels", "BioStatR", "olsrr", "nlme",
  "orcutt", "lme4", "data.table", "tidyr", "fastglm"
)

# Install only the packages that are not already present.
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "http://lib.stat.cmu.edu/R/CRAN/")
}

# Attach every package; library() stops with an error if one is unavailable.
for (pkg in list.of.packages) {
  library(pkg, character.only = TRUE)
}

# The former setwd("./") was removed: it re-selected the current working
# directory, so it had no effect.
```
# Load dataset and check dimensions
```{r}
# config.R is sourced for path_french_language_learning, which is not defined
# anywhere in this file.
source("config.R")
data <- fread(path_french_language_learning)

# Report the dimensions announced by the section title.
dim(data)

# Preview the rows that contain no missing value; `data` itself is left
# unchanged, as before. This replaces subset(data, !is.na(data)), whose
# filter was a logical matrix (one cell per value) rather than one
# TRUE/FALSE per row, and therefore not a valid row selector.
na.omit(data)

summary(data)
## RQ1 number of questions answered in 10 minutes -- high motivation or low motivation
## feature engineering for predictors -- how many attempts in each oppo? -- how long they spent in each oppo?
## history_seen -- number opportunities
## just because the person is motivated may not make them more correct? or
## when they work more they get more correct?
## linguistic analysis -- close to the lexeme -- singular/third person -- is not a knowledge component -- tags as lexemes as predictors in the -- if its data driven
## https://www.jacklynch.net/language.html
## RQ2 anglo-saxonic, romance languages, romanian -- french, greek -- slavic (check wiki) -- Honeiah -- https://www.jacklynch.net/language.html
## if linguistics are correct -- german and english have more overlap than other language pairs
## ui_language to predict their accuracy not necessary to look at cognates -- do they perform better on avg than mandarin
## they could be better before -- or do better
## subgroup or word-level? -- lot variance
## language is related or unrelated -- at word to see the overlap character match (similarity) between the words
## other way just look at the families -- spanish and english vs german and english or spanish -- german
## words that seem they are cognates but not -- most well studied language families in the world (Indo European)
## false friends -- theory of similarity -> predict the opposite (harder to learn but looks easy) -- if you can find those in the data counter point to the similarity argument
## how many words look similar -- even better predictor -- proportions -- they won't be the same from a quantitative standpoint
## find lists online -- people whose L1 is english (embarazada) doesn't mean embarrassing -- it's a false friend between spanish and portuguese (even within the same family)
## spanish to english; chinese to english
## ui_language pt -- portuguese speakers Brazil/Portugal (some pronunciation)
```
```{r}
# https://github.com/jaredhuling/fastglm
#afm.model.reg <- glmer(p_recall ~ (1|user_id) + (history_seen|lexeme_id) - 1,
#data=duolingo_df)#, family=binomial())
```
```{r}
# Load only the first 100,000 learning traces. Letting fread() stop early
# avoids parsing the whole file just to discard most of it, and, unlike
# slicing with [1:100000] afterwards, it cannot pad the table with all-NA
# rows when the file holds fewer rows than requested.
duolingo_df <- fread(
  "./learning_traces.13m.csv",
  nrows = 100000,
  showProgress = TRUE
)
```
```{r}
# Treat the identifiers as categorical grouping variables.
duolingo_df$lexeme_id <- as.factor(duolingo_df$lexeme_id)
duolingo_df$user_id <- as.factor(duolingo_df$user_id)

# Keep only rows where BOTH the response and the lexeme are present. The
# previous `|` kept a row as soon as either value was present, so rows with a
# missing p_recall (or a missing lexeme_id) still reached the model.
duolingo_df <- subset(duolingo_df, !is.na(p_recall) & !is.na(lexeme_id))
summary(duolingo_df)

# Mixed model with a random intercept per user and a random intercept plus
# history_seen slope per lexeme; `- 1` removes the fixed intercept.
# * lmer() is called directly: glmer() without a `family` falls back to the
#   gaussian default, which lme4 treats as a deprecated shortcut to lmer().
# * The grouping factor is lexeme_id, the column prepared above and used in
#   the earlier commented-out draft of this model; `word_root` is never
#   created in this file.
# NOTE(review): a binomial family was left commented out in the original
# call -- confirm whether a linear model of p_recall is what is wanted.
afm.model.reg <- lmer(
  p_recall ~ (1 | user_id) + (history_seen | lexeme_id) - 1,
  data = duolingo_df
)
summary(afm.model.reg)
AIC(afm.model.reg) ## output AIC
BIC(afm.model.reg) ## output BIC
#colSums(is.na(data))
#no.rm=TRUE
#na.omit
```
```{r}
# NOTE(review): library() with no arguments attaches nothing; it only lists
# the packages available in the library paths. Looks like a leftover -- confirm.
library()
```
# Check out the label and features
```{r}
# List the column names of the loaded dataset.
colnames(data)
```
```{r}
# Names of a handful of columns, picked by position, for closer inspection.
interesting_cols <- names(data)[c(9, 11, 15, 18, 21, 25)]
```
## Label
```{r}
# Absolute occurrences of each label value.
print(table(data$Outcome))
# Occurrences of each label value divided by the total number of rows.
print(table(data$Outcome) / length(data$Outcome))
# Eyeball up to 100 randomly drawn labels. The sample size is capped at the
# number of rows because sampling without replacement fails when more values
# are requested than are available.
sample(data$Outcome, min(100, length(data$Outcome)))
```
### What happens for empty outcomes?
```{r}
# Distinct Action values among the rows whose Outcome is the empty string.
unique(data[data$Outcome=="","Action"])
```
### What actions are associated with non-empty outcomes?
```{r}
# Frequency of each Action among the rows whose Outcome is not the empty string.
table(data[data$Outcome!="","Action"])
```
### What is an UpdateComboBox action?
```{r}
# Number of UpdateComboBox actions per problem, showing at most the first 50
# problems. head() is used instead of [1:50] so that a table with fewer than
# 50 entries is not padded with NA entries.
head(table(data[data$Action == "UpdateComboBox", c("Problem.Name")]), 50)
```
### What is an ANSWER action?
```{r}
# Number of ANSWER actions per problem, showing at most the first 50
# problems. head() is used instead of [1:50] so that a table with fewer than
# 50 entries is not padded with NA entries.
head(table(data[data$Action == "ANSWER", c("Problem.Name")]), 50)
```
```{r}
# Rows 950 to 1000 of the ANSWER actions, restricted to the Problem.Name,
# Input and Outcome columns.
data[data$Action=="ANSWER",c("Problem.Name","Input","Outcome")][950:1000,]
```