-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsentence_length.R
22 lines (17 loc) · 1.01 KB
/
sentence_length.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Load the sentence-length features and the sense keys for the training and
# test splits, then join each feature table with its key table.
#
# The key files are read as space-separated with no header row, so their
# column names are supplied here.
# NOTE(review): "occurrance" is kept exactly as spelled; presumably it matches
# a column header in the sentenceLength.*.txt files and therefore takes part
# in the merge -- confirm before renaming.
train.features <- read.csv("sentenceLength.train.txt", sep = "|")
train.senses <- read.csv(
  "EnglishLS.train/EnglishLS.train.key",
  sep = " ",
  header = FALSE,
  col.names = c("word", "occurrance", "sense1", "sense2")
)
# No `by` is given, so merge() joins on every column name the two tables share.
train <- merge(train.features, train.senses)

test.features <- read.csv("sentenceLength.test.txt", sep = "|")
test.senses <- read.csv(
  "EnglishLS.test/EnglishLS.test.key",
  sep = " ",
  header = FALSE,
  col.names = c("word", "occurrance", "sense1", "sense2")
)
test <- merge(test.features, test.senses)
# Logit doesn't say much. Outliers are influential.
# fit=glm(sense1~word,data=train,family='binomial')
# predict(fit,test)
#
# Binomial GLM of sense1 on word, fitted only to the 'activate.v' rows of
# train.
# NOTE(review): word is constant within this subset, so it cannot separate
# the senses here; presumably a length feature was the intended predictor --
# confirm before relying on this fit.
fit <- glm(
  sense1 ~ word,
  data = subset(train, word == "activate.v"),
  family = "binomial"
)
# hist(predict(fit,test))
# Clustering within a word
#
# Fuzzy-cluster the 'activate.v' training rows on one length feature at a time
# and plot the resulting cluster assignment against the annotated sense.

# fanny() is provided by the cluster package.
library(cluster)

activate <- subset(train, word == "activate.v")

# Two clusters on wordCount; the same assignment feeds both plots, so it is
# computed once.
word.clusters <- fanny(activate$wordCount, 2)$clustering
sunflowerplot(word.clusters, activate$sense1)
mosaicplot(~ word.clusters + activate$sense1)

# Two clusters on characterCount for the sunflower plot.
char.clusters <- fanny(activate$characterCount, 2)$clustering
sunflowerplot(char.clusters, activate$sense1)

# NOTE(review): this mosaic plot uses three clusters while every other call
# uses two; kept as in the original -- confirm that the difference is intended.
char.clusters3 <- fanny(activate$characterCount, 3)$clustering
mosaicplot(~ char.clusters3 + activate$sense1)