#install and open libraries
install.packages("devtools")
install.packages("readr")
install.packages("wordcloud")
install.packages("quanteda")
install.packages("corpus")
install.packages("tm")
install.packages("syuzhet")
install.packages("zipfR")
install.packages("ngram")
install.packages("knitr")
install.packages("RColorBrewer")
install.packages("NLP")
install.packages("tokenizers")
install.packages("tidytext")
install.packages("corpustools")
install.packages("readtext")
install.packages("stringi")
install.packages("dplyr")
install.packages("Matrix")
library(readr)
library(wordcloud)
library(corpus)
library(tm)
library(syuzhet)
library(quanteda)
library(stringi)
library(tidytext)
library(corpustools)
library(zipfR)
library(ngram)
library(knitr)
library(tokenizers)
library(stringr)
library(dplyr)
library(Matrix)
library(readtext)
library(RColorBrewer)
#load the text into a data frame
text <- read_lines("text/TwentyThousandLeagues.txt")
textdf <- data.frame(text)
# Create VCorpus of the text and inspect it
textCorpus <- VCorpus(DirSource("text/",ignore.case = TRUE,mode="text"))
textCorpus
inspect(textCorpus)
str(textCorpus)
#extract the document
textCorpus1 <- textCorpus[[1]]
textCorpus1
inspect(textCorpus1)
str(textCorpus1)
#get the document term matrix
dtm <- DocumentTermMatrix(textCorpus)
dtm
inspect(dtm)
str(dtm)
#get the term frequency
tf <- termFreq(textCorpus1)
tf
str(tf)
# b - Find the ten longest words and the ten longest sentences
# Convert the corpus document into a data frame
data <- data.frame(text = sapply(textCorpus1, as.character), stringsAsFactors = FALSE)
# Create a data frame to store the ten longest words
LongestWordsList <- data.frame(Row = c(1,2,3,4,5,6,7,8,9,10))
words <- tokenize_words(data[[1]])
longestWordLength <- 0
longestWord <- ''
# Separate the text into words, then scan the whole document for the longest
# word not yet recorded; repeat ten times to collect the ten longest words
for (a in 1:10) {
  for (i in 1:length(words)) {
    if (length(tokenize_words(words[[i]])) != 0) {
      words2 <- tokenize_words(words[[i]])
      for (x in 1:length(words2)) {
        if ((str_length(words2[[x]]) > longestWordLength) && (is.element(words2[[x]], unlist(LongestWordsList)) == FALSE)) {
          longestWordLength <- str_length(words2[[x]])
          longestWord <- words2[[x]]
          LongestWordsList[a, "Row"] <- longestWord
        }
      }
    }
  }
  longestWordLength <- 0
  longestWord <- ''
}
# Print the ten longest words
LongestWordsList
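# A compact alternative sketch (not part of the original pipeline): flatten
# the token list, deduplicate, and rank by character length with base R.
# Assumes the `words` list built above.
allWords <- unique(unlist(words))
head(allWords[order(nchar(allWords), decreasing = TRUE)], 10)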
# Find the ten longest sentences (by word count)
SentencesList <- data.frame(Row = c(1,2,3,4,5,6,7,8,9,10))
bdy2 <- paste(data[[1]], collapse = " ")
Sentence <- tokenize_sentences(bdy2)
Sentence2 <- tokenize_sentences(Sentence[[1]])
longestSentLength <- 0
longestSent <- ''
# Separate the text into sentences, then scan the whole document for the
# longest sentence not yet recorded; repeat ten times for the ten longest
for (a in 1:10) {
  for (i in 1:length(Sentence[[1]])) {
    var <- tokenize_words(Sentence2[[i]])
    if (length(var[[1]]) > longestSentLength && (is.element(Sentence2[[i]], unlist(SentencesList)) == FALSE)) {
      longestSentLength <- length(var[[1]])
      longestSent <- Sentence2[[i]]
      SentencesList[a, "Row"] <- longestSent
    }
  }
  longestSentLength <- 0
  longestSent <- ''
}
SentencesList
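# Equivalent one-pass sketch: rank the sentences in Sentence[[1]] by word
# count using lengths() over the tokenizers output (assumes the objects
# built above).
sentVec <- Sentence[[1]]
head(sentVec[order(lengths(tokenize_words(sentVec)), decreasing = TRUE)], 10)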
#convert to lowercase
textCorpus <- tm_map(textCorpus,content_transformer(tolower))
textCorpus
inspect(textCorpus)
#remove punctuation (the pattern keeps only letters and whitespace, so digits are stripped too)
removePunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
textCorpus <- tm_map(textCorpus, content_transformer(removePunct))
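# Quick illustrative check on a sample string: note that digits are removed
# along with the punctuation.
removePunct("Twenty Thousand Leagues - 20,000 of them!")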
#extract the document, now lowercased and stripped of punctuation
textCorpus1 <- textCorpus[[1]]
textCorpus1
inspect(textCorpus1)
str(textCorpus1)
#get the new document term matrix
dtm <- DocumentTermMatrix(textCorpus)
dtm
inspect(dtm)
str(dtm)
#remove stopwords (named engStopwords so the stopwords() function is not masked)
engStopwords <- tm::stopwords('english')
engStopwords
textCorpus <- tm_map(textCorpus, removeWords, engStopwords)
#extract the document, now without stopwords
textCorpus1 <- textCorpus[[1]]
textCorpus1$content
inspect(textCorpus1)
str(textCorpus1)
#get the new document term matrix
dtm <- DocumentTermMatrix(textCorpus)
dtm
inspect(dtm)
str(dtm)
#find frequent terms
freqTerms <- findFreqTerms(dtm,lowfreq=100)
freqTerms
#find words associated with a given term
shipAssoc <- findAssocs(dtm, "ship", 0.1)
shipAssoc
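# The same call works for any term in the dtm; for instance (illustrative),
# associations with "captain" at the same 0.1 correlation floor.
findAssocs(dtm, "captain", 0.1)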
#get the term document matrix
tdm <- TermDocumentMatrix(textCorpus)
tdm
inspect(tdm)
str(tdm)
#find the summed term frequency of text
stoptf <- rowSums(as.matrix(tdm))
stoptf
stoptfsub <- subset(stoptf, stoptf >= 60)
stoptfsub
#remove sparse terms
tdm <- removeSparseTerms(tdm,sparse=0.5)
tdm
inspect(tdm)
str(tdm)
#create a dendrogram
distMatrix <- dist(tdm)
dendrogram <- hclust(distMatrix,method="ward.D2")
plot(dendrogram)
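# Optionally cut the dendrogram into clusters (a sketch; k = 3 is an
# arbitrary choice, adjust to taste).
rect.hclust(dendrogram, k = 3)
head(cutree(dendrogram, k = 3))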
#sort frequent terms
stoptf <- sort(stoptf,decreasing = TRUE)
stoptf
#create wordcloud
colpal <- brewer.pal(9,"BuGn")
wordcloud(words = names(stoptf), freq = stoptf, min.freq = 10, random.order = FALSE, colors = colpal)
#find column frequencies
coltf <- colSums(as.matrix(dtm))
coltf
#order by frequencies in descending order
order <- order(coltf,decreasing=TRUE)
#find the most and least frequent terms
coltf[head(order)]
coltf[tail(order)]
#tokenization (convert the tm corpus to a quanteda corpus first; tokens()
#expects character data or a quanteda corpus, not a tm VCorpus)
toks1 <- tokens(corpus(textCorpus))
toks1
#construct a dfm of the text (named dfmat so quanteda's dfm() is not masked)
dfmat <- dfm(toks1, tolower = TRUE)
dfmat
#sentiment analysis
textdfcontent <- as.data.frame(textCorpus1$content)
textdfcontent
sentiment <- get_nrc_sentiment(textCorpus1$content)
sentiment
#analysis of sentiments: rowSums gives the number of sentiment hits per
#line of text, colSums the total count of each sentiment type
numSentiments <- rowSums(sentiment)
numSentiments
sentimentCount <- colSums(sentiment)
sentimentCount
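# A quick visual of the per-emotion totals (illustrative, base graphics).
barplot(sentimentCount, las = 2, main = "NRC sentiment counts")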
#document frequencies and weightings of the dfm
docFreqs <- docfreq(dfmat)
docFreqs
weights <- dfm_weight(dfmat)
weights
tfidf <- dfm_tfidf(dfmat, scheme_tf = "count", scheme_df = "inverse")
tfidf
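# Top-weighted features under tf-idf (a sketch using quanteda::topfeatures).
topfeatures(tfidf, 10)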
# E - zipfR
# Prepare the data for and create the frequency spectrum (spc): table()
# over the tdm counts gives, for each frequency m, the number of types Vm
testMattdm <- as.matrix(tdm)
tabMattdm <- as.matrix(table(testMattdm))
str(tabMattdm)
tabMattdm.df <- as.data.frame(tabMattdm)
tabMattdm.df
write.table(tabMattdm.df, file = "testTab.txt", sep = "\t",
row.names = TRUE, col.names = FALSE)
tstTab <- read.delim("testTab.txt", header = FALSE, sep = "\t")
colnames(tstTab)[1:2] <-c("m","Vm")
write.table(tstTab, file = "test2Tab.txt", sep = "\t",
row.names = FALSE, col.names = TRUE)
# create spc
freq.spc <- read.spc("test2Tab.txt")
## analysis of spc
# summary of the spc
summary(freq.spc)
# sample size
N(freq.spc)
# vocabulary size
V(freq.spc)
# Num of types per freq
Vm(freq.spc, 1)
Vm(freq.spc, 1:5)
# measure of productivity (num of hapax legomena / sample size)
Vm(freq.spc, 1)/ N(freq.spc)
# plot of the spc
plot(freq.spc)
plot(freq.spc, log ="x")
# scatterplot of the spc
with(freq.spc, plot(m, Vm, main="Frequency Spectrum"))
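# Optionally fit an LNRE model to the spectrum to extrapolate vocabulary
# growth (a sketch; the Zipf-Mandelbrot model "zm" is one common choice).
zm <- lnre("zm", freq.spc)
summary(zm)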
# F - NGram
# prepares the text for ngrams
SentencesList
# cleans up the text and changes it from list to character.
sen.unlist <- unlist(SentencesList, use.names = FALSE)
typeof(sen.unlist)
sens <- sen.unlist
sens.new <- vector()
for(i in sens) {
senStr <- preprocess(i, case = "lower", remove.punct = TRUE)
senStr
sens.new <- c(sens.new, senStr)
}
sens.new
# Collect every word longer than 6 characters into one space-separated string
sens.size6 <- character(0)
for(j in sens.new) {
sen.split <- strsplit(j, " ")[[1]]
for(word in sen.split) {
if (nchar(word) >= 7) {
sens.size6 <- paste(sens.size6, word, sep = " ")
}
}
}
sens.size6
# Create bigrams over the >6-character words
# from the ten longest sentences
sen2.ng <- ngram(sens.size6, n=2)
sen2.ng
print(sen2.ng, output="truncated")
# Create trigrams over the >6-character words
# from the ten longest sentences
sen3.ng <- ngram(sens.size6, n=3)
sen3.ng
print(sen3.ng, output="truncated")
# summarizing n2
head(get.phrasetable(sen2.ng))
# summarizing n3
head(get.phrasetable(sen3.ng))
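# For illustration, generate pseudo-text from the bigram model with
# ngram::babble (the genlen and seed values here are arbitrary).
babble(sen2.ng, genlen = 12, seed = 123)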
#part g
# tidytext principles: each variable has its own column, each observation
# its own row, and each value its own cell
# Import the document as a single string
dataDoc <- toString(read_lines("text/TwentyThousandLeagues.txt"))
# Read every text file in the directory with readtext (read_lines cannot
# glob paths or take docvarsfrom; readtext can)
rt <- readtext("text/*.txt", docvarsfrom = "filepaths")
rt
# -----Stringi-----
# Transform to lower case
dataDoc <- stri_trans_tolower(dataDoc)
# Remove tags
dataDoc <- stri_replace_all(dataDoc,"",regex = "<.*?>")
# Strip surrounding white space
dataDoc <- stri_trim(dataDoc)
dataDoc
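# Quick post-cleaning statistics (illustrative): stringi reports line and
# character counts, plus a word count.
stri_stats_general(dataDoc)
stri_count_words(dataDoc)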
# ----Quanteda----
# Tokenization
toks <- quanteda::tokens(dataDoc)
# Convert to lower case
toks <- tokens_tolower(toks)
# Remove stopwords (assign the result back; tokens_remove does not modify in place)
sw <- stopwords("english")
head(sw)
toks <- tokens_remove(toks, sw)
fulltext <- corpus(rt)
# Build the dfm via tokens(); newer quanteda versions no longer accept
# cleaning options passed directly to dfm()
dtm <- dfm(tokens(fulltext, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_wordstem(dfm_remove(dtm, stopwords("english")))
# Use quanteda accessors instead of raw S4 slots
docvars(dtm)
featnames(dtm)
# Document Frequency
doc_freq <- docfreq(dtm)
doc_freq
hist(doc_freq)
# Keep terms with document frequency >= 6
dtm6 <- dtm[, doc_freq >= 6]
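# Sanity check on the trimmed matrix (illustrative).
topfeatures(dtm6, 10)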
# TF-IDF (dfm_weight(x, "tfidf") was removed from quanteda; dfm_tfidf() is
# the supported call)
dtm1 <- dfm_tfidf(dtm)
dtm1
dataDoc
# Extract the feature (word) list
wrds <- featnames(dtm)
str(wrds)
# Get nrc values for each word
get_nrc_values(wrds)
# Explore Corpus Term Frequency Characteristics (tm's Zipf_plot on the
# TermDocumentMatrix built earlier)
Zipf_plot(tdm)