-
Notifications
You must be signed in to change notification settings - Fork 10
/
test_text2vec.R
49 lines (38 loc) · 1.87 KB
/
test_text2vec.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Load the saved workspace; the rest of this script expects it to provide
# `shakes_words`, which is used below through its `id` and `word` columns.
load('data/shakes_words_df_4text2vec.RData')
library(tidyverse)
library(text2vec)

# Build a named list with one vector of words per `id`: drop stop words,
# split the remaining rows by `id`, then keep only the `word` column.
shakes_words_as_list <- shakes_words %>%
  # Join on `word` explicitly so the key does not depend on whichever column
  # names the two tables happen to share, and dplyr prints no "Joining" note.
  # NOTE(review): assumes `word` is the only column shared with stop_words.
  anti_join(tidytext::stop_words, by = "word") %>%
  split(.$id) %>%
  map(~ pull(.x, word))
# `shakes_words_as_list` already holds the tokens for each document, so no
# word_tokenizer() pass is needed before building the token iterator.
it <- itoken(shakes_words_as_list, progressbar = FALSE)

# Vocabulary of unigrams (single words), pruned with term_count_min = 5L.
vocab <- prune_vocabulary(create_vocabulary(it), term_count_min = 5L)

# Vectorizer restricted to the pruned vocabulary.
vectorizer <- vocab_vectorizer(vocab)

# Term co-occurrence matrix built with a skip-gram window of 3.
# (An earlier comment said 5; the value actually passed is 3.)
tcm <- create_tcm(it, vectorizer, skip_grams_window = 3)
# Set RcppParallel's thread count to 6 before fitting.
RcppParallel::setThreadOptions(numThreads = 6)

# GloVe model with 10-dimensional word vectors, fitted for 100 iterations on
# the term co-occurrence matrix.
# NOTE(review): `word_vectors_size` / `vocabulary` look like the older text2vec
# constructor arguments; newer releases may expect `rank` instead -- confirm
# against the installed text2vec version.
glove <- GloVe$new(word_vectors_size = 10, vocabulary = vocab, x_max = 10)
word_vectors_main <- glove$fit_transform(x = tcm, n_iter = 100)

# glove$get_word_vectors() is deprecated, so the context vectors are read
# from the `components` field instead.
word_vectors_context <- glove$components

# Final embeddings: main vectors plus the transposed context vectors.
word_vectors <- word_vectors_main + t(word_vectors_context)
# Helpers for the similarity look-ups below, so the sim2() settings and the
# ranking logic live in one place instead of being repeated for every query.

# Return the row of `embeddings` named `term` as a 1-row matrix, stopping with
# a readable message when `term` is not one of the row names.
term_vector <- function(term, embeddings = word_vectors) {
  stopifnot(is.character(term), length(term) == 1L)
  if (!term %in% rownames(embeddings)) {
    stop("Term not found in word vectors: ", term, call. = FALSE)
  }
  embeddings[term, , drop = FALSE]
}

# Cosine similarity (method = "cosine", norm = "l2") between every row of
# `embeddings` and the rows of `query`.
cosine_to_vocab <- function(query, embeddings = word_vectors) {
  sim2(x = embeddings, y = query, method = "cosine", norm = "l2")
}

# The `n` largest values in the first column of `sims`, in decreasing order.
top_similar <- function(sims, n = 5L) {
  head(sort(sims[, 1], decreasing = TRUE), n)
}

# Terms closest to romeo + juliet.
test <- term_vector("romeo") + term_vector("juliet")
cos_sim <- cosine_to_vocab(test)
top_similar(cos_sim)

# Terms closest to romeo - juliet.
test <- term_vector("romeo") - term_vector("juliet")
cos_sim <- cosine_to_vocab(test)
top_similar(cos_sim)

# Terms closest to romeo.
test <- term_vector("romeo")
cos_sim <- cosine_to_vocab(test)
top_similar(cos_sim)

# Terms closest to hamlet (top 10 rather than 5).
test <- term_vector("hamlet")
cos_sim <- cosine_to_vocab(test)
top_similar(cos_sim, n = 10L)

# Terms closest to juliet.
test <- term_vector("juliet")
cos_sim <- cosine_to_vocab(test)
top_similar(cos_sim)