tensor_decomp_twitter.py
import scipy.io
import numpy as np
import collections
import matlab.engine
import gensim
import argparse
import joblib
import codecs
import os
from nltk.tokenize import word_tokenize
from string import punctuation
import re
from scipy.spatial.distance import cdist
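
# Dependency notes (assumptions, not verified here):
# - `matlab.engine` needs a local MATLAB installation with the MATLAB Engine API for
#   Python; `decompose_tensors` also calls a `TensorDecomposition` MATLAB function,
#   which is presumably provided alongside this script and must be on the MATLAB path.
# - `word_tokenize` requires the NLTK "punkt" tokenizer data (nltk.download("punkt")).
# - The word2vec branch below expects ./GoogleNews-vectors-negative300.bin and uses the
#   gensim 3.x KeyedVectors API (`.vocab`, `.word_vec`).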


def BuildVocabulary(documents):
    """Build a word-to-index map from tokenized documents.

    Index 0 is reserved for <PAD> and 1 for <UNK>; real words start at 2,
    ordered by frequency. Also returns the length of the longest document.
    """
    max_sentence_len = 0
    word_frequency = collections.Counter()
    for words in documents:
        if len(words) > max_sentence_len:
            max_sentence_len = len(words)
        for word in words:
            word_frequency[word] += 1
    max_vocab = len(word_frequency)
    word2index = {x[0]: i + 2 for i, x in enumerate(word_frequency.most_common(max_vocab))}
    word2index["<PAD>"] = 0
    word2index["<UNK>"] = 1
    return word2index, max_sentence_len


def Sentence2Index(documents, word2index):
    """Convert tokenized documents into lists of vocabulary indices."""
    X = []
    for words in documents:
        sequence = []
        for word in words:
            if word in word2index:
                sequence.append(word2index[word])
            else:
                sequence.append(word2index["<UNK>"])
        X.append(sequence)
    return X
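
# Illustrative example (exact indices depend on frequency ties): for
#   documents = [["good", "dog"], ["good", "cat"]]
# BuildVocabulary reserves <PAD>=0 and <UNK>=1 and assigns the remaining indices by
# frequency, e.g. {"good": 2, "dog": 3, "cat": 4}, and Sentence2Index then maps the
# documents to [[2, 3], [2, 4]], with unseen words falling back to the <UNK> index.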


class TwoDimDict:
    """Sparse two-dimensional accumulator backed by a dict of dicts."""

    def __init__(self):
        self.data = {}

    def add(self, i, j, val):
        """Add `val` to cell (i, j), creating the cell if it does not exist."""
        if i in self.data and j in self.data[i]:
            self.data[i][j] += val
        else:
            if i in self.data:
                self.data[i][j] = val
            else:
                self.data[i] = {}
                self.data[i][j] = val

    def get_item(self):
        """Yield (i, j, value) triples for all stored cells."""
        for i, i_val in self.data.items():
            for j, j_val in self.data[i].items():
                yield (i, j, j_val)
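
# Example (sketch): repeated additions to the same cell accumulate,
#   d = TwoDimDict(); d.add(2, 3, 1.0); d.add(2, 3, 1.0); d.add(4, 2, 1.0)
#   list(d.get_item())  # -> [(2, 3, 2.0), (4, 2, 1.0)]
# which is how the per-document co-occurrence counts are collected below.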


def decompose_tensors(documents, win_size=5, cp_rank=150):
    """
    Compute document embeddings via tensor decomposition, following
    https://www.cs.ucr.edu/~epapalex/papers/asonam18-fakenews.pdf
    :param documents: the documents to be decomposed, tokenized
    :type documents: Iterable[Iterable[str]]
    :param win_size: the window size to use for word co-occurrence. Looks win_size tokens to the left and win_size to the right of each token
    :type win_size: int
    :param cp_rank: the rank of the CP decomposition (dimensionality of the returned document vectors)
    :type cp_rank: int
    :return: the document embedding matrix produced by MATLAB and the word2index vocabulary
    """
    vocabulary, sentence_len = BuildVocabulary(documents)
    doc_size = len(documents)
    vocab_size = len(vocabulary)
    X = Sentence2Index(documents, vocabulary)
    coord_list = []
    val_list = []
    for k in range(doc_size):
        word_word_dict = TwoDimDict()
        # print('build word_word_doc tensor, {:d}/{:d} ...'.format(k, doc_size))
        # note: i starts at 1, so the first token is used only as a context word
        for i in range(1, len(documents[k])):
            if X[k][i] == 0:  # 0 is the <PAD> index; stop if padding is encountered
                break
            for j in range(1, win_size + 1):
                left_win_idx = i - j
                right_win_idx = i + j
                for win_idx in [left_win_idx, right_win_idx]:  # check the window both to the left and to the right
                    if (win_idx >= 0) and (win_idx < len(X[k])) and (X[k][win_idx] != 0):
                        word_word_dict.add(X[k][i], X[k][win_idx], 1.0)
        # collect (word, word, document) coordinates and counts for the sparse tensor
        for item in word_word_dict.get_item():
            coord_list.append((item[0], item[1], k))
            val_list.append(item[2])
    # hand the sparse tensor to MATLAB for the CP decomposition
    scipy.io.savemat('tmp_tensor_info.mat',
                     dict(coord_list=coord_list, val_list=val_list, vocab_size=vocab_size, doc_size=doc_size))
    eng = matlab.engine.start_matlab()
    eng.TensorDecomposition(cp_rank, nargout=0)
    eng.quit()
    doc2vec = scipy.io.loadmat('tmp_doc2vec_mat.mat')
    os.remove("tmp_tensor_info.mat")
    os.remove("tmp_doc2vec_mat.mat")
    return doc2vec['doc2vec'], vocabulary
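
# Example usage (sketch; assumes MATLAB and the TensorDecomposition function are
# available, and the result shapes below are inferred from how the output is used
# in __main__):
#   tokenized_docs = [["funny", "tweet", "text"], ["another", "tweet"]]
#   doc_vectors, word2index = decompose_tensors(tokenized_docs, win_size=5, cp_rank=150)
#   # doc_vectors is expected to have one row per document (a cp_rank-dimensional
#   # embedding); word2index is the vocabulary mapping from BuildVocabulary.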


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--root_path", help="root path", default='./data/SemEval/')
    parser.add_argument("--method", help="embedding method: 'tensor' or 'word2vec'", default='tensor')
    args = parser.parse_args()

    punc = re.compile(f"[{punctuation}]+")
    midnight = re.compile("@midnight", flags=re.I)
    semeval_dir = args.root_path
    evaluation_dir = os.path.join(semeval_dir, "evaluation_dir/evaluation_data")
    prediction_dir = os.path.join(semeval_dir, "predictions")
    # create the output directories (taska/taskb) up front so the writes below succeed
    os.makedirs(os.path.join(prediction_dir, "taska"), exist_ok=True)
    os.makedirs(os.path.join(prediction_dir, "taskb"), exist_ok=True)

    for f in os.listdir(evaluation_dir):
        if not f.endswith(".tsv"):
            continue
        filename = os.path.join(evaluation_dir, f)
        print(f)
        # strip the file's own hashtag (and @midnight mentions) from the tweet text
        hashtag = re.compile(f"#{f[:-4].replace('_', '')}", flags=re.I)
        labels = []
        texts = []
        ids = []
        with codecs.open(filename, "r", encoding="utf-8") as file:
            for line in file:
                line = line.split("\t")
                ids.append(line[0])
                texts.append(midnight.sub("", hashtag.sub("", line[1])))
                labels.append(int(line[2]))
        # tokenize and drop punctuation-only tokens
        texts = [[word for word in word_tokenize(text) if not punc.fullmatch(word)] for text in texts]

        if args.method == 'word2vec':
            # average pre-trained word vectors per tweet (gensim 3.x API: .vocab / .word_vec)
            model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
            decomp = np.zeros((len(texts), 300))
            for i, text in enumerate(texts):
                counter = 0
                for word in text:
                    if word in model.vocab:
                        decomp[i] += model.word_vec(word)
                        counter += 1
                if counter > 0:  # avoid dividing by zero for tweets with no in-vocabulary words
                    decomp[i] /= counter
        elif args.method == 'tensor':
            decomp, vocab_map = decompose_tensors(texts)

        # rank tweets by distance to the centroid of the embeddings, most central first
        center = np.mean(decomp, axis=0)
        distances = cdist(decomp, [center]).ravel()
        ranked = sorted(zip(ids, labels, distances), key=lambda x: x[2])
        global_predictions = list(zip(*ranked))[0]

        with open(os.path.join(prediction_dir, 'taska', f"{f[:-4]}_PREDICT.tsv"), "w") as of:
            for i, id1 in enumerate(global_predictions[:-1]):
                for id2 in global_predictions[i + 1:]:
                    of.write("{}\t{}\t1\n".format(str(id1), str(id2)))
        with open(os.path.join(prediction_dir, 'taskb', f"{f[:-4]}_PREDICT.tsv"), "w") as of:
            of.write("\n".join(global_predictions))
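
# Expected layout (inferred from the paths above): each
# <root_path>/evaluation_dir/evaluation_data/*.tsv file holds one hashtag's tweets as
# "id<TAB>text<TAB>label" lines. Per file, pairwise predictions ("id1<TAB>id2<TAB>1"
# over the ranked ids) are written to <root_path>/predictions/taska/ and the full
# ranking (most central id first) to <root_path>/predictions/taskb/. Example run:
#   python tensor_decomp_twitter.py --root_path ./data/SemEval/ --method tensor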