sentenceSimilarity.py
# encoding=utf-8
import gc
from gensim import corpora, models, similarities
from sentence import Sentence
from collections import defaultdict


class SentenceSimilarity():

    def __init__(self, seg):
        self.seg = seg

    def set_sentences(self, sentences):
        self.sentences = []
        for i in range(0, len(sentences)):
            self.sentences.append(Sentence(sentences[i], self.seg, i))

    # Return the segmented (tokenized) sentences
    def get_cuted_sentences(self):
        cuted_sentences = []
        for sentence in self.sentences:
            cuted_sentences.append(sentence.get_cuted_sentence())
        return cuted_sentences

    # Build the simple bag-of-words model that the more complex models are based on
    def simple_model(self, min_frequency=1):
        self.texts = self.get_cuted_sentences()
        # Drop low-frequency tokens
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [[token for token in text if frequency[token] > min_frequency] for text in self.texts]
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    # TF-IDF model
    def TfidfModel(self):
        self.simple_model()
        # Transform the bag-of-words corpus
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LSI model
    def LsiModel(self):
        self.simple_model()
        # Transform the bag-of-words corpus
        self.model = models.LsiModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LDA model
    def LdaModel(self):
        self.simple_model()
        # Transform the bag-of-words corpus
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # Map a raw sentence into the current model's vector space
    def sentence2vec(self, sentence):
        sentence = Sentence(sentence, self.seg)
        vec_bow = self.dictionary.doc2bow(sentence.get_cuted_sentence())
        return self.model[vec_bow]

    # Find the stored sentence most similar to the query
    def similarity(self, sentence):
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        sim = max(enumerate(sims), key=lambda item: item[1])
        index = sim[0]
        score = sim[1]
        sentence = self.sentences[index]
        sentence.set_score(score)
        return sentence
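

# --- Usage sketch (illustrative, not part of the original class) -------------
# A minimal example of the intended call sequence: index a list of candidate
# sentences, fit one of the models, then query for the closest match. It
# assumes `seg` is the word-segmenter object expected by Sentence (its
# construction depends on the rest of this project and is only stubbed here);
# the candidate list and query string are placeholder data.
if __name__ == "__main__":
    seg = None  # replace with the project's word segmenter instance
    candidate_sentences = ["example sentence one",
                           "example sentence two",
                           "a completely different sentence"]

    ss = SentenceSimilarity(seg)
    ss.set_sentences(candidate_sentences)
    ss.TfidfModel()  # or ss.LsiModel() / ss.LdaModel()

    best = ss.similarity("example query sentence")
    # `best` is the stored Sentence closest to the query; its similarity
    # score was attached via set_score() inside similarity().
    print(best)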