-
Notifications
You must be signed in to change notification settings - Fork 1
/
scoring.py
124 lines (107 loc) · 4.48 KB
/
scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
''' Text Keyword Match'''
# --------------------------------
# Date : 19-06-2020
# Project : Text Keyword Match
# Category : NLP/NLTK sentence Scoring
# Company : weblineindia
# Department : AI/ML
# --------------------------------
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Request the 'omw-1.4' NLTK resource every time this module is imported.
# NOTE(review): the stopwords / WordNet / tokenizer data used below is not
# downloaded here - presumably it is expected to be installed already; confirm.
nltk.download('omw-1.4')
# Module-wide lemmatizer instance, used by scoreText.preProcessText.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set; preProcessText tests tokens against it.
stop_words = set(stopwords.words('english'))
class scoreText(object):
    """
    A class used to score sentences based on the input keyword.

    Each processing method is best-effort: on any error it prints a message
    and returns ``None`` instead of raising, so callers must be prepared for
    a ``None`` result.
    """

    # Characters removed by cleanText, applied in one str.translate pass.
    _PUNCTUATION_TABLE = str.maketrans("", "", '-()"#/@&^*;:<>{}`+=~|!?,')

    # n-gram weight vectors for the 1-, 2-, 3- and 4-gram BLEU variants.
    _BLEU_WEIGHTS = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (0.33, 0.33, 0.34, 0),
        (0.25, 0.25, 0.25, 0.25),
    )

    def __init__(self):
        self.sentences = []

    def cleanText(self, sentences):
        """
        Eliminates the duplicates and cleans the text.

        Exact duplicates are dropped, keeping the first occurrence of each
        sentence in its original position, so the output order is the same
        on every run. De-duplication happens before the punctuation is
        stripped.

        Args:
            sentences: iterable of sentence strings.

        Returns:
            List of cleaned sentences, or None if an error occurred.
        """
        try:
            # dict.fromkeys keeps insertion order; set() would reorder the
            # sentences between runs because of string hash randomisation.
            uniqueSentences = dict.fromkeys(sentences)
            return [text.translate(self._PUNCTUATION_TABLE)
                    for text in uniqueSentences]
        except Exception as e:
            print("Error occured in text clean", e)
            return None

    def preProcessText(self, sentences):
        """
        Tokenization of sentence and lemmatization of words.

        Args:
            sentences: a single string (one sentence or the keyword phrase).

        Returns:
            List of lemmatized tokens with the tokens found in the
            module-level ``stop_words`` set removed, or None if an error
            occurred.
        """
        try:
            # Tokenize words in a sentence
            word_tokens = word_tokenize(sentences)
            # Lemmatization of the words that are not stop words
            return [lemmatizer.lemmatize(w)
                    for w in word_tokens if w not in stop_words]
        except Exception as e:
            print("Error occured in text preprocessing", e)
            return None

    # similarity of subject
    def scoreText(self, keyword, sentences):
        """
        Compares sentences with keyword with bleu scoring technique.

        Args:
            keyword: keyword phrase used as the BLEU reference.
            sentences: iterable of sentence strings to score.

        Returns:
            List of ``[score, cleaned_sentence]`` pairs, one per unique
            sentence, or None if an error occurred.
        """
        try:
            # Remove symbols from text
            sentences = self.cleanText(sentences)
            # Tokenization and lemmatization of the keyword; the resulting
            # token list is the single reference for every sentence.
            reference = [self.preProcessText(keyword)]
            # Loop-invariant, so it is built once instead of per sentence.
            smoothing = SmoothingFunction().method1

            scoredSentencesList = []
            for sentence in sentences:
                # Tokenization and lemmatization of the sentence
                wordlist = self.preProcessText(sentence)
                # sentence_bleu is evaluated with 1-gram, 2-gram, 3-gram and
                # 4-gram weightings ...
                bleu_1, bleu_2, bleu_3, bleu_4 = (
                    sentence_bleu(reference, wordlist, weights=weights,
                                  smoothing_function=smoothing)
                    for weights in self._BLEU_WEIGHTS
                )
                # ... and a weighted average favouring the higher orders is
                # taken as the score of the sentence.
                bleu_score = (4 * bleu_4 + 3 * bleu_3 +
                              2 * bleu_2 + bleu_1) / 10
                # append the score with sentence to the list
                scoredSentencesList.append([bleu_score, sentence])
            return scoredSentencesList
        except Exception as e:
            print("Error occured in score text", e)
            return None

    def sortText(self, scoredText, top_n=3):
        """
        Returns the top scored sentences.

        Args:
            scoredText: iterable of ``[score, sentence]`` pairs.
            top_n: maximum number of sentences to return (default 3);
                values below zero are treated as zero.

        Returns:
            Up to ``top_n`` sentences ordered by descending score (equal
            scores keep their input order), or None if an error occurred.
        """
        try:
            ranked = sorted(scoredText, key=lambda x: x[0], reverse=True)
            return [pair[1] for pair in ranked[:max(top_n, 0)]]
        except Exception as e:
            print("Error occured in sorting text", e)
            return None

    def sentenceMatch(self, keyword, paragraph, top_n=3):
        """
        Converts paragraph into list and calls scoreText and sortText functions,
        and returns the most matching sentences with the keywords.

        Args:
            keyword: keyword phrase to match against.
            paragraph: text that is split into sentences with sent_tokenize.
            top_n: maximum number of sentences to return (default 3).

        Returns:
            Up to ``top_n`` best matching sentences, or None if an error
            occurred.
        """
        try:
            sentencesList = sent_tokenize(paragraph)
            scoredSentence = self.scoreText(keyword, sentencesList)
            return self.sortText(scoredSentence, top_n)
        except Exception as e:
            print("Error occured in sentence match", e)
            return None