-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtf-idf.py
40 lines (30 loc) · 1.26 KB
/
tf-idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# DATA BLOCK
# Sample corpus passed to main(): each line of the string is one document,
# and the words within a document are separated by spaces.
text = '''he really really loves coffee
my sister dislikes coffee
my sister loves tea'''
import math
def main(text):
    """Print one tf-idf vector per line of *text*.

    Each line of ``text`` is treated as a document and split on whitespace
    into words.  For every document a list is printed that holds the tf-idf
    weight of each vocabulary word.  The vocabulary is sorted, so column
    ``k`` refers to the same word in every printed vector and in every run.

    The quantities used are::

        tf(w, d)    = occurrences of w in d / number of words in d
        df(w)       = documents containing w / number of documents
        tfidf(w, d) = tf(w, d) * log10(1 / df(w))

    Args:
        text: The corpus as a single string, one document per line.  A blank
            line counts as an empty document and yields an all-zero vector.

    Returns:
        None.  The vectors are written to standard output, one per document.
    """
    # split the text first into lines and then into lists of words
    docs = [line.split() for line in text.splitlines()]
    N = len(docs)

    # create the vocabulary: every word that appears at least once.
    # It is built from the tokenised documents so that each word is guaranteed
    # to occur in some document (df > 0), and sorted because the iteration
    # order of a bare set of strings can change from one run to the next.
    vocabulary = sorted({word for doc in docs for word in doc})

    df = {}
    tf = {}
    for word in vocabulary:
        # tf: number of occurrences of the word in a document divided by the
        # document length.  tf[word] is a list with one entry per document,
        # e.g. tf['he'][0] is the term frequency of 'he' in the first document.
        # An empty document has no words, so its term frequency is 0.0
        # (dividing by its length would raise ZeroDivisionError).
        tf[word] = [doc.count(word) / len(doc) if doc else 0.0 for doc in docs]

        # df: fraction of the documents that contain the word
        df[word] = sum(word in doc for doc in docs) / N

    # calculate and print the tf-idf values, one vector per document
    for doc_index in range(N):
        # the inverse document frequency uses the base 10 logarithm
        tfidf = [
            tf[word][doc_index] * math.log(1 / df[word], 10)
            for word in vocabulary
        ]
        print(tfidf)
# Script entry point: compute and print the tf-idf vectors for the sample
# corpus only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main(text)