-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract_keywords.py
67 lines (54 loc) · 1.45 KB
/
extract_keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
import os
import os.path
from io import StringIO
import torch
import sys
import glob
import shutil
from textblob import TextBlob
from sacremoses import MosesPunctNormalizer
from collections import Counter
import string
# Characters deleted by remove_punctuation() when no explicit list is given.
# Hyphen and underscore are not in this list, so they survive the clean-up.
regular_punct = list('!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~')


def remove_punctuation(text, punct_list=regular_punct):
    """Return *text* with every entry of *punct_list* deleted.

    Each entry is removed wherever it occurs (entries are matched as
    substrings, so multi-character entries work too), and the result has
    leading and trailing whitespace stripped.

    Args:
        text: The string to clean.
        punct_list: Iterable of substrings to delete; defaults to
            ``regular_punct``. The default list is only read, never mutated.

    Returns:
        The cleaned, whitespace-trimmed string.
    """
    cleaned = text
    for mark in punct_list:
        # str.replace leaves the string untouched when ``mark`` is absent,
        # so no separate membership test is required.
        cleaned = cleaned.replace(mark, '')
    return cleaned.strip()
# Tokens that are never useful as keywords. Most are contractions as they
# appear once the apostrophe has been stripped (e.g. "I've" -> "ive").
STOP_PHRASES = frozenset({
    "ive", "im", "id", "yeah", "ill", "op", "thats", "dont", "theres",
    "hes", "shes", "well",
})

# Maximum number of phrases placed in the generated prompt.
MAX_KEYWORDS = 30


def collect_text(paths):
    """Return the contents of every file named by *paths*, newline-joined.

    Each entry may be a regular file or a directory. A directory contributes
    its direct child files only (sub-directories are not descended into), in
    sorted name order so the result is deterministic. Files are read as
    UTF-8 and joined with a newline so the last word of one file cannot fuse
    with the first word of the next.

    Args:
        paths: Iterable of file or directory paths, processed in order.

    Returns:
        The concatenated text, or an empty string if no file was read.

    Raises:
        FileNotFoundError: If an entry is neither a file nor a directory.
    """
    chunks = []
    for path in paths:
        if os.path.isfile(path):
            sources = [path]
        elif os.path.isdir(path):
            children = (
                os.path.join(path, name) for name in sorted(os.listdir(path))
            )
            sources = [child for child in children if os.path.isfile(child)]
        else:
            raise FileNotFoundError(f"File not found: {path}")
        for source in sources:
            with open(source, "r", encoding="utf-8") as handle:
                chunks.append(handle.read())
    return "\n".join(chunks)


def build_prompt(counter, limit=MAX_KEYWORDS, stop_phrases=STOP_PHRASES):
    """Return the most frequent phrases as a ``", "``-separated string.

    Args:
        counter: A ``collections.Counter`` mapping phrase -> frequency.
        limit: Maximum number of phrases to include (exactly this many at
            most; values below zero are treated as zero).
        stop_phrases: Phrases to leave out of the result.

    Returns:
        Up to *limit* phrases in descending frequency order, skipping empty
        phrases and anything in *stop_phrases*; ``""`` if nothing qualifies.
    """
    ranked = [
        phrase
        for phrase, _count in counter.most_common()
        if phrase and phrase not in stop_phrases
    ]
    return ", ".join(ranked[:max(limit, 0)])


def main(argv=None):
    """Print a keyword prompt built from the noun phrases of the input files.

    Args:
        argv: File/directory paths; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: ``1`` when no paths were given, otherwise ``0``.
    """
    paths = sys.argv[1:] if argv is None else list(argv)
    if not paths:
        print("No files provided.")
        return 1

    normalizer = MosesPunctNormalizer()
    # Normalise punctuation variants first, then drop punctuation entirely
    # before handing the text to the noun-phrase extractor.
    raw = remove_punctuation(normalizer.normalize(collect_text(paths)))
    phrases = TextBlob(raw).noun_phrases
    counter = Counter(phrase.strip() for phrase in phrases)
    print(build_prompt(counter))
    return 0


if __name__ == "__main__":
    sys.exit(main())