-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathword_usage.py
executable file
·162 lines (126 loc) · 4.48 KB
/
word_usage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/python3
import sys, getopt, re, os
from collections import defaultdict
input_file = ''
dictionary_file = ''
limit = 0
min_frequency = 0
max_frequency = 0
min_length = 0
max_length = 0
words_only = False
no_repeats = False
strip_apostrophes = False
non_dictionary_only = False
def clean(line):
# Convert curly apostrophes to straight
line = line.replace(u"\u2018", "'")
line = line.replace(u"\u2019", "'")
line = line.replace(u"\u0060", "'")
line = line.replace(u"\u00b4", "'")
# Filter out symbols
line = re.sub("[^a-zA-Z\u00c0-\u024f\u0370-\u1fff\u3040-\ufeff']", " ", line)
# Remove any double spaces introduced by last regex
line = re.sub("\ {2,}", " ", line)
line = line.lower()
return line
def clean_and_split(line, strip_apostrophes=False):
if strip_apostrophes:
# Remove apostrophes to split words
line = line.replace("'", " ")
line = line.replace('"', "")
line = clean(line)
return line.split()
def printhelp():
print('word_usage.py -i <input file> [-d <dictionary>] [--limit x] [--min-frequency x] [--max-frequency x] [--min-length x] [--max-length x] [--show-words-only] [--non-dictionary-words] [--strip-apostrophes] [--no-repeats]')
try:
opts, args = getopt.getopt(sys.argv[1:],"i:d:",["input=","dictionary=","limit=","min-frequency=","max-frequency=","min-length=","max-length=","show-words-only","strip-apostrophes","no-repeats","non-dictionary-words"])
except getopt.GetoptError:
printhelp()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
printhelp()
sys.exit()
elif opt in ("-i", "--input"):
input_file = arg
elif opt in ("-d", "--dictionary"):
dictionary_file = arg
elif opt == "--limit":
limit = int(arg)
elif opt == "--min-frequency":
min_frequency = int(arg)
elif opt == "--max-frequency":
max_frequency = int(arg)
elif opt == "--min-length":
min_length = int(arg)
elif opt == "--max-length":
max_length = int(arg)
elif opt == "--strip-apostrophes":
strip_apostrophes = True
elif opt == "--show-words-only":
words_only = True
elif opt == "--no-repeats":
no_repeats = True
elif opt == "--non-dictionary-words":
non_dictionary_only = True
word_dict = defaultdict(int)
if not os.path.exists(input_file):
printhelp()
sys.exit(2)
dictionary = set()
# Scan dictionary if the user specified it (assumes one word per line)
if dictionary_file:
with open(dictionary_file, encoding='utf-8') as f:
for line in f:
line = clean(line).strip()
if len(line) > 0:
dictionary.add(line)
# Scan sentences
with open(input_file, encoding='utf-8') as f:
for line in f:
words = clean_and_split(line, strip_apostrophes)
repeat_list = []
for w in words:
# Ignore apostrophes at start or end
if len(w) > 1 and w[:1] == "'":
w = w[1:]
if len(w) > 1 and w[-1] == "'":
w = w[:-1]
w_len = len(w)
if w_len > 0 and w != "'":
if w_len < min_length or (max_length > 0 and w_len > max_length):
continue
if no_repeats:
if w in repeat_list:
continue
repeat_list.append(w)
val = word_dict[w]
val += 1
word_dict[w] = val
if min_frequency == 0 and not non_dictionary_only and dictionary_file:
# Add dictionary words if they don't exist
for word in dictionary:
val = word_dict[word]
word_dict[word] = val
# Filter by min/max frequency
filtered_words = defaultdict(int)
for word in word_dict:
if (min_frequency == 0 or int(word_dict[word]) >= min_frequency) and \
(max_frequency == 0 or int(word_dict[word]) <= max_frequency):
# Check if it's in the dictionary
if non_dictionary_only and word in dictionary:
continue
filtered_words[word] = word_dict[word]
# Now sort by alphabetical order of word
sorted_words = sorted(filtered_words.items(), key=lambda x: x[0])
# Sort words by most to least frequent
sorted_words = sorted(sorted_words, key=lambda x: x[1], reverse=True)
# Set limit if specified
if limit > 0 and len(sorted_words) > limit:
sorted_words = sorted_words[:limit]
for word, num in sorted_words:
if words_only:
print(word)
else:
print("{} {}".format(word, num))