-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_kmeans.py
66 lines (56 loc) · 2.67 KB
/
run_kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import random
import time
from pprint import pp
import kmeans
import library
if __name__ == '__main__':
    t0 = time.time()

    # --- Run configuration ---------------------------------------------------
    docs_dir = '../input/CORD-19-research-challenge/document_parses/pdf_json'
    num_docs = 200000        # cap on documents loaded from the saved project
    k_value = 10             # number of k-means clusters
    num_clusterings = 1      # how many independent clusterings to run

    # Load the preprocessed corpus (vocab + per-doc term vectors).
    t = time.time()
    vocab, docs = library.load_project('data_after_removing_words', num_docs=num_docs, random_files=False)
    print(f"loaded {len(docs):,} docs {time.time()-t:.2f}")

    # Recompute word frequencies over just the loaded sub-corpus.
    t = time.time()
    sub_corpus_freqs = kmeans.sub_corpus_frequencies(docs)
    print(f"recalc frequencies {time.time()-t:.2f}")

    # Run the clustering one or more times; all analysis below uses the
    # `results` of the LAST run only (earlier runs just print their sizes).
    for _ in range(num_clusterings):
        results = kmeans.find_clusters(docs, k_value)
        print(
            'sorted cluster sizes:',
            sorted((len(cluster) for cluster in results.clusters), reverse=True),
            f"(required {results.iterations} iterations)"
        )
    pp(kmeans.timings, indent=1)

    # Annotate each cluster with its size and its most characteristic words,
    # then order largest-first for reporting.
    clusters = []
    for cluster in results.clusters:
        clusters.append({
            'cluster': cluster,
            'length': len(cluster),
            'common_words': kmeans.common_words_in_cluster(cluster, sub_corpus_freqs, vocab),
        })
    clusters.sort(key=lambda c: c['length'], reverse=True)

    # --- Per-cluster report --------------------------------------------------
    for index, cluster in enumerate(clusters):
        print(f"------- Cluster {index} -------")
        print(f"Size: {cluster['length']:,}")
        print("Defining words:")
        # word tuples appear to be (text, lift, frequency) — formatted as percentages.
        print(" - " + ", ".join([f"{word[0]} {round(word[2]*100)}% (+{round(word[1]*100)}%)" for word in cluster['common_words'][:10]]))
        print('5 random papers')
        # BUG FIX: random.sample raises ValueError if the population is smaller
        # than the sample size; small clusters (< 5 docs) previously crashed here.
        sample = random.sample(cluster['cluster'], min(5, len(cluster['cluster'])))
        for doc in sample:
            print(f" - {library.get_doc_title_from_filename(doc[0], docs_dir)}")
            print(kmeans.doc_sorted_tfidf_words(doc, vocab['words'])[:10])

    # --- Whole-corpus statistics ---------------------------------------------
    freqs_list = [(library.lookup_word(word, vocab), sub_corpus_freqs[word]) for word in sub_corpus_freqs]
    freqs_sample = random.sample(freqs_list, 5)
    freqs_top_10 = sorted(freqs_list, key=lambda word: word[1], reverse=True)[:10]
    print(f"------- Corpus Stats -------")
    print("10 most frequent words")
    print(freqs_top_10)
    print("Random sample of word frequencies")
    print(freqs_sample)

    # NOTE(review): these words are assumed to exist in vocab['index'] and
    # sub_corpus_freqs; a missing word raises KeyError — confirm the vocabulary
    # always contains them, or guard with .get() if output format may change.
    lookup_words = ['coronavirus', 'covid19', 'government', 'policy', 'respiratory']
    print("Frequencies of some meaningful words:")
    print(", ".join([f"{word} {sub_corpus_freqs[vocab['index'][word]]}" for word in lookup_words]))

    # Persist the annotated clusters; assumes a 'clusters/' directory exists
    # (save_clusters may or may not create it — TODO confirm).
    kmeans.save_clusters(clusters, f"clusters/clusters-{len(docs)}-{k_value}-{round(time.time())}.pickle")
    print(f"total time: {time.time() - t0}")