# Nltk.py
# NLTK demos: word-frequency distributions, concordances and n-gram collocations.
import nltk
from pprint import pprint
# Fetch the NLTK resources this script relies on. This runs at import time.
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
    # Recent NLTK releases load the Punkt models behind nltk.word_tokenize
    # from "punkt_tab" instead of the pickled "punkt" package. Both are
    # requested so tokenization works with either generation of NLTK.
    "punkt_tab",
])

# Alphabetic tokens of the State of the Union corpus.
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]

# English stopword list; the functions below filter their tokens against it.
stopwords = nltk.corpus.stopwords.words("english")
def FrequencyDistributions(corpus: str):
    """
    Print word-frequency summaries of ``corpus``.

    The text is tokenized with ``nltk.word_tokenize``; only alphabetic tokens
    that are not English stopwords are kept. Three frequency views are then
    printed, each as a top-10 list and as a table:
    (1) nltk.FreqDist over the tokens as written
    (2) nltk.FreqDist over the lower-cased tokens
    (3) nltk.Text().vocab() over the tokens as written

    Args:
        corpus: Raw text to analyse.
    """
    print(f"=== {FrequencyDistributions.__name__} ===")

    # Build the lookup set once: membership tests on a list are linear per token.
    stopword_set = set(stopwords)
    # Compare the lower-cased token so capitalised stopwords ("The", "We", "I")
    # are filtered as well; NLTK's English stopword entries are lower-case.
    words: list[str] = [
        w
        for w in nltk.word_tokenize(corpus)
        if w.isalpha() and w.lower() not in stopword_set
    ]

    frequencies = nltk.FreqDist(words)
    print(f"10 Most common: {frequencies.most_common(10)}")
    print("Tabulated (10):")
    frequencies.tabulate(10)

    # Count every occurrence, case-folded. Iterating the FreqDist itself would
    # yield each distinct word once and so only count case variants per word.
    frequencies_lower = nltk.FreqDist(w.lower() for w in words)
    print(f"\n10 Most common (lower-cased): {frequencies_lower.most_common(10)}")
    print("\nTabulated (10 lower-cased):")
    frequencies_lower.tabulate(10)

    text = nltk.Text(words)
    frequencies_text = text.vocab()  # Equivalent to fd = nltk.FreqDist(words)
    print(f"\n10 Most common (nltk.Text): {frequencies_text.most_common(10)}")
    print("Tabulated (10 nltk.Text):")
    frequencies_text.tabulate(10)
def ConcordanceCollocations():
    """
    Print concordances and n-gram collocations for the State of the Union corpus.

    In the context of NLP, a concordance is a collection of word locations
    along with their context. This function prints up to 10 concordance lines
    for "america" (once via ``Text.concordance`` and once from
    ``Text.concordance_list``), then the 10 most frequent bigrams, trigrams
    and quadgrams of the corpus after dropping non-alphabetic tokens and
    English stopwords.
    """
    print(f"=== {ConcordanceCollocations.__name__} ===")

    state_union_words = nltk.corpus.state_union.words()
    state_union_text: str = nltk.corpus.state_union.raw()

    text = nltk.Text(state_union_words)
    text.concordance("america", lines=10)
    concordances = text.concordance_list("america", lines=10)
    print("\nconcordance list:")
    for c in concordances:
        print(c.line)

    # Build the lookup set once: membership tests on a list are linear per token.
    stopword_set = set(stopwords)
    # Compare the lower-cased token so capitalised stopwords ("The", "We", "I")
    # do not leak into the n-grams; NLTK's English stopword entries are lower-case.
    words: list[str] = [
        w
        for w in nltk.word_tokenize(state_union_text)
        if w.isalpha() and w.lower() not in stopword_set
    ]

    bigrams = nltk.collocations.BigramCollocationFinder.from_words(words)
    print(f"\n10 Most common Bigrams: {bigrams.ngram_fd.most_common(10)}")
    print("Tabulated (10 Bigrams):")
    bigrams.ngram_fd.tabulate(10)

    trigrams = nltk.collocations.TrigramCollocationFinder.from_words(words)
    print(f"10 Most common Trigrams: {trigrams.ngram_fd.most_common(10)}")
    print("Tabulated (10 Trigrams):")
    trigrams.ngram_fd.tabulate(10)

    quadgrams = nltk.collocations.QuadgramCollocationFinder.from_words(words)
    print(f"10 Most common Quadgrams: {quadgrams.ngram_fd.most_common(10)}")
    print("Tabulated (10 Quadgrams):")
    quadgrams.ngram_fd.tabulate(10)
if __name__ == "__main__":
    # Script entry point: run the frequency demo on the raw State of the Union
    # text, then the concordance / collocation demo.
    state_union_raw = nltk.corpus.state_union.raw()
    FrequencyDistributions(state_union_raw)
    ConcordanceCollocations()