-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathn_gram_counter.rb
52 lines (40 loc) · 1.04 KB
/
n_gram_counter.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Counts word n-grams (runs of n consecutive words) found in a text file.
#
# Words are the maximal runs of characters matched by WORD_PATTERN (word
# characters, hyphens and apostrophes). Matching is case-sensitive, so
# "The" and "the" are treated as different words.
class NgramFinder
  # Pattern used to pull individual words out of the input text.
  WORD_PATTERN = /[\w\-\']+/

  # @return [Hash{String => Integer}] counts produced by the most recent call
  #   to #find_ngrams (empty before the first call); unknown keys read as 0
  attr_reader :ngram_freqs

  # Reads the whole file eagerly and splits it into words.
  #
  # @param file [String] path of the text file to analyse
  # @raise [SystemCallError] if the file cannot be read (raised by File.read)
  def initialize(file)
    @words_array = extract_words(file)
    @ngram_freqs = Hash.new(0)
  end

  # Counts every n-gram of length +n+ in the text.
  #
  # The counts are rebuilt from scratch on each call, so calling this method
  # repeatedly (with the same or a different +n+) never mixes or doubles
  # results from earlier calls.
  #
  # @param n [Integer] number of words per n-gram; must be >= 1
  # @return [Hash{String => Integer}] space-joined n-gram => occurrences;
  #   empty when the text has fewer than +n+ words
  # @raise [ArgumentError] if +n+ is not a positive Integer
  def find_ngrams(n = 3)
    unless n.is_a?(Integer) && n >= 1
      raise ArgumentError, "n must be a positive integer, got #{n.inspect}"
    end

    @ngram_freqs = count_ngrams(n)
  end

  # Counts n-grams and returns them ordered from most to least frequent.
  #
  # Ties are broken by the n-gram text in ascending order so that the result
  # is deterministic (sort_by alone gives no guarantee for equal keys).
  #
  # @param n [Integer] number of words per n-gram; must be >= 1
  # @return [Array<Array(String, Integer)>] [ngram, count] pairs
  # @raise [ArgumentError] if +n+ is not a positive Integer
  def find_and_sort_ngrams_by_frequency(n = 3)
    find_ngrams(n).sort_by { |ngram, count| [-count, ngram] }
  end

  private

  # Returns every word in the file, in order of appearance.
  def extract_words(file)
    File.read(file).scan(WORD_PATTERN)
  end

  # Builds a fresh frequency table for all windows of +n+ consecutive words.
  # each_cons yields nothing when there are fewer than +n+ words.
  def count_ngrams(n)
    @words_array.each_cons(n).each_with_object(Hash.new(0)) do |words, freqs|
      freqs[words.join(' ')] += 1
    end
  end
end
# Driver code
#
# Usage: ruby n_gram_counter.rb [FILE] [N]
#   FILE - path of the text to analyse (defaults to sample/input.txt)
#   N    - words per n-gram (when omitted, the method's own default is used)
#
# Prints the ten most frequent n-grams as [ngram, count] pairs.
file = ARGV[0] || 'sample/input.txt'
ngram_finder = NgramFinder.new(file)

# Integer(str, 10) rejects non-numeric input such as "abc" or "3x" with an
# ArgumentError, whereas String#to_i would silently turn it into 0 or 3.
length_args = ARGV[1] ? [Integer(ARGV[1], 10)] : []

p top_ten = ngram_finder.find_and_sort_ngrams_by_frequency(*length_args).take(10)