-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathviterbi.py
92 lines (81 loc) · 2.58 KB
/
viterbi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
__author__="Juliana Louback <[email protected]>"
import sys
from collections import defaultdict
import math
import logging
import emission_counts
import transition_counts
"""
Usage:
python viterbi.py ner.counts ngram.counts ner_dev.dat > [output_file]
Implementation of the Viterbi algorithm
Calculate emission e(x|y) and trigram probability based on data
in ner_counts,
Read ner_dev.dat, output prediction to [output_file]
"""
# Get Count(y), Count(x~>y), Count(bigram), Count (trigram)
count_xy, count_y = emission_counts.count(sys.argv[1])
bigram_counts, trigram_counts = transition_counts.count(sys.argv[2])
# Go through dev data, predict tag & compute probability based on model above
dev_data = file(sys.argv[3],'r')
log_probability = 0
# First round for q(*, *, y_1)
first_round = True
line = dev_data.readline()
while line:
word = line.strip()
# Check for end of sentence
if word == '':
sys.stdout.write('\n')
log_probability = 0
first_round = True
else:
# Check if there is an existing label associated to the word
if word in count_xy:
max_probability = 0
for label in list(count_xy[word]):
# Calculate e(x|y)
emission = float(count_xy[word][label]) / float(count_y[label])
# Calculate q(y| y_i-2, y_i-1)
# Check for first round
if first_round:
y_2 = '*'
y_1 = '*'
first_round = False
bigram = y_2 + ' ' + y_1
trigram = y_2 + ' ' + y_1 + ' ' + label
parameter = 0.0000000001
if trigram in trigram_counts:
parameter = float(trigram_counts[trigram])/float(bigram_counts[bigram])
probability = parameter*emission
if probability > max_probability:
max_probability = probability
arg_max = label
# If Count(x~>y) = 0, use _RARE_
else:
for label in list(count_xy['_RARE_']):
# Calculate e(_RARE_|y)
probability = 0
emission = float(count_xy['_RARE_'][label]) / float(count_y[label])
# Calculate q(y| y_i-2, y_i-1)
# Check for first round
if first_round:
y_2 = '*'
y_1 = '*'
first_round = False
bigram = y_2 + ' ' + y_1
trigram = y_2 + ' ' + y_1 + ' ' + label
parameter = 0.0000000001
if trigram in trigram_counts:
parameter = float(trigram_counts[trigram])/float(bigram_counts[bigram])
probability = parameter*emission
if probability > max_probability:
max_probability = probability
arg_max = label
log_probability = log_probability + math.log(max_probability)
sys.stdout.write("{} {} {}\n".format(word,arg_max,log_probability))
#Arrange next round of y_i-2, y_i-1
y_2 = y_1
y_1 = arg_max
line = dev_data.readline()
dev_data.close()