# split_words_by_morphemes.py
# Train (or load) a Morfessor model and split every word of a corpus into
# morphemes.
import morfessor as m
import argparse
import sys
import io
def segment_word(model, word):
    """Return the list of segments (morphemes) for a single word.

    The word is first looked up with ``model.segment``. If that call raises,
    ``model.viterbi_segment`` is tried instead, and if that raises as well
    the word is returned unsplit, so one problematic token never aborts the
    whole run.

    Args:
        model: Object exposing ``segment(word)`` and
            ``viterbi_segment(word)``; the first item of the value returned
            by ``viterbi_segment`` is used as the list of segments.
        word: The token to split.

    Returns:
        A list of segment strings.
    """
    try:
        return model.segment(word)
    except Exception:
        # Deliberate best-effort fallback chain. ``Exception`` (rather than
        # a bare ``except``) keeps Ctrl-C and SystemExit working.
        try:
            return model.viterbi_segment(word)[0]
        except Exception:
            return [word]


def segment_sentence(model, sentence):
    """Split every whitespace-separated word of ``sentence`` into segments.

    Args:
        model: Segmentation model, see ``segment_word``.
        sentence: One line of text.

    Returns:
        All segments of all words, in order, joined by single spaces.
        Each word contributes its segments exactly once.
    """
    segments = []
    for word in sentence.split():
        segments.extend(segment_word(model, word))
    return ' '.join(segments)


def write_utf8_lines(stream, lines):
    """Write each line, followed by a newline, to a binary stream as UTF-8.

    Args:
        stream: Writable object accepting bytes.
        lines: Iterable of text lines without trailing newline.
    """
    for line in lines:
        stream.write((line + '\n').encode('utf-8'))


def main():
    """Train or load a Morfessor model and write the segmented corpus.

    The segmented corpus goes to the file given with ``-output_corpus`` or,
    when that option is absent, to standard output. Output is always UTF-8.
    Progress messages go to standard error.
    """
    parser = argparse.ArgumentParser(description='Trains a morfessor model given a corpus, and splits all words in the corpus according to the morfessor model. You can also provide a Morfessor binary model file.')
    parser.add_argument('corpus', metavar='corpus', type=str,
                        help='The text corpus, one sentence per line')
    parser.add_argument('-model', metavar='model', type=str, default=None,
                        help='Morfessor model file path (binary file)')
    parser.add_argument('-output_corpus', metavar='output_corpus', type=str, default=None,
                        help='The output corpus file path')
    parser.add_argument('-save_model', metavar='save_model', type=str, default=None,
                        help='Saves the model as a binary model file (provide file path)')
    parser.add_argument('-maxepochs', metavar='maxepochs', type=int, default=None,
                        help='Maximum iterations for training the model (default= no maximum, stop untill convergence)')
    args = parser.parse_args()

    mio = m.io.MorfessorIO()

    if args.model:
        # A model was provided: load it instead of training.
        sys.stderr.write("reading Morfessor model...\n")
        model = mio.read_any_model(args.model)
    else:
        # No model given: train one on the corpus, and possibly save it.
        sys.stderr.write("-- training Morfessor model --\n")
        sys.stderr.write("reading corpus\n")
        model = m.baseline.BaselineModel()
        model.load_data(mio.read_corpus_file(args.corpus))
        sys.stderr.write("training model\n")
        if args.maxepochs:
            sys.stderr.write('max epochs:' + str(args.maxepochs) + "\n")
            model.train_batch(max_epochs=args.maxepochs)
        else:
            model.train_batch()
        if args.save_model:
            sys.stderr.write("writing model to file:" + args.save_model + "\n")
            mio.write_binary_model_file(args.save_model, model)

    # Read the whole corpus up front and close it before any output is
    # written, so the output path may safely be the same as the input path.
    with io.open(args.corpus, 'r', encoding='utf-8') as corpus_file:
        sentences = [line for line in corpus_file.read().split('\n') if line != ""]

    segmented = (segment_sentence(model, sentence) for sentence in sentences)

    if args.output_corpus:
        sys.stderr.write('writing segmented corpus to ' + args.output_corpus + '\n')
        with open(args.output_corpus, 'wb') as fout:
            write_utf8_lines(fout, segmented)
    else:
        # Python 3 exposes the byte stream as ``sys.stdout.buffer``; on
        # Python 2 ``sys.stdout`` itself accepts bytes.
        write_utf8_lines(getattr(sys.stdout, 'buffer', sys.stdout), segmented)


if __name__ == '__main__':
    main()