-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
180 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import pickle | ||
|
||
from POS.crf_pos_feature_helper import token2features | ||
|
||
|
||
class POSTagger:
    """Tag token sequences using a model loaded from a pickle file.

    The unpickled object is stored in ``self.crf`` and is only required to
    expose ``predict(X)``, returning one label sequence per feature sequence
    (presumably an sklearn-crfsuite ``CRF`` -- confirm against the training
    code).
    """

    def __init__(self, model_path):
        """Load the pickled model stored at ``model_path``.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code while
        unpickling; only point this at model files from a trusted source.

        Raises:
            OSError: if ``model_path`` cannot be opened.
        """
        self.model_path = model_path
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(model_path, "rb") as model_file:
            self.crf = pickle.load(model_file)

    def parse(self, token_stream):
        """Tag a single sentence.

        Args:
            token_stream: sequence of tokens forming one sentence.

        Returns:
            A list of ``(token, label)`` pairs.
        """
        return self.parse_sentences([token_stream])[0]

    def parse_sentences(self, list_of_token_stream):
        """Tag several sentences in one call to the model.

        Args:
            list_of_token_stream: iterable of sentences, each a sequence of
                tokens.

        Returns:
            One list per sentence, each holding ``(token, label)`` pairs.
        """
        # Materialise once: the sentences are walked twice (feature
        # extraction, then pairing with predictions), which would silently
        # yield nothing for a one-shot iterator.
        sentences = list(list_of_token_stream)
        features = [token2features(sentence) for sentence in sentences]
        predictions = self.crf.predict(features)
        return [
            list(zip(tokens, labels))
            for tokens, labels in zip(sentences, predictions)
        ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .POSTagger import POSTagger | ||
from . import crf_pos_feature_helper | ||
from .util import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import string | ||
|
||
from POS.util import template, isdigit, ngram | ||
|
||
|
||
def word2features(sent, i):
    """Build the feature dictionary for position ``i`` of ``sent``.

    The dictionary describes the current token (identity, punctuation flag,
    ``template`` shape, ``isdigit`` flag and ``ngram`` substrings) and up to
    two neighbours on each side. ``BOS`` / ``EOS`` flags are set when there is
    no left / right neighbour.

    NOTE(review): neighbours are read as ``sent[k][0]``; when tokens are plain
    strings that is only their first character. This looks like a leftover
    from ``(token, tag)`` input, but it is kept as is because a model trained
    on these features depends on the exact values -- confirm before changing.

    Args:
        sent: sequence of tokens.
        i: index of the token to describe.

    Returns:
        dict mapping feature names to values.
    """
    current = sent[i]
    last = len(sent) - 1

    feats = {
        'B': 1.0,  # constant feature present for every token
        'W': current,
        'P': current in string.punctuation,
        'T': template(current),
        'D(W)': isdigit(current),
    }

    # Substring features for every window size from 0 up to
    # max(5, len(current)).
    # NOTE(review): ``max`` (not ``min``) is what the original code uses.
    for size in range(max(4 + 1, len(current)) + 1):
        feats.update(ngram(current, leng=size))

    # Left context, one position back.
    if i > 0:
        prev1 = sent[i - 1][0]
        feats['-1W[-3'] = prev1[-3:]
        feats['-1W[-2'] = prev1[-2:]
        feats['-1W[-1'] = prev1[-1:]
        feats['-1W'] = prev1
        feats['-1W0W'] = prev1 + current
        feats['-1P'] = prev1 in string.punctuation
        feats['-1T'] = template(prev1)
    else:
        feats['BOS'] = True

    # Left context, two positions back.
    if i > 1:
        prev2 = sent[i - 2][0]
        feats['-2W[-3'] = prev2[-3:]
        feats['-2W[-2'] = prev2[-2:]
        feats['-2W[-1'] = prev2[-1:]
        feats['-2P'] = prev2 in string.punctuation
        feats['-2T'] = template(prev2)

    # Right context, two positions ahead.
    if i < last - 1:
        next2 = sent[i + 2][0]
        feats['+2W[-1'] = next2[-1:]
        feats['+2W[-2'] = next2[-2:]
        feats['+2W'] = next2
        feats['+2P'] = next2 in string.punctuation
        feats['+2T'] = template(next2)

    # Right context, one position ahead.
    if i < last:
        next1 = sent[i + 1][0]
        feats['+1W[-1'] = next1[-1:]
        feats['+1W'] = next1
        feats['+1W0W'] = next1 + current
        feats['+1W[-2'] = next1[-2:]
        feats['+1:P'] = next1 in string.punctuation
        feats['+1:T'] = template(next1)
    else:
        feats['EOS'] = True

    # Joint feature over both immediate neighbours (next first, then previous).
    if 0 < i < last:
        feats['-1W/+1W'] = next1 + "/" + prev1

    return feats
|
||
|
||
def token2features(token_list):
    """Return the ``word2features`` dictionary for every position of ``token_list``."""
    features = []
    for position in range(len(token_list)):
        features.append(word2features(token_list, position))
    return features
|
||
|
||
def sent2labels(sent):
    """Return the second element (the tag) of every ``(token, tag)`` pair in ``sent``."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels
|
||
|
||
def sent2tokens(sent):
    """Return the first element (the token) of every ``(token, tag)`` pair in ``sent``."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from nltk import tree2conlltags | ||
|
||
|
||
def read_conll(path, col=2):
    """Lazily read sentences from a whitespace-separated, CoNLL-style file.

    Every line is split on whitespace. A line with more than one field adds a
    row to the current sentence, made of its first ``col - 1`` fields followed
    by its last field. Any other line (blank or single-field) ends the current
    sentence, which is then yielded -- even when it is empty, e.g. for
    consecutive blank lines.

    Unlike the earlier version, a final sentence that is not followed by a
    separator line is yielded too instead of being silently dropped.

    Args:
        path: path of the UTF-8 encoded file to read.
        col: number of columns to keep per row (``col - 1`` leading fields
            plus the last field).

    Yields:
        One list of rows (each a list of strings) per sentence.
    """
    with open(path, "r", encoding="utf-8") as conll:
        out = []
        # Iterate the file object directly so the whole file is never held in
        # memory at once.
        for sent in conll:
            split = sent.strip("\r\n").split()
            if len(split) > 1:
                out.append(split[:col - 1] + split[-1:])
            else:
                yield out
                out = []
        # The file may end without a trailing separator line; do not lose the
        # rows collected for the last sentence.
        if out:
            yield out
|
||
|
||
def template(word):
    """Map ``word`` to a shape string of the same length.

    Each character found in ``"آایو"`` becomes ``"a"``; every other character
    becomes ``"x"``.
    """
    marked = "آایو"
    return "".join("a" if char in marked else "x" for char in word)
|
||
|
||
def isdigit(word):
    """Return True when every character of ``word`` is a listed digit or ``"."``.

    The accepted characters are the ones in the literal below. An empty
    ``word`` returns True because there is no offending character.
    """
    allowed = "۱۲۳۴۵۶۷۸۹۰1234567890."
    for symbol in word:
        if symbol not in allowed:
            return False
    return True
|
||
|
||
def ngram(word, leng=2):
    """Yield ``(feature_name, substring)`` pairs for windows of ``word``.

    For every start index ``i`` in ``range(len(word) - 1)`` the pair is
    ``("word[i:i+leng]", word[i:i + leng])``.

    NOTE(review): the number of start positions does not depend on ``leng``,
    so windows near the end can be shorter than ``leng``. Left unchanged
    because the emitted names and values are used as model features.
    """
    start_count = len(word) - 1
    for start in range(start_count):
        stop = start + leng
        yield "word[{}:{}]".format(start, stop), word[start:stop]
|
||
|
||
def tree2brackets(tree):
    """Render a chunk tree as a flat string with bracketed chunks.

    The tree is flattened with ``tree2conlltags``. Words of a chunk are
    emitted as ``[w1 w2 TAG]``, where ``TAG`` is the part after ``-`` in the
    chunk's ``B-`` label; words whose label starts with ``O`` are emitted
    bare. Leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    open_tag = ''
    for word, _pos, iob in tree2conlltags(tree):
        marker = iob[0]

        # A new chunk or an outside token closes the chunk that is still open.
        if marker in {'B', 'O'} and open_tag:
            pieces.append(open_tag + '] ')
            open_tag = ''

        if marker == 'B':
            open_tag = iob.split('-')[1]
            pieces.append('[')
        pieces.append(word + ' ')

    # Close a chunk that runs to the end of the sentence.
    if open_tag:
        pieces.append(open_tag + '] ')

    return ''.join(pieces).strip()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
precision recall f1-score support | ||
|
||
N 0.985 0.970 0.977 186585 | ||
P 0.998 0.998 0.998 89450 | ||
V 0.999 0.999 0.999 87762 | ||
ADV 0.976 0.972 0.974 15983 | ||
ADVe 0.988 0.978 0.983 1053 | ||
RES 0.989 0.992 0.991 2784 | ||
RESe 1.000 0.989 0.994 174 | ||
DET 0.973 0.977 0.975 19786 | ||
DETe 0.960 0.970 0.965 2156 | ||
AJ 0.978 0.975 0.977 61526 | ||
AJe 0.949 0.964 0.957 19919 | ||
CL 0.932 0.918 0.925 1892 | ||
INT 1.000 1.000 1.000 73 | ||
CONJ 0.996 0.997 0.997 74796 | ||
CONJe 1.000 1.000 1.000 82 | ||
POSTP 1.000 1.000 1.000 13174 | ||
PRO 0.973 0.974 0.973 23094 | ||
PROe 0.878 0.579 0.698 273 | ||
NUM 0.988 0.992 0.990 24864 | ||
NUMe 0.932 0.918 0.925 2519 | ||
PUNC 1.000 1.000 1.000 84088 | ||
Ne 0.970 0.985 0.977 163760 | ||
Pe 0.986 0.992 0.989 10004 | ||
|
||
avg / total 0.985 0.985 0.985 885797 | ||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from POS.POSTagger import POSTagger | ||
|
||
# Demo: load the model from a fixed local path, tag one whitespace-tokenised
# sample sentence and print the resulting (token, label) pairs.
MODEL_PATH = "/home/hassan/PycharmProjects/perpos_git/model/perpos.model"
SAMPLE_SENTENCE = "به گزارش گروه بین الملل خبرگزاری فارس، «بشار اسد» رئیس جمهور سوریه در جمع روزنامه نگاران روس گفت که هر گونه عملیات نظامی در سوریه بدون موافقت مقامات این کشور، تجاوز است؛ چه در رقه باشد یا در سایر مناطق."

pos_tagger = POSTagger(MODEL_PATH)
print(pos_tagger.parse(SAMPLE_SENTENCE.split()))