Skip to content

Commit

Permalink
init commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mhbashari committed Mar 21, 2017
1 parent bef66d2 commit 7f1a561
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 0 deletions.
20 changes: 20 additions & 0 deletions POS/POSTagger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pickle

from POS.crf_pos_feature_helper import token2features


class POSTagger:
    """Part-of-speech tagger backed by a pickled sequence-labelling model.

    The unpickled object only needs to expose ``predict``, which is called
    with one feature sequence per sentence and must return one tag sequence
    per sentence.
    """

    def __init__(self, model_path):
        """Load the pickled model stored at ``model_path``.

        NOTE: unpickling can execute arbitrary code embedded in the file, so
        only load model files that come from a trusted source.
        """
        self.model_path = model_path
        # Context manager guarantees the handle is closed, even when
        # unpickling raises.
        with open(model_path, "rb") as model_file:
            self.crf = pickle.load(model_file)

    def parse(self, token_stream):
        """Tag one tokenized sentence and return its ``(token, tag)`` pairs."""
        return self.parse_sentences([token_stream])[0]

    def parse_sentences(self, list_of_token_stream):
        """Tag several tokenized sentences with a single ``predict`` call.

        Returns one list of ``(token, tag)`` pairs per input sentence, in the
        same order as the input.
        """
        features = [token2features(tokens) for tokens in list_of_token_stream]
        predictions = self.crf.predict(features)
        return [
            list(zip(tokens, tags))
            for tokens, tags in zip(list_of_token_stream, predictions)
        ]
3 changes: 3 additions & 0 deletions POS/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .POSTagger import POSTagger
from . import crf_pos_feature_helper
from .util import *
76 changes: 76 additions & 0 deletions POS/crf_pos_feature_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import string

from POS.util import template, isdigit, ngram


def word2features(sent, i):
    """Build the feature dict for the entry at position ``i`` of ``sent``.

    The dict holds features of the current entry (identity, punctuation flag,
    character template, digit flag, substring features) plus context features
    taken from up to two entries on either side.  ``BOS``/``EOS`` flags are
    set when there is no previous/next entry.

    NOTE(review): neighbours are read as ``sent[k][0]`` while the current
    entry is read as ``sent[i]``.  For plain string tokens this means only the
    first character of each neighbour is used -- confirm this matches how the
    shipped model was trained before changing it.
    """
    token = sent[i]
    features = {
        'B': 1.0,
        'W': token,
        'P': token in string.punctuation,
        'T': template(token),
        'D(W)': isdigit(token),
    }
    # Substring features for every window length from 0 up to
    # max(5, len(token)), inclusive.
    for size in range(max(4 + 1, len(token)) + 1):
        features.update(ngram(token, leng=size))

    # Previous entry.
    if i > 0:
        prev_head = sent[i - 1][0]
        features['-1W[-3'] = prev_head[-3:]
        features['-1W[-2'] = prev_head[-2:]
        features['-1W[-1'] = prev_head[-1:]
        features['-1W'] = prev_head
        features['-1W0W'] = prev_head + sent[i]
        features['-1P'] = prev_head in string.punctuation
        features['-1T'] = template(prev_head)
    else:
        features['BOS'] = True

    # Entry two positions back.
    if i > 1:
        prev2_head = sent[i - 2][0]
        features['-2W[-3'] = prev2_head[-3:]
        features['-2W[-2'] = prev2_head[-2:]
        features['-2W[-1'] = prev2_head[-1:]
        features['-2P'] = prev2_head in string.punctuation
        features['-2T'] = template(prev2_head)

    # Entry two positions ahead (inserted before the +1 features, as before).
    if i < len(sent) - 2:
        next2_head = sent[i + 2][0]
        features['+2W[-1'] = next2_head[-1:]
        features['+2W[-2'] = next2_head[-2:]
        features['+2W'] = next2_head
        features['+2P'] = next2_head in string.punctuation
        features['+2T'] = template(next2_head)

    # Next entry.
    if i < len(sent) - 1:
        next_head = sent[i + 1][0]
        features['+1W[-1'] = next_head[-1:]
        features['+1W'] = next_head
        features['+1W0W'] = next_head + sent[i]
        features['+1W[-2'] = next_head[-2:]
        features['+1:P'] = next_head in string.punctuation
        features['+1:T'] = template(next_head)
    else:
        features['EOS'] = True

    # Joint feature over both immediate neighbours (next first, then previous).
    if 0 < i < len(sent) - 1:
        features['-1W/+1W'] = sent[i + 1][0] + "/" + sent[i - 1][0]
    return features


def token2features(token_list):
    """Return one ``word2features`` dict per position of ``token_list``."""
    feature_dicts = []
    for position in range(len(token_list)):
        feature_dicts.append(word2features(token_list, position))
    return feature_dicts


def sent2labels(sent):
    """Return the second element (the tag) of every 2-item entry in ``sent``."""
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels


def sent2tokens(sent):
    """Return the first element (the token) of every 2-item entry in ``sent``."""
    tokens = []
    for word, _label in sent:
        tokens.append(word)
    return tokens
48 changes: 48 additions & 0 deletions POS/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from nltk import tree2conlltags


def read_conll(path, col=2):
    """Yield sentences from a whitespace-separated, CoNLL-style file.

    Each line with more than one field becomes a row made of its first
    ``col - 1`` fields followed by its last field.  Any other line (blank, or
    holding a single field) ends the current sentence, which is yielded as a
    list of rows; consecutive separator lines therefore yield empty lists.

    A sentence still pending at end of file is yielded as well, so the last
    sentence is not lost when the file lacks a trailing separator line.

    :param path: path of the UTF-8 encoded file to read.
    :param col: number of fields to keep per row (leading fields + last one).
    """
    leading_count = col - 1
    with open(path, "r", encoding="utf-8") as conll:
        sentence = []
        # Iterate the file lazily instead of loading every line up front.
        for line in conll:
            fields = line.strip("\r\n").split()
            if len(fields) > 1:
                sentence.append(fields[:leading_count] + fields[-1:])
            else:
                yield sentence
                sentence = []
        # Flush the final sentence when no separator line follows it.
        if sentence:
            yield sentence


def template(word):
    """Return the character template of ``word``.

    Every character found in the literal set below maps to "a"; every other
    character maps to "x".  The result has one symbol per input character.
    """
    return "".join("a" if char in "آایو" else "x" for char in word)


def isdigit(word):
    """Return True when every character of ``word`` is a Persian or ASCII
    digit or a dot.

    An empty ``word`` has no offending character and therefore yields True.
    """
    for char in word:
        if char not in "۱۲۳۴۵۶۷۸۹۰1234567890.":
            return False
    return True


def ngram(word, leng=2):
    """Yield ``(feature_name, substring)`` pairs for windows of ``word``.

    A window of width ``leng`` is taken at every start index from 0 up to
    ``len(word) - 2``; the number of windows does not depend on ``leng``, so
    windows near the end may be shorter than ``leng``.  The feature name
    encodes the slice bounds as ``word[start:end]``.
    """
    start = 0
    last_start = len(word) - 1
    while start < last_start:
        end = start + leng
        yield 'word[{}:{}]'.format(start, end), word[start:end]
        start += 1


def tree2brackets(tree):
    """Render a chunk tree as a bracketed string.

    The tree is flattened with ``tree2conlltags``; each item is indexed as
    ``item[0]`` (the word) and ``item[2]`` (the chunk tag, e.g. ``B-NP``,
    ``I-NP`` or ``O``).  A chunk opens with ``[`` at its ``B-`` item and is
    closed with ``<label>]`` when the next ``B``/``O`` item, or the end of the
    input, is reached.

    :param tree: chunk tree accepted by ``tree2conlltags``.
    :return: the bracketed sentence, stripped of surrounding whitespace.
    """
    # Collect the pieces and join once; this also avoids shadowing the
    # builtin ``str`` with a local accumulator.
    parts = []
    open_label = ''
    for item in tree2conlltags(tree):
        marker = item[2][0]
        # A new chunk or an outside token closes whatever chunk is open.
        if marker in {'B', 'O'} and open_label:
            parts.append(open_label + '] ')
            open_label = ''

        if marker == 'B':
            open_label = item[2].split('-')[1]
            parts.append('[')
        parts.append(item[0] + ' ')

    # Close a chunk that runs to the end of the sentence.
    if open_label:
        parts.append(open_label + '] ')

    return ''.join(parts).strip()
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
precision recall f1-score support

N 0.985 0.970 0.977 186585
P 0.998 0.998 0.998 89450
V 0.999 0.999 0.999 87762
ADV 0.976 0.972 0.974 15983
ADVe 0.988 0.978 0.983 1053
RES 0.989 0.992 0.991 2784
RESe 1.000 0.989 0.994 174
DET 0.973 0.977 0.975 19786
DETe 0.960 0.970 0.965 2156
AJ 0.978 0.975 0.977 61526
AJe 0.949 0.964 0.957 19919
CL 0.932 0.918 0.925 1892
INT 1.000 1.000 1.000 73
CONJ 0.996 0.997 0.997 74796
CONJe 1.000 1.000 1.000 82
POSTP 1.000 1.000 1.000 13174
PRO 0.973 0.974 0.973 23094
PROe 0.878 0.579 0.698 273
NUM 0.988 0.992 0.990 24864
NUMe 0.932 0.918 0.925 2519
PUNC 1.000 1.000 1.000 84088
Ne 0.970 0.985 0.977 163760
Pe 0.986 0.992 0.989 10004

avg / total 0.985 0.985 0.985 885797


Binary file added model/perpos.model
Binary file not shown.
4 changes: 4 additions & 0 deletions sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import os

from POS.POSTagger import POSTagger

# Locate the bundled model relative to this script instead of relying on a
# machine-specific absolute path, so the sample runs from any checkout.
MODEL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "model", "perpos.model")

pos_tagger = POSTagger(MODEL_PATH)
# Tag a whitespace-tokenized sample sentence and print the (token, tag) pairs.
print(pos_tagger.parse("به گزارش گروه بین الملل خبرگزاری فارس، «بشار اسد» رئیس جمهور سوریه در جمع روزنامه نگاران روس گفت که هر گونه عملیات نظامی در سوریه بدون موافقت مقامات این کشور، تجاوز است؛ چه در رقه باشد یا در سایر مناطق.".split()))

0 comments on commit 7f1a561

Please sign in to comment.