diff --git a/POS/POSTagger.py b/POS/POSTagger.py
new file mode 100644
--- /dev/null
+++ b/POS/POSTagger.py
@@ -0,0 +1,38 @@
+import pickle
+
+from POS.crf_pos_feature_helper import token2features
+
+
+class POSTagger:
+    """Part-of-speech tagger that wraps a pickled sequence model."""
+
+    def __init__(self, model_path):
+        """Load the pickled model stored at ``model_path``.
+
+        NOTE: unpickling can execute arbitrary code, so only load model
+        files that come from a trusted source.
+        """
+        self.model_path = model_path
+        # A context manager closes the file handle even if unpickling fails.
+        with open(model_path, "rb") as model_file:
+            self.crf = pickle.load(model_file)
+
+    def parse(self, token_stream):
+        """Tag a single sentence given as a list of tokens.
+
+        Returns a list of ``(token, tag)`` pairs in input order.
+        """
+        return self.parse_sentences([token_stream])[0]
+
+    def parse_sentences(self, list_of_token_stream):
+        """Tag several sentences, each given as a list of tokens.
+
+        Returns one list of ``(token, tag)`` pairs per input sentence.
+        """
+        features = [token2features(sent) for sent in list_of_token_stream]
+        predictions = self.crf.predict(features)
+        # Pair every token with the tag predicted for its position.
+        return [
+            list(zip(sent, tags))
+            for sent, tags in zip(list_of_token_stream, predictions)
+        ]
diff --git a/POS/__init__.py b/POS/__init__.py
new file mode 100644
index 0000000..10a2c23
--- /dev/null
+++ b/POS/__init__.py
@@ -0,0 +1,3 @@
+from .POSTagger import POSTagger
+from . import crf_pos_feature_helper
+from .util import *
\ No newline at end of file
diff --git a/POS/crf_pos_feature_helper.py b/POS/crf_pos_feature_helper.py
new file mode 100644
--- /dev/null
+++ b/POS/crf_pos_feature_helper.py
@@ -0,0 +1,88 @@
+import string
+
+from POS.util import template, isdigit, ngram
+
+
+def word2features(sent, i):
+    """Build the feature dict for the token at index ``i`` of ``sent``.
+
+    Feature names and values are intentionally kept exactly as before,
+    because ``POSTagger`` feeds them to an already-trained model; do not
+    "fix" the quirks flagged below without retraining that model.
+    """
+    W = sent[i]
+    features = {
+        'B': 1.0,
+        'W': W,
+        'P': W in string.punctuation,
+        'T': template(W),
+        'D(W)': isdigit(W),
+    }
+    # NOTE(review): ``max`` makes ``leng`` run from 0 up to at least 5;
+    # ``min`` may have been intended -- confirm before changing.
+    for leng in range(max(4 + 1, len(W)) + 1):
+        features.update(ngram(W, leng=leng))
+    if i > 0:
+        # NOTE(review): when tokens are plain strings, ``[0]`` keeps only the
+        # first character of the neighbour (also in the -2/+1/+2 windows).
+        W = sent[i - 1][0]
+        features.update({
+            '-1W[-3': W[-3:],
+            '-1W[-2': W[-2:],
+            '-1W[-1': W[-1:],
+            '-1W': W,
+            '-1W0W': W + sent[i],
+            '-1P': W in string.punctuation,
+            '-1T': template(W)
+        })
+    else:
+        features['BOS'] = True
+    if i > 1:
+        W = sent[i - 2][0]
+        features.update({
+            '-2W[-3': W[-3:],
+            '-2W[-2': W[-2:],
+            '-2W[-1': W[-1:],
+            '-2P': W in string.punctuation,
+            '-2T': template(W)
+        })
+
+    if i < len(sent) - 2:
+        W = sent[i + 2][0]
+        features.update({
+            '+2W[-1': W[-1:],
+            '+2W[-2': W[-2:],
+            '+2W': W,
+            '+2P': W in string.punctuation,
+            '+2T': template(W)
+        })
+    if i < len(sent) - 1:
+        W = sent[i + 1][0]
+        features.update({
+            '+1W[-1': W[-1:],
+            '+1W': W,
+            '+1W0W': W + sent[i],
+            '+1W[-2': W[-2:],
+            '+1:P': W in string.punctuation,
+            '+1:T': template(W)
+        })
+    else:
+        features['EOS'] = True
+    if 0 < i < len(sent) - 1:
+        features['-1W/+1W'] = sent[i + 1][0] + "/" + sent[i - 1][0]
+    return features
+
+
+def token2features(token_list):
+    """Return one feature dict per token of ``token_list``."""
+    return [word2features(token_list, i) for i in range(len(token_list))]
+
+
+def sent2labels(sent):
+    """Return the tags of a sentence given as ``(token, tag)`` pairs."""
+    return [postag for _token, postag in sent]
+
+
+def sent2tokens(sent):
+    """Return the tokens of a sentence given as ``(token, tag)`` pairs."""
+    return [token for token, _postag in sent]
diff --git a/POS/util.py b/POS/util.py
new file mode 100644
--- /dev/null
+++ b/POS/util.py
@@ -0,0 +1,72 @@
+from nltk import tree2conlltags
+
+
+def read_conll(path, col=2):
+    """Yield the sentences of a whitespace-separated CoNLL-style file.
+
+    Each sentence is a list of rows; a row holds the first ``col - 1``
+    fields of its line followed by the last field. A line with fewer than
+    two fields (for example a blank line) ends the current sentence.
+    """
+    with open(path, "r", encoding="utf-8") as conll:
+        out = []
+        for line in conll:
+            split = line.strip("\r\n").split()
+            if len(split) > 1:
+                out.append(split[:col - 1] + split[-1:])
+            else:
+                yield out
+                out = []
+        # Previously the last sentence was lost when the file did not end
+        # with a separator line; emit whatever is still buffered.
+        if out:
+            yield out
+
+
+def template(word):
+    """Return ``word`` with each character replaced by "a" or "x".
+
+    Characters found in the letter set below become "a", all others "x".
+    """
+    return "".join("a" if char in "آایو" else "x" for char in word)
+
+
+def isdigit(word):
+    """Return True if every character is one of the digits below or "."."""
+    return all(char in "۱۲۳۴۵۶۷۸۹۰1234567890." for char in word)
+
+
+def ngram(word, leng=2):
+    """Yield ``(feature_name, substring)`` pairs for slices of ``word``.
+
+    NOTE(review): the start index covers ``len(word) - 1`` positions
+    whatever ``leng`` is, so trailing slices can be shorter than ``leng``.
+    """
+    for i in range(len(word) - 1):
+        yield 'word[' + str(i) + ":" + str(i + leng) + "]", word[i:i + leng]
+
+
+def tree2brackets(tree):
+    """Render the chunks of ``tree`` as text such as ``[w1 w2 NP] w3``.
+
+    Tokens inside a chunk are wrapped as ``[`` ... ``TAG]``; tokens whose
+    IOB tag is ``O`` are emitted bare.
+    """
+    # ``pieces`` replaces the original local named ``str``, which shadowed
+    # the builtin; the joined result is identical.
+    pieces, tag = [], ''
+    for item in tree2conlltags(tree):
+        word, iob = item[0], item[2]
+        if iob[0] in {'B', 'O'} and tag:
+            pieces.append(tag + '] ')
+            tag = ''
+
+        if iob[0] == 'B':
+            tag = iob.split('-')[1]
+            pieces.append('[')
+        pieces.append(word + ' ')
+
+    if tag:
+        pieces.append(tag + '] ')
+
+    return ''.join(pieces).strip()
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f94d80b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+                 precision    recall  f1-score   support
+
+              N      0.985     0.970     0.977    186585
+              P      0.998     0.998     0.998     89450
+              V      0.999     0.999     0.999     87762
+            ADV      0.976     0.972     0.974     15983
+           ADVe      0.988     0.978     0.983      1053
+            RES      0.989     0.992     0.991      2784
+           RESe      1.000     0.989     0.994       174
+            DET      0.973     0.977     0.975     19786
+           DETe      0.960     0.970     0.965      2156
+             AJ      0.978     0.975     0.977     61526
+            AJe      0.949     0.964     0.957     19919
+             CL      0.932     0.918     0.925      1892
+            INT      1.000     1.000     1.000        73
+           CONJ      0.996     0.997     0.997     74796
+          CONJe      1.000     1.000     1.000        82
+          POSTP      1.000     1.000     1.000     13174
+            PRO      0.973     0.974     0.973     23094
+           PROe      0.878     0.579     0.698       273
+            NUM      0.988     0.992     0.990     24864
+           NUMe      0.932     0.918     0.925      2519
+           PUNC      1.000     1.000     1.000     84088
+             Ne      0.970     0.985     0.977    163760
+             Pe      0.986     0.992     0.989     10004
+
+    avg / total      0.985     0.985     0.985    885797
+
+ 
\ No newline at end of file
diff --git a/model/perpos.model b/model/perpos.model
new file mode 100644
index 0000000..9428804
Binary files /dev/null and b/model/perpos.model differ
diff --git a/sample.py b/sample.py
new file mode 100644
--- /dev/null
+++ b/sample.py
@@ -0,0 +1,12 @@
+import os
+
+from POS.POSTagger import POSTagger
+
+# Locate the bundled model relative to this script rather than through a
+# developer-specific absolute path, so the sample runs from any checkout.
+MODEL_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "model", "perpos.model"
+)
+
+pos_tagger = POSTagger(MODEL_PATH)
+print(pos_tagger.parse("به گزارش گروه بین الملل خبرگزاری فارس، «بشار اسد» رئیس جمهور سوریه در جمع روزنامه نگاران روس گفت که هر گونه عملیات نظامی در سوریه بدون موافقت مقامات این کشور، تجاوز است؛ چه در رقه باشد یا در سایر مناطق.".split()))