init commit

mhbashari · Mar 21, 2017 · 7f1a561 · 7f1a561
1 parent bef66d2
commit 7f1a561
Show file tree

Hide file tree

Showing 7 changed files with 180 additions and 0 deletions.
diff --git a/POS/POSTagger.py b/POS/POSTagger.py
@@ -0,0 +1,20 @@
+import pickle
+
+from POS.crf_pos_feature_helper import token2features
+
+
+class POSTagger:
+    """Part-of-speech tagger backed by a pickled sequence model.
+
+    The unpickled object is only required to expose ``predict``, which is
+    called with one feature list per sentence and must return one label
+    sequence per sentence.
+    """
+
+    def __init__(self, model_path):
+        """Load the pickled model stored at ``model_path``.
+
+        SECURITY: unpickling runs arbitrary code embedded in the file, so
+        ``model_path`` must only ever point at a trusted model.
+        """
+        self.model_path = model_path
+        # Close the handle deterministically instead of leaving it to the
+        # garbage collector.
+        with open(model_path, "rb") as model_file:
+            self.crf = pickle.load(model_file)
+
+    def parse(self, token_stream):
+        """Tag one tokenised sentence.
+
+        Returns a list of ``(token, tag)`` pairs in input order.
+        """
+        return self.parse_sentences([token_stream])[0]
+
+    def parse_sentences(self, list_of_token_stream):
+        """Tag several tokenised sentences with a single ``predict`` call.
+
+        Returns one list of ``(token, tag)`` pairs per input sentence.
+        """
+        # Materialise once: the sentences are walked twice (feature
+        # extraction, then pairing), which would exhaust a one-shot iterable.
+        sentences = list(list_of_token_stream)
+        features = [token2features(tokens) for tokens in sentences]
+        predictions = self.crf.predict(features)
+        return [
+            list(zip(tokens, tags))
+            for tokens, tags in zip(sentences, predictions)
+        ]
diff --git a/POS/__init__.py b/POS/__init__.py
@@ -0,0 +1,3 @@
+from .POSTagger import POSTagger
+from . import crf_pos_feature_helper
+from .util import *
diff --git a/POS/crf_pos_feature_helper.py b/POS/crf_pos_feature_helper.py
@@ -0,0 +1,76 @@
+import string
+
+from POS.util import template, isdigit, ngram
+
+
+def word2features(sent, i):
+    """Build the feature dictionary for the entry at position ``i`` of ``sent``.
+
+    The dictionary holds features of the current entry (the entry itself,
+    punctuation flag, shape template, digit flag, character n-grams) plus
+    context features taken from up to two neighbours on each side, and the
+    ``BOS`` / ``EOS`` markers at the sentence edges.
+
+    NOTE(review): neighbours are read as ``sent[i +/- k][0]`` while the
+    current entry is read as ``sent[i]``. If the entries are plain strings
+    (the current entry is concatenated with ``str`` values below), the
+    neighbour value is only its first character. Any model trained with
+    these features depends on the exact values, so confirm before changing.
+    """
+    current = sent[i]
+    features = {
+        'B': 1.0,
+        'W': current,
+        'P': current in string.punctuation,
+        'T': template(current),
+        'D(W)': isdigit(current),
+    }
+
+    # n-gram features for every length from 0 up to max(5, len(current)).
+    longest = max(4 + 1, len(current))
+    for size in range(longest + 1):
+        features.update(ngram(current, leng=size))
+
+    # Previous neighbour, or the beginning-of-sentence marker.
+    if i > 0:
+        prev1 = sent[i - 1][0]
+        features['-1W[-3'] = prev1[-3:]
+        features['-1W[-2'] = prev1[-2:]
+        features['-1W[-1'] = prev1[-1:]
+        features['-1W'] = prev1
+        features['-1W0W'] = prev1 + sent[i]
+        features['-1P'] = prev1 in string.punctuation
+        features['-1T'] = template(prev1)
+    else:
+        features['BOS'] = True
+
+    # Second neighbour to the left.
+    if i > 1:
+        prev2 = sent[i - 2][0]
+        features['-2W[-3'] = prev2[-3:]
+        features['-2W[-2'] = prev2[-2:]
+        features['-2W[-1'] = prev2[-1:]
+        features['-2P'] = prev2 in string.punctuation
+        features['-2T'] = template(prev2)
+
+    # Second neighbour to the right.
+    if i < len(sent) - 2:
+        next2 = sent[i + 2][0]
+        features['+2W[-1'] = next2[-1:]
+        features['+2W[-2'] = next2[-2:]
+        features['+2W'] = next2
+        features['+2P'] = next2 in string.punctuation
+        features['+2T'] = template(next2)
+
+    # Next neighbour, or the end-of-sentence marker.
+    if i < len(sent) - 1:
+        next1 = sent[i + 1][0]
+        features['+1W[-1'] = next1[-1:]
+        features['+1W'] = next1
+        features['+1W0W'] = next1 + sent[i]
+        features['+1W[-2'] = next1[-2:]
+        features['+1:P'] = next1 in string.punctuation
+        features['+1:T'] = template(next1)
+    else:
+        features['EOS'] = True
+
+    # Joint feature over both direct neighbours (interior positions only).
+    if 0 < i < len(sent) - 1:
+        features['-1W/+1W'] = sent[i + 1][0] + "/" + sent[i - 1][0]
+    return features
+
+
+def token2features(token_list):
+    """Return one ``word2features`` dictionary per position of ``token_list``."""
+    positions = range(len(token_list))
+    return [word2features(token_list, position) for position in positions]
+
+
+def sent2labels(sent):
+    """Return the second element of every 2-item pair in ``sent``."""
+    return [label for _word, label in sent]
+
+
+def sent2tokens(sent):
+    """Return the first element of every 2-item pair in ``sent``."""
+    return [word for word, _label in sent]
diff --git a/POS/util.py b/POS/util.py
@@ -0,0 +1,48 @@
+from nltk import tree2conlltags
+
+
+def read_conll(path, col=2):
+    """Yield the sentences of a whitespace-separated column file one by one.
+
+    Every line with more than one field contributes one row made of its
+    first ``col - 1`` fields followed by its last field. Any other line
+    (blank, or holding a single field) ends the current sentence, which is
+    then yielded as a list of rows. Consecutive separator lines therefore
+    yield empty lists.
+
+    A sentence still pending at end of file is yielded as well, so input
+    that lacks a trailing separator line no longer loses its last sentence.
+
+    :param path: path of the UTF-8 encoded file to read.
+    :param col: number of columns kept per row (leading fields + last field).
+    """
+    leading = col - 1
+    with open(path, "r", encoding="utf-8") as conll:
+        sentence = []
+        # Stream the file line by line rather than loading it all at once.
+        for line in conll:
+            fields = line.strip("\r\n").split()
+            if len(fields) > 1:
+                sentence.append(fields[:leading] + fields[-1:])
+            else:
+                yield sentence
+                sentence = []
+        # Flush the last sentence when the file ends without a separator.
+        if sentence:
+            yield sentence
+
+
+def template(word):
+    """Return the shape of ``word``: one output character per input item.
+
+    Items found in the marker string below become "a"; all others become "x".
+    """
+    marked = "آایو"
+    return "".join("a" if char in marked else "x" for char in word)
+
+
+def isdigit(word):
+    """Return True when every item of ``word`` occurs in the digit string.
+
+    The accepted characters are the Persian digits, the ASCII digits and
+    ".". An empty ``word`` yields True because there is nothing to reject.
+    """
+    allowed = "۱۲۳۴۵۶۷۸۹۰1234567890."
+    for char in word:
+        if char not in allowed:
+            return False
+    return True
+
+
+def ngram(word, leng=2):
+    """Yield ``(key, slice)`` pairs for windows of ``word`` of width ``leng``.
+
+    The key is ``'word[<start>:<start + leng>]'`` and the value is the
+    matching slice. The start index always runs over ``range(len(word) - 1)``
+    whatever ``leng`` is, so windows near the end may be shorter than
+    ``leng``, and ``leng=0`` yields empty strings.
+    """
+    for start in range(len(word) - 1):
+        stop = start + leng
+        yield "word[{}:{}]".format(start, stop), word[start:stop]
+
+
+def tree2brackets(tree):
+    """Render ``tree`` as a flat string with chunks wrapped in brackets.
+
+    The tree is flattened with ``tree2conlltags``; each resulting item is
+    read as ``item[0]`` (text to emit) and ``item[2]`` (a chunk tag whose
+    first character is tested against 'B' / 'O' and whose part after '-' is
+    the chunk label). A chunk is written as ``[tok tok LABEL]``; items
+    outside any chunk are written as-is. The result has no surrounding
+    whitespace.
+    """
+    # Collect fragments and join once; this also avoids shadowing the
+    # builtin ``str`` with a local accumulator.
+    parts, tag = [], ''
+    for item in tree2conlltags(tree):
+        chunk_tag = item[2]
+        # A new chunk ('B') or an outside token ('O') closes the open chunk.
+        if chunk_tag[0] in {'B', 'O'} and tag:
+            parts.append(tag + '] ')
+            tag = ''
+
+        if chunk_tag[0] == 'B':
+            tag = chunk_tag.split('-')[1]
+            parts.append('[')
+        parts.append(item[0] + ' ')
+
+    # Close a chunk that runs to the end of the sequence.
+    if tag:
+        parts.append(tag + '] ')
+
+    return ''.join(parts).strip()
diff --git a/README.md b/README.md
@@ -0,0 +1,29 @@
+               precision   recall  f1-score   support
+
+          N         0.985      0.970     0.977    186585
+          P         0.998      0.998     0.998     89450
+          V         0.999      0.999     0.999     87762
+        ADV         0.976      0.972     0.974     15983
+       ADVe         0.988      0.978     0.983      1053
+        RES         0.989      0.992     0.991      2784
+       RESe         1.000      0.989     0.994       174
+        DET         0.973      0.977     0.975     19786
+       DETe         0.960      0.970     0.965      2156
+         AJ         0.978      0.975     0.977     61526
+        AJe         0.949      0.964     0.957     19919
+         CL         0.932      0.918     0.925      1892
+        INT         1.000      1.000     1.000        73
+       CONJ         0.996      0.997     0.997     74796
+      CONJe         1.000      1.000     1.000        82
+      POSTP         1.000      1.000     1.000     13174
+        PRO         0.973      0.974     0.973     23094
+       PROe         0.878      0.579     0.698       273
+        NUM         0.988      0.992     0.990     24864
+       NUMe         0.932      0.918     0.925      2519
+       PUNC         1.000      1.000     1.000     84088
+         Ne         0.970      0.985     0.977    163760
+         Pe         0.986      0.992     0.989     10004
+
+    avg / total      0.985     0.985     0.985    885797
+
+
diff --git a/model/perpos.model b/model/perpos.model
diff --git a/sample.py b/sample.py
@@ -0,0 +1,4 @@
+import os
+
+from POS.POSTagger import POSTagger
+
+# Locate the bundled model relative to this script so the sample runs from
+# any checkout rather than from one machine-specific absolute path.
+MODEL_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "model", "perpos.model"
+)
+
+pos_tagger = POSTagger(MODEL_PATH)
+print(pos_tagger.parse("به گزارش گروه بین الملل خبرگزاری فارس، «بشار اسد» رئیس جمهور سوریه در جمع روزنامه نگاران روس گفت که هر گونه عملیات نظامی در سوریه بدون موافقت مقامات این کشور، تجاوز است؛ چه در رقه باشد یا در سایر مناطق.".split()))