-
Notifications
You must be signed in to change notification settings - Fork 100
/
Copy pathnamed_entity_recognition.py
155 lines (131 loc) · 5.15 KB
/
named_entity_recognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# ========================================================================
# Copyright 2021 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
__author__ = 'Jinho D. Choi'
import glob, os
from types import SimpleNamespace
from typing import Dict, List, Tuple, Set, Iterable, Any
import ahocorasick
def recognize_ngram(tokens: List[str], gazetteer: Dict[str, Set[str]]) -> List[Tuple[int, int, str, Set[str]]]:
    """
    Finds every n-gram of the token sequence that appears in the gazetteer.
    :param tokens: a sequence of input tokens.
    :param gazetteer: a dictionary whose key is the text span of a named entity (e.g., "Emory University") and the value is the set of named entity tags for the entity.
    :return: a list of entities where each entity is represented by a tuple consisting of the following 4 items:
             - Index of the beginning token (inclusive)
             - Index of the ending token (exclusive)
             - Text span representing the entity (e.g., "Emory University")
             - Set of named entity tags for the entity
    """
    matches = []
    for begin in range(len(tokens)):
        # grow the candidate span one token at a time instead of re-slicing
        span_tokens = []
        for end, token in enumerate(tokens[begin:], begin + 1):
            span_tokens.append(token)
            span = ' '.join(span_tokens)
            tags = gazetteer.get(span)
            if tags:
                matches.append((begin, end, span, tags))
    return matches
def create_ac(data: Iterable[Tuple[str, Any]]) -> ahocorasick.Automaton:
    """
    Creates the Aho-Corasick automaton, adds all (span, value) pairs in the data, and finalizes the matcher.
    Values for a repeated span are accumulated into one shared set.
    :param data: a collection of (span, value) pairs.
    :return: the finalized automaton; each stored word maps to a SimpleNamespace(span, values).
    """
    automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY)
    for span, value in data:
        if span in automaton:
            node = automaton.get(span)
        else:
            # first time this span is seen: register a fresh payload for it
            node = SimpleNamespace(span=span, values=set())
            automaton.add_word(span, node)
        node.values.add(value)
    automaton.make_automaton()
    return automaton
def match(AC: ahocorasick.Automaton, tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    :param AC: the finalized Aho-Corasick automaton.
    :param tokens: the list of input tokens.
    :return: a list of tuples where each tuple consists of
             - span: str,
             - start token index (inclusive): int
             - end token index (exclusive): int
             - a set of values for the span: Set[str]
    """
    # map character offsets in the space-joined text back to token indices;
    # only matches aligned on token boundaries survive the lookup below
    starts, ends = {}, {}
    offset = 0
    for index, token in enumerate(tokens):
        starts[offset] = index
        offset += len(token)
        ends[offset] = index
        offset += 1  # skip the separating space
    text = ' '.join(tokens)
    results = []
    for last_char, node in AC.iter(text):
        end_char = last_char + 1
        begin = starts.get(end_char - len(node.span))
        end = ends.get(end_char)
        # drop matches that start or end in the middle of a token
        if begin is not None and end is not None:
            results.append((node.span, begin, end + 1, node.values))
    return results
def remove_subsets(entities: List[Tuple[str, int, int, Set[str]]]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    Removes every entity whose token range is properly contained in another entity's range.
    :param entities: a list of tuples where each tuple consists of
                     - span: str,
                     - start token index (inclusive): int
                     - end token index (exclusive): int
                     - a set of values for the span: Set[str]
    :return: a list of entities where each entity is represented by a tuple of (span, start index, end index, value set)
    """
    kept = []
    for e0 in entities:
        # e0 is discarded only if some OTHER entity's range properly contains e0's range.
        # Requiring strict containment (ranges must differ) fixes the original mutual-removal
        # bug where two distinct entities covering the same token range eliminated each other.
        subsumed = any(
            e1 is not e0
            and e1[1] <= e0[1] and e0[2] <= e1[2]
            and (e1[1], e1[2]) != (e0[1], e0[2])
            for e1 in entities
        )
        if not subsumed:
            kept.append(e0)
    return kept
def read_gazetteers(dirname: str) -> ahocorasick.Automaton:
    """
    Reads every '*.txt' gazetteer file in the directory; the base filename (minus the
    extension) is used as the tag for every entry, one entry per line.
    :param dirname: path to the directory containing the gazetteer files.
    :return: a finalized Aho-Corasick automaton over all (span, tag) pairs.
    """
    data = []
    for filename in glob.glob(os.path.join(dirname, '*.txt')):
        tag = os.path.splitext(os.path.basename(filename))[0]
        # context manager guarantees each file handle is closed (the original leaked them)
        with open(filename) as fin:
            for line in fin:
                data.append((line.strip(), tag))
    return create_ac(data)
if __name__ == '__main__':
    # Demo 1: brute-force n-gram lookup against a dict-backed gazetteer.
    GAZETTEER = {
        'Jinho': {'PER'},
        'Jinho Choi': {'PER'},
        'Emory': {'PER', 'ORG'},
        'Emory University': {'ORG'},
        'United States': {'GPE'},
        'United States of America': {'GPE'},
    }
    text = 'Jinho Choi is a professor at Emory University in the United States of America'
    tokens = text.split()
    for entity in recognize_ngram(tokens, GAZETTEER):
        print(entity)
    # Demo 2: build an Aho-Corasick automaton from (span, tag) pairs.
    GAZETTEER = [
        ('Jinho', 'PER'),
        ('Jinho Choi', 'PER'),
        ('Emory', 'PER'),
        ('Emory', 'ORG'),
        ('Emory University', 'ORG'),
        ('United States', 'GPE'),
        ('United States of America', 'GPE'),
        ('Korean', 'LANG'),
        ('Korea', 'GPE'),
        ('South Korea', 'GPE'),
    ]
    AC = create_ac(GAZETTEER)