Refactor and fix bugs WIP
langhabel committed Jul 28, 2020
1 parent 91f6173 commit 764d1a8
Showing 37 changed files with 678 additions and 620 deletions.
2 changes: 1 addition & 1 deletion .run/Lint, format and test project.run.xml
@@ -1,7 +1,7 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="Lint, format and test project" type="CompoundRunConfigurationType">
<toRun name="Pre-commit Hooks" type="PythonConfigurationType" />
<toRun name="Backend Tests" type="tests" />
<toRun name="Tests" type="tests" />
<method v="2" />
</configuration>
</component>
2 changes: 1 addition & 1 deletion .run/Backend Tests.run.xml → .run/Tests.run.xml
@@ -1,5 +1,5 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="Backend Tests" type="tests" factoryName="py.test">
<configuration default="false" name="Tests" type="tests" factoryName="py.test">
<module name="nerwhal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
6 changes: 3 additions & 3 deletions README.md
@@ -13,13 +13,13 @@ A Python module that finds personally identifiable information in unstructured t

## Description

- PII Identifier is a framework that helps find PIIs (Personally Identifiable Information) in text. Recognizers uncover
+ Nerwhal is a framework that helps find named entities in text. Recognizers uncover
mentions that can be used to identify persons, such as name, phone number or place of birth.

Note, that while the package is language agnostic, the included models and recognizers are for the **German** language.

_**:warning: Disclaimer :warning::**_ This is a prototype, which must not be used in production without further protections. For
- the following reasons not all PIIs can be found:
+ the following reasons not all named entities can be found:
- the set of recognizers is not exhaustive
- the rules of each recognizer do not cover all of the ways in which information can be expressed; the limitations of
each recognizer are to the best of our knowledge noted in its code documentation.
@@ -37,7 +37,7 @@ The recognizers are built on top of powerful NLP engines:
- and of course the good ol' regular expressions

The engines can be found in the [Backends Package](nerwhal/backends). The recognizers operate on these backends
- and are located in the [Recognizers Package](nerwhal/recognizers).
+ and are located in the [Recognizers Package](nerwhal/example_recognizers).

## Usage

4 changes: 3 additions & 1 deletion nerwhal/__init__.py
@@ -1 +1,3 @@
- from nerwhal.core import recognize, evaluate, Pii, Config # noqa: F401
+ from nerwhal.core import recognize, evaluate # noqa: F401
+ from nerwhal.types import NamedEntity, Config # noqa: F401
+ import nerwhal.recognizer_bases # noqa: F401
62 changes: 31 additions & 31 deletions nerwhal/aggregation_strategies.py
@@ -1,16 +1,16 @@
- def aggregate(piis, *other_piis, strategy="keep_all"):
- """Aggregate two or more lists of Piis.
+ def aggregate(ents, *other_ents, strategy="keep_all"):
+ """Aggregate two or more lists of named entities.
- You can choose from several strategies for how to deal with overlapping Piis.
- - `keep_all`: Append all lists and keep all Piis.
- - `ensure_disjointness`: Like `keep_all`, but raises an `AssertionError` if two Piis overlap.
- - `merge`: Appends the lists while choosing the Pii with higher score on overlaps.
+ You can choose from several strategies for how to deal with overlapping entities.
+ - `keep_all`: Append all lists and keep all entities.
+ - `ensure_disjointness`: Like `keep_all`, but raises an `AssertionError` if two entities overlap.
+ - `merge`: Appends the lists while choosing the entity with higher score on overlaps.
"""
- items = piis.copy()
- for _piis in other_piis:
- items.extend(_piis)
+ items = ents.copy()
+ for _ents in other_ents:
+ items.extend(_ents)

- items.sort(key=lambda pii: (pii.start_char, pii.end_char, 1.0 - pii.score, pii.tag))
+ items.sort(key=lambda ent: (ent.start_char, ent.end_char, 1.0 - ent.score, ent.tag))

if strategy == "keep_all":
aggregated = items
@@ -23,43 +23,43 @@ def aggregate(piis, *other_piis, strategy="keep_all"):
return aggregated


- def _ensure_disjointness_strategy(piis):
- """A strategy that ensures that all PIIs are disjoint.
+ def _ensure_disjointness_strategy(ents):
+ """A strategy that ensures that all entities are disjoint.
- Checks that all piis are disjoint by comparing end of previous pii with start of the current one.
+ Checks that all entities are disjoint by comparing end of previous entity with start of the current one.
"""
- prev_pii_end = 0
- for pii in piis:
- if prev_pii_end > pii.start_char:
+ prev_ent_end = 0
+ for ent in ents:
+ if prev_ent_end > ent.start_char:
raise AssertionError(
f"All piis were assumed to be disjunct, but {pii.text} ({pii.start_char}-{pii.end_char}) wasn't"
f"All entities were assumed to be disjunct, but {ent.text} ({ent.start_char}-{ent.end_char}) wasn't"
)

- prev_pii_end = pii.end_char
- return piis
+ prev_ent_end = ent.end_char
+ return ents


- def _overlapping(pii_a, pii_b):
- return pii_a.start_char <= pii_b.start_char < pii_a.end_char or pii_a.start_char < pii_b.end_char <= pii_a.end_char
+ def _overlapping(ent_a, ent_b):
+ return ent_a.start_char <= ent_b.start_char < ent_a.end_char or ent_a.start_char < ent_b.end_char <= ent_a.end_char


- def _overlapping_and_outscored(pii, other_pii):
- return other_pii and _overlapping(pii, other_pii) and pii.score < other_pii.score
+ def _overlapping_and_outscored(ent, other_ent):
+ return other_ent and _overlapping(ent, other_ent) and ent.score < other_ent.score


- def _merge_strategy(piis):
- """A strategy to resolve overlapping PIIs by giving those with higher scores priority."""
+ def _merge_strategy(ents):
+ """A strategy to resolve overlapping named entities by giving those with higher scores priority."""
res = []
- prev_pii = None
- for idx, pii in enumerate(piis):
- next_pii = piis[idx + 1] if idx + 1 < len(piis) else None
+ prev_ent = None
+ for idx, ent in enumerate(ents):
+ next_ent = ents[idx + 1] if idx + 1 < len(ents) else None

- if _overlapping_and_outscored(pii, prev_pii) or _overlapping_and_outscored(pii, next_pii):
+ if _overlapping_and_outscored(ent, prev_ent) or _overlapping_and_outscored(ent, next_ent):
# don't add this one
continue

- res += [pii]
+ res += [ent]

- prev_pii = pii
+ prev_ent = ent

return res
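
For illustration, a minimal usage sketch of the renamed aggregate function with the "merge" strategy. This is an assumption-laden example: the NamedEntity constructor order (start_char, end_char, tag, text, score, recognizer name) is inferred from the constructor calls elsewhere in this diff and may differ from the final dataclass.

from nerwhal.aggregation_strategies import aggregate
from nerwhal.types import NamedEntity

# Two recognizers flagged the same span with different scores.
ents_re = [NamedEntity(0, 13, "PHONE", "030 123 45 67", 0.5, "re")]
ents_ruler = [NamedEntity(0, 13, "PHONE", "030 123 45 67", 0.8, "entity-ruler")]

# "merge" keeps the higher-scoring entity on overlaps; "keep_all" would keep both,
# and "ensure_disjointness" would raise an AssertionError for this input.
merged = aggregate(ents_re, ents_ruler, strategy="merge")
assert len(merged) == 1 and merged[0].score == 0.8
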
3 changes: 3 additions & 0 deletions nerwhal/backends/__init__.py
@@ -8,6 +8,9 @@ def load(backend):
elif backend == "flashtext":
mod = ".flashtext_backend"
cls = "FlashtextBackend"
elif backend == "entity-ruler":
mod = ".entity_ruler_backend"
cls = "EntityRulerBackend"
else:
raise ValueError(f"Unknown backend type {backend}")

9 changes: 6 additions & 3 deletions nerwhal/backends/base.py
@@ -1,15 +1,18 @@
from abc import ABC, abstractmethod
+ from typing import Type
+
+ from nerwhal.recognizer_bases.base import Recognizer


class Backend(ABC):
"""Backends are the engines behind the recognizers that drive the search for PIIs.
"""Backends are the engines behind the recognizers that drive the search for named entities.
Recognizers use the functionality provided by a backend to do their job. Each recognizer has to specify one backend
that it operates on.
"""

@abstractmethod
- def register_recognizer(self, recognizer):
+ def register_recognizer(self, recognizer_cls: Type[Recognizer]):
"""Add the given recognizer to this backend instance.
One backend can have several recognizers. Once added they cannot be removed anymore.
@@ -21,6 +24,6 @@ def register_recognizer(self, recognizer):
def run(self, text):
"""Run the backend and all registered recognizers.
- :return: the list of PIIs that have been identified
+ :return: the list of named entities that have been identified
"""
pass
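
To make the refactored contract concrete, a hypothetical toy backend (not part of this commit) could implement the two abstract methods as follows. It assumes a keyword-style recognizer exposing keywords, TAG and SCORE, and reuses the NamedEntity constructor as seen in the other backends of this diff.

from nerwhal.backends.base import Backend
from nerwhal.types import NamedEntity


class NaiveSubstringBackend(Backend):
    """Toy backend doing exact substring search, for illustration only."""

    def __init__(self):
        self.needles = []  # (keyword, tag, score) tuples collected from registered recognizers

    def register_recognizer(self, recognizer_cls):
        recognizer = recognizer_cls()
        for keyword in recognizer.keywords:  # assumes a keyword-based recognizer
            self.needles.append((keyword, recognizer.TAG, recognizer.SCORE))

    def run(self, text):
        ents = []
        for needle, tag, score in self.needles:
            start = text.find(needle)
            while start != -1:
                ents.append(NamedEntity(start, start + len(needle), tag, needle, score, "naive"))
                start = text.find(needle, start + 1)
        return ents
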
34 changes: 34 additions & 0 deletions nerwhal/backends/entity_ruler_backend.py
@@ -0,0 +1,34 @@
from typing import Type

from spacy.pipeline import EntityRuler

from .base import Backend
from nerwhal.recognizer_bases import EntityRulerRecognizer
from nerwhal.types import NamedEntity
from ..nlp_utils import load_nlp, configure_entity_extension_attributes, set_entity_extension_attributes

configure_entity_extension_attributes()


class EntityRulerBackend(Backend):
def __init__(self, model_name):
self.nlp = load_nlp(model_name, "tokenize,pos,lemma")

def register_recognizer(self, recognizer_cls: Type[EntityRulerRecognizer]):
recognizer = recognizer_cls(self.nlp)

name = recognizer_cls.__name__
ruler = EntityRuler(self.nlp)
self.nlp.add_pipe(ruler, name)
rules = [{"label": recognizer.TAG, "pattern": pattern} for pattern in recognizer.patterns]
ruler.add_patterns(rules)
self.nlp.add_pipe(set_entity_extension_attributes(recognizer.SCORE, name), name="label_" + name, after=name)

def run(self, text):
doc = self.nlp(text)

ents = []
for ent in doc.ents:
ents += [NamedEntity(ent.start_char, ent.end_char, ent.label_, ent.text, ent._.score, ent._.model)]

return ents
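
A hypothetical recognizer for this new entity-ruler backend might look as follows. This is a sketch under assumptions: the EntityRulerRecognizer base class is not shown in this commit, only that instances are constructed with the shared nlp object and expose TAG, SCORE and spaCy-style token patterns.

from nerwhal.recognizer_bases import EntityRulerRecognizer


class MoneyAmountRecognizer(EntityRulerRecognizer):  # hypothetical example, not shipped with nerwhal
    TAG = "MONEY"
    SCORE = 0.75

    @property
    def patterns(self):
        # spaCy EntityRuler token patterns: a number-like token followed by "euro"/"eur"
        return [[{"LIKE_NUM": True}, {"LOWER": {"IN": ["euro", "eur"]}}]]
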
25 changes: 15 additions & 10 deletions nerwhal/backends/flashtext_backend.py
@@ -1,24 +1,29 @@
+ from typing import Type
+
from flashtext import KeywordProcessor

- from nerwhal import Pii
- from nerwhal.backends.base import Backend
+ from .base import Backend
+ from nerwhal.types import NamedEntity
+ from ..recognizer_bases import FlashtextRecognizer


class FlashtextBackend(Backend):
def __init__(self):
self.keyword_processors = []
self.entities = []
- self.precisions = []
+ self.score = []

+ def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
+ recognizer = recognizer_cls()
+
- def register_recognizer(self, recognizer):
keyword_processor = KeywordProcessor()
self.keyword_processors.append(keyword_processor.add_keywords_from_list(recognizer.keywords))
- self.entities.append(recognizer.entity)
- self.precisions.append(recognizer.precision)
+ self.entities.append(recognizer.TAG)
+ self.score.append(recognizer.SCORE)

def run(self, text):
- piis = []
- for keyword_processor, entity, precision in zip(self.keyword_processors, self.entities, self.precisions):
+ ents = []
+ for keyword_processor, entity, score in zip(self.keyword_processors, self.entities, self.score):
keywords = keyword_processor.extract_keywords(text, span_info=True)
- piis += [Pii(start, end, entity, keyword, precision, "flashtext") for keyword, start, end in keywords]
- return piis
+ ents += [NamedEntity(start, end, entity, keyword, score, "flashtext") for keyword, start, end in keywords]
+ return ents
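
A hypothetical keyword recognizer for this backend could look like the sketch below, assuming FlashtextRecognizer subclasses only need to provide keywords, TAG and SCORE; note that register_recognizer now receives the recognizer class rather than an instance.

from nerwhal.recognizer_bases import FlashtextRecognizer


class CityRecognizer(FlashtextRecognizer):  # hypothetical example recognizer
    TAG = "LOC"
    SCORE = 0.9

    @property
    def keywords(self):
        return ["Berlin", "Hamburg", "München"]
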
24 changes: 14 additions & 10 deletions nerwhal/backends/re_backend.py
@@ -1,22 +1,26 @@
import re
+ from typing import Type

- from nerwhal import Pii
- from nerwhal.backends.base import Backend
+ from .base import Backend
+ from nerwhal.types import NamedEntity
+ from ..recognizer_bases import ReRecognizer


class ReBackend(Backend):
def __init__(self):
self.compiled_regexps = []
self.entities = []
- self.precisions = []
+ self.score = []

+ def register_recognizer(self, recognizer_cls: Type[ReRecognizer]):
+ recognizer = recognizer_cls()
+
- def register_recognizer(self, recognizer):
self.compiled_regexps += [re.compile(recognizer.regexp, flags=recognizer.flags)]
- self.entities.append(recognizer.entity)
- self.precisions.append(recognizer.precision)
+ self.entities.append(recognizer.TAG)
+ self.score.append(recognizer.SCORE)

def run(self, text):
- piis = []
- for pattern, entity, precision in zip(self.compiled_regexps, self.entities, self.precisions):
- piis += [Pii(m.start(), m.end(), entity, m.group(), precision, "re") for m in pattern.finditer(text)]
- return piis
+ ents = []
+ for pattern, entity, score in zip(self.compiled_regexps, self.entities, self.score):
+ ents += [NamedEntity(m.start(), m.end(), entity, m.group(), score, "re") for m in pattern.finditer(text)]
+ return ents
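
Correspondingly, a hypothetical regexp recognizer for the ReBackend, assuming ReRecognizer subclasses expose regexp, flags, TAG and SCORE exactly as they are read by register_recognizer above:

import re

from nerwhal.recognizer_bases import ReRecognizer


class DeZipCodeRecognizer(ReRecognizer):  # hypothetical example recognizer
    TAG = "ZIP"
    SCORE = 0.6

    @property
    def regexp(self):
        return r"\b\d{5}\b"  # five-digit German postal codes

    @property
    def flags(self):
        return re.UNICODE
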
84 changes: 0 additions & 84 deletions nerwhal/backends/spacy_backend.py

This file was deleted.


