Refactor and fix bugs WIP
langhabel committed Jul 28, 2020
1 parent 91f6173 commit 764d1a8
Showing 37 changed files with 678 additions and 620 deletions.
2 changes: 1 addition & 1 deletion .run/Lint, format and test project.run.xml
@@ -1,7 +1,7 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="Lint, format and test project" type="CompoundRunConfigurationType">
<toRun name="Pre-commit Hooks" type="PythonConfigurationType" />
<toRun name="Backend Tests" type="tests" />
<toRun name="Tests" type="tests" />
<method v="2" />
</configuration>
</component>
2 changes: 1 addition & 1 deletion .run/Backend Tests.run.xml → .run/Tests.run.xml
@@ -1,5 +1,5 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="Backend Tests" type="tests" factoryName="py.test">
<configuration default="false" name="Tests" type="tests" factoryName="py.test">
<module name="nerwhal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
6 changes: 3 additions & 3 deletions README.md
@@ -13,13 +13,13 @@ A Python module that finds personally identifiable information in unstructured t

## Description

- PII Identifier is a framework that helps find PIIs (Personally Identifiable Information) in text. Recognizers uncover
+ Nerwhal is a framework that helps find named entities in text. Recognizers uncover
mentions that can be used to identify persons, such as name, phone number or place of birth.

Note, that while the package is language agnostic, the included models and recognizers are for the **German** language.

_**:warning: Disclaimer :warning::**_ This is a prototype, which must not be used in production without further protections. For
- the following reasons not all PIIs can be found:
+ the following reasons not all named entities can be found:
- the set of recognizers is not exhaustive
- the rules of each recognizer do not cover all of the ways in which information can be expressed; the limitations of
each recognizer are to the best of our knowledge noted in its code documentation.
@@ -37,7 +37,7 @@ The recognizers are built on top of powerful NLP engines:
- and of course the good ol' regular expressions

The engines can be found in the [Backends Package](nerwhal/backends). The recognizers operate on these backends
- and are located in the [Recognizers Package](nerwhal/recognizers).
+ and are located in the [Recognizers Package](nerwhal/example_recognizers).

## Usage

4 changes: 3 additions & 1 deletion nerwhal/__init__.py
@@ -1 +1,3 @@
- from nerwhal.core import recognize, evaluate, Pii, Config # noqa: F401
+ from nerwhal.core import recognize, evaluate # noqa: F401
+ from nerwhal.types import NamedEntity, Config # noqa: F401
+ import nerwhal.recognizer_bases # noqa: F401
62 changes: 31 additions & 31 deletions nerwhal/aggregation_strategies.py
@@ -1,16 +1,16 @@
- def aggregate(piis, *other_piis, strategy="keep_all"):
- """Aggregate two or more lists of Piis.
+ def aggregate(ents, *other_ents, strategy="keep_all"):
+ """Aggregate two or more lists of named entities.
- You can choose from several strategies for how to deal with overlapping Piis.
- - `keep_all`: Append all lists and keep all Piis.
- - `ensure_disjointness`: Like `keep_all`, but raises an `AssertionError` if two Piis overlap.
- - `merge`: Appends the lists while choosing the Pii with higher score on overlaps.
+ You can choose from several strategies for how to deal with overlapping entities.
+ - `keep_all`: Append all lists and keep all entities.
+ - `ensure_disjointness`: Like `keep_all`, but raises an `AssertionError` if two entities overlap.
+ - `merge`: Appends the lists while choosing the entity with higher score on overlaps.
"""
- items = piis.copy()
- for _piis in other_piis:
- items.extend(_piis)
+ items = ents.copy()
+ for _ents in other_ents:
+ items.extend(_ents)

- items.sort(key=lambda pii: (pii.start_char, pii.end_char, 1.0 - pii.score, pii.tag))
+ items.sort(key=lambda ent: (ent.start_char, ent.end_char, 1.0 - ent.score, ent.tag))

if strategy == "keep_all":
aggregated = items
@@ -23,43 +23,43 @@ def aggregate(piis, *other_piis, strategy="keep_all"):
return aggregated


- def _ensure_disjointness_strategy(piis):
- """A strategy that ensures that all PIIs are disjoint.
+ def _ensure_disjointness_strategy(ents):
+ """A strategy that ensures that all entities are disjoint.
- Checks that all piis are disjoint by comparing end of previous pii with start of the current one.
+ Checks that all entities are disjoint by comparing end of previous entity with start of the current one.
"""
- prev_pii_end = 0
- for pii in piis:
- if prev_pii_end > pii.start_char:
+ prev_ent_end = 0
+ for ent in ents:
+ if prev_ent_end > ent.start_char:
raise AssertionError(
f"All piis were assumed to be disjunct, but {pii.text} ({pii.start_char}-{pii.end_char}) wasn't"
f"All entities were assumed to be disjunct, but {ent.text} ({ent.start_char}-{ent.end_char}) wasn't"
)

- prev_pii_end = pii.end_char
- return piis
+ prev_ent_end = ent.end_char
+ return ents


- def _overlapping(pii_a, pii_b):
- return pii_a.start_char <= pii_b.start_char < pii_a.end_char or pii_a.start_char < pii_b.end_char <= pii_a.end_char
+ def _overlapping(ent_a, ent_b):
+ return ent_a.start_char <= ent_b.start_char < ent_a.end_char or ent_a.start_char < ent_b.end_char <= ent_a.end_char


- def _overlapping_and_outscored(pii, other_pii):
- return other_pii and _overlapping(pii, other_pii) and pii.score < other_pii.score
+ def _overlapping_and_outscored(ent, other_ent):
+ return other_ent and _overlapping(ent, other_ent) and ent.score < other_ent.score


- def _merge_strategy(piis):
- """A strategy to resolve overlapping PIIs by giving those with higher scores priority."""
+ def _merge_strategy(ents):
+ """A strategy to resolve overlapping named entities by giving those with higher scores priority."""
res = []
- prev_pii = None
- for idx, pii in enumerate(piis):
- next_pii = piis[idx + 1] if idx + 1 < len(piis) else None
+ prev_ent = None
+ for idx, ent in enumerate(ents):
+ next_ent = ents[idx + 1] if idx + 1 < len(ents) else None

- if _overlapping_and_outscored(pii, prev_pii) or _overlapping_and_outscored(pii, next_pii):
+ if _overlapping_and_outscored(ent, prev_ent) or _overlapping_and_outscored(ent, next_ent):
# don't add this one
continue

- res += [pii]
+ res += [ent]

- prev_pii = pii
+ prev_ent = ent

return res
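
For illustration, a minimal usage sketch of the renamed aggregate function with the "merge" strategy. This is an assumption-laden example: the NamedEntity constructor order (start_char, end_char, tag, text, score, recognizer name) is inferred from the constructor calls elsewhere in this diff and may differ from the final dataclass.

from nerwhal.aggregation_strategies import aggregate
from nerwhal.types import NamedEntity

# Two recognizers flagged the same span with different scores.
ents_re = [NamedEntity(0, 13, "PHONE", "030 123 45 67", 0.5, "re")]
ents_ruler = [NamedEntity(0, 13, "PHONE", "030 123 45 67", 0.8, "entity-ruler")]

# "merge" keeps the higher-scoring entity on overlaps; "keep_all" would keep both,
# and "ensure_disjointness" would raise an AssertionError for this input.
merged = aggregate(ents_re, ents_ruler, strategy="merge")
assert len(merged) == 1 and merged[0].score == 0.8
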
3 changes: 3 additions & 0 deletions nerwhal/backends/__init__.py
@@ -8,6 +8,9 @@ def load(backend):
elif backend == "flashtext":
mod = ".flashtext_backend"
cls = "FlashtextBackend"
elif backend == "entity-ruler":
mod = ".entity_ruler_backend"
cls = "EntityRulerBackend"
else:
raise ValueError(f"Unknown backend type {backend}")

9 changes: 6 additions & 3 deletions nerwhal/backends/base.py
@@ -1,15 +1,18 @@
from abc import ABC, abstractmethod
+ from typing import Type
+
+ from nerwhal.recognizer_bases.base import Recognizer


class Backend(ABC):
"""Backends are the engines behind the recognizers that drive the search for PIIs.
"""Backends are the engines behind the recognizers that drive the search for named entities.
Recognizers use the functionality provided by a backend to do their job. Each recognizer has to specify one backend
that it operates on.
"""

@abstractmethod
- def register_recognizer(self, recognizer):
+ def register_recognizer(self, recognizer_cls: Type[Recognizer]):
"""Add the given recognizer to this backend instance.
One backend can have several recognizers. Once added they cannot be removed anymore.
@@ -21,6 +24,6 @@ def register_recognizer(self, recognizer):
def run(self, text):
"""Run the backend and all registered recognizers.
- :return: the list of PIIs that have been identified
+ :return: the list of named entities that have been identified
"""
pass
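
To make the refactored contract concrete, a hypothetical toy backend (not part of this commit) could implement the two abstract methods as follows. It assumes a keyword-style recognizer exposing keywords, TAG and SCORE, and reuses the NamedEntity constructor as seen in the other backends of this diff.

from nerwhal.backends.base import Backend
from nerwhal.types import NamedEntity


class NaiveSubstringBackend(Backend):
    """Toy backend doing exact substring search, for illustration only."""

    def __init__(self):
        self.needles = []  # (keyword, tag, score) tuples collected from registered recognizers

    def register_recognizer(self, recognizer_cls):
        recognizer = recognizer_cls()
        for keyword in recognizer.keywords:  # assumes a keyword-based recognizer
            self.needles.append((keyword, recognizer.TAG, recognizer.SCORE))

    def run(self, text):
        ents = []
        for needle, tag, score in self.needles:
            start = text.find(needle)
            while start != -1:
                ents.append(NamedEntity(start, start + len(needle), tag, needle, score, "naive"))
                start = text.find(needle, start + 1)
        return ents
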
34 changes: 34 additions & 0 deletions nerwhal/backends/entity_ruler_backend.py
@@ -0,0 +1,34 @@
from typing import Type

from spacy.pipeline import EntityRuler

from .base import Backend
from nerwhal.recognizer_bases import EntityRulerRecognizer
from nerwhal.types import NamedEntity
from ..nlp_utils import load_nlp, configure_entity_extension_attributes, set_entity_extension_attributes

configure_entity_extension_attributes()


class EntityRulerBackend(Backend):
def __init__(self, model_name):
self.nlp = load_nlp(model_name, "tokenize,pos,lemma")

def register_recognizer(self, recognizer_cls: Type[EntityRulerRecognizer]):
recognizer = recognizer_cls(self.nlp)

name = recognizer_cls.__name__
ruler = EntityRuler(self.nlp)
self.nlp.add_pipe(ruler, name)
rules = [{"label": recognizer.TAG, "pattern": pattern} for pattern in recognizer.patterns]
ruler.add_patterns(rules)
self.nlp.add_pipe(set_entity_extension_attributes(recognizer.SCORE, name), name="label_" + name, after=name)

def run(self, text):
doc = self.nlp(text)

ents = []
for ent in doc.ents:
ents += [NamedEntity(ent.start_char, ent.end_char, ent.label_, ent.text, ent._.score, ent._.model)]

return ents
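
A hypothetical recognizer for this new entity-ruler backend might look as follows. This is a sketch under assumptions: the EntityRulerRecognizer base class is not shown in this commit, only that instances are constructed with the shared nlp object and expose TAG, SCORE and spaCy-style token patterns.

from nerwhal.recognizer_bases import EntityRulerRecognizer


class MoneyAmountRecognizer(EntityRulerRecognizer):  # hypothetical example, not shipped with nerwhal
    TAG = "MONEY"
    SCORE = 0.75

    @property
    def patterns(self):
        # spaCy EntityRuler token patterns: a number-like token followed by "euro"/"eur"
        return [[{"LIKE_NUM": True}, {"LOWER": {"IN": ["euro", "eur"]}}]]
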
25 changes: 15 additions & 10 deletions nerwhal/backends/flashtext_backend.py
@@ -1,24 +1,29 @@
+ from typing import Type
+
from flashtext import KeywordProcessor

- from nerwhal import Pii
- from nerwhal.backends.base import Backend
+ from .base import Backend
+ from nerwhal.types import NamedEntity
+ from ..recognizer_bases import FlashtextRecognizer


class FlashtextBackend(Backend):
def __init__(self):
self.keyword_processors = []
self.entities = []
- self.precisions = []
+ self.score = []

+ def register_recognizer(self, recognizer_cls: Type[FlashtextRecognizer]):
+ recognizer = recognizer_cls()
+
- def register_recognizer(self, recognizer):
keyword_processor = KeywordProcessor()
self.keyword_processors.append(keyword_processor.add_keywords_from_list(recognizer.keywords))
- self.entities.append(recognizer.entity)
- self.precisions.append(recognizer.precision)
+ self.entities.append(recognizer.TAG)
+ self.score.append(recognizer.SCORE)

def run(self, text):
- piis = []
- for keyword_processor, entity, precision in zip(self.keyword_processors, self.entities, self.precisions):
+ ents = []
+ for keyword_processor, entity, score in zip(self.keyword_processors, self.entities, self.score):
keywords = keyword_processor.extract_keywords(text, span_info=True)
- piis += [Pii(start, end, entity, keyword, precision, "flashtext") for keyword, start, end in keywords]
- return piis
+ ents += [NamedEntity(start, end, entity, keyword, score, "flashtext") for keyword, start, end in keywords]
+ return ents
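
A hypothetical keyword recognizer for this backend could look like the sketch below, assuming FlashtextRecognizer subclasses only need to provide keywords, TAG and SCORE; note that register_recognizer now receives the recognizer class rather than an instance.

from nerwhal.recognizer_bases import FlashtextRecognizer


class CityRecognizer(FlashtextRecognizer):  # hypothetical example recognizer
    TAG = "LOC"
    SCORE = 0.9

    @property
    def keywords(self):
        return ["Berlin", "Hamburg", "München"]
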
24 changes: 14 additions & 10 deletions nerwhal/backends/re_backend.py
@@ -1,22 +1,26 @@
import re
+ from typing import Type

- from nerwhal import Pii
- from nerwhal.backends.base import Backend
+ from .base import Backend
+ from nerwhal.types import NamedEntity
+ from ..recognizer_bases import ReRecognizer


class ReBackend(Backend):
def __init__(self):
self.compiled_regexps = []
self.entities = []
- self.precisions = []
+ self.score = []

+ def register_recognizer(self, recognizer_cls: Type[ReRecognizer]):
+ recognizer = recognizer_cls()
+
- def register_recognizer(self, recognizer):
self.compiled_regexps += [re.compile(recognizer.regexp, flags=recognizer.flags)]
- self.entities.append(recognizer.entity)
- self.precisions.append(recognizer.precision)
+ self.entities.append(recognizer.TAG)
+ self.score.append(recognizer.SCORE)

def run(self, text):
- piis = []
- for pattern, entity, precision in zip(self.compiled_regexps, self.entities, self.precisions):
- piis += [Pii(m.start(), m.end(), entity, m.group(), precision, "re") for m in pattern.finditer(text)]
- return piis
+ ents = []
+ for pattern, entity, score in zip(self.compiled_regexps, self.entities, self.score):
+ ents += [NamedEntity(m.start(), m.end(), entity, m.group(), score, "re") for m in pattern.finditer(text)]
+ return ents
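
Correspondingly, a hypothetical regexp recognizer for the ReBackend, assuming ReRecognizer subclasses expose regexp, flags, TAG and SCORE exactly as they are read by register_recognizer above:

import re

from nerwhal.recognizer_bases import ReRecognizer


class DeZipCodeRecognizer(ReRecognizer):  # hypothetical example recognizer
    TAG = "ZIP"
    SCORE = 0.6

    @property
    def regexp(self):
        return r"\b\d{5}\b"  # five-digit German postal codes

    @property
    def flags(self):
        return re.UNICODE
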
84 changes: 0 additions & 84 deletions nerwhal/backends/spacy_backend.py

This file was deleted.


