diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3f31b46a..1d4181a7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -83,12 +83,6 @@ jobs:
         run: |
           pip install --use-feature=2020-resolver --progress-bar off .[test]
-      - name: Install Forte-wrappers-spacy
-        run: |
-          git clone https://github.com/asyml/forte-wrappers.git
-          cd forte-wrappers
-          pip install src/spacy
-
       - name: Test with pytest and run coverage
         run: |
           coverage run -m pytest tests/
diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py
new file mode 100644
index 00000000..89c0dc51
--- /dev/null
+++ b/fortex/health/processors/coreference_processor.py
@@ -0,0 +1,209 @@
+# Copyright 2022 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Coreference Processor
+"""
+from typing import Dict, Set
+
+import neuralcoref
+
+from forte.common import Resources, ProcessExecutionException
+from forte.common.configuration import Config
+from forte.data.data_pack import DataPack
+from forte.processors.base import PackProcessor
+from forte.utils import get_class
+
+from ft.onto.base_ontology import CoreferenceGroup
+
+from fortex.spacy.spacy_processors import load_lang_model
+
+__all__ = [
+    "CoreferenceProcessor",
+]
+
+
+class CoreferenceProcessor(PackProcessor):
+    r"""
+    This implementation of `CoreferenceProcessor` is based on the
+    Hugging Face NeuralCoref model. You can find more details in the
+    original repository.
+
+    Note that the NeuralCoref package from PyPI uses a dated spaCy
+    version (2.1), which can cause a segmentation fault with the spaCy
+    version we use (2.3). Please install NeuralCoref by building it
+    from source.
+
+    Referred repository link:
+    https://github.com/huggingface/neuralcoref
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.spacy_nlp = None
+
+    def set_up(self, configs: Config):
+        self.spacy_nlp = load_lang_model(configs.lang)
+
+        if self.spacy_nlp is None:
+            raise ProcessExecutionException(
+                "The spaCy pipeline is not initialized; maybe you "
+                "haven't called the initialization function."
+            )
+
+        cfg_inference = configs.cfg_inference
+        neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference)
+
+    def initialize(self, resources: Resources, configs: Config):
+        super().initialize(resources, configs)
+        self.set_up(configs)
+
+    def _process(self, input_pack: DataPack):
+        r"""
+        Coreference resolution is done by a spaCy pipeline with
+        `NeuralCoref` added. The output is then translated into
+        `CoreferenceGroup` entries.
+ """ + + # Default: Document + entry_type = get_class(self.configs.entry_type) + + # Default: MedicalEntityMention + mention_type = get_class(self.configs.mention_type) + + for entry_specified in input_pack.get(entry_type=entry_type): + result = self.spacy_nlp(entry_specified.text) + + if result._.has_coref: + for cluster in result._.coref_clusters: + + mentions = [] + for mention in cluster.mentions: + mention_text = mention.text + mention = mention_type( + input_pack, + mention.start_char + entry_specified.begin, + mention.end_char + entry_specified.begin, + ) + + # TODO: remove assertion? + assert mention.text == mention_text, ( + f"The processor extracted mention {mention.text}" + " which is different from the original mention" + f" {mention_text}. The offeset calculation is wrong." + ) + mentions.append(mention) + + group = CoreferenceGroup(input_pack) + group.add_members(mentions) + + @classmethod + def default_configs(cls): + r""" + This defines a basic config structure for `CoreferenceProcessor`. + + Following are the keys for this dictionary: + - `entry_type`: Input entry type. You can change the context of + coreference resolution by setting this parameter. For example, + if you want to do coreference resolution within documents, set + it to `"ft.onto.base_ontology.Document"`. If you want to do + coreference resolution within sentences, set it to + `"ft.onto.base_ontology.Sentence"`. + Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: The type of members in `CoreferenceGroup`. + It can be set to `"ft.onto.base_ontology.EntityMention"` or + its subclasses. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + - `lang`: The SpaCy pipeline to be used. The pipeline does the + preprocessing steps for NeuralCoref. + Default: `"en_core_web_sm"`. + - `cfg_inference`: A dict containing the inference configs of + NeuralCoref. See `get_default_cfg_inference` for default values, and see + https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters + for the meaing of these parameters. + + Returns: A dictionary with the default config for this processor. + """ + return { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": cls.get_default_cfg_inference(), + } + + @classmethod + def get_default_cfg_inference(cls): + """ + This defines the default inference config of NeuralCoref. + + Following are the keys for this dictionary: + - `greedyness` (`float`): A number between 0 and 1 determining + how greedy the model is about making coreference decisions + (more greedy means more coreference links). + Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when + considering possible antecedents of the current mention. + Decreasing the value will cause the system to run faster + but less accurately. + Default: `50`. + - `max_dist_match` (`int`): The system will consider linking + the current mention + to a preceding one further than max_dist away if they share + a noun or proper noun. In this case, it looks max_dist_match + away instead. + Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences + for pronouns in the following list: ["i", "me", "my", "you", "your"]. + Default `True`. + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary + that you can use + to replace the embeddings of rare words (keys) by an average + of the embeddings of a list of common words (values). 
+          Ex: `conv_dict={"Angela": ["woman", "girl"]}`
+          will help resolve coreferences for "Angela" by using the
+          embeddings of the more common words "woman" and "girl"
+          instead of the embedding of "Angela".
+          This currently only works for single words (not for word
+          groups).
+          Default: `None`.
+
+        Returns: A dictionary with the default inference config of
+        NeuralCoref.
+        """
+        return {
+            "greedyness": 0.5,
+            "max_dist": 50,
+            "max_dist_match": 500,
+            "blacklist": True,
+            "conv_dict": None,
+        }
+
+    def expected_types_and_attributes(self):
+        r"""
+        Method to add the user-specified expected types, which are
+        checked before running the processor if the pipeline is
+        initialized with `enforce_consistency=True` or
+        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled
+        for the pipeline.
+        """
+        # return {self.configs.entry_type: {"text"}}  # TODO: fix this
+        return {self.configs.entry_type: set()}
+
+    def record(self, record_meta: Dict[str, Set[str]]):
+        r"""
+        Method to add the output type record of `CoreferenceProcessor`,
+        which is `"ft.onto.base_ontology.CoreferenceGroup"` with the
+        attribute `members`, to :attr:`forte.data.data_pack.Meta.record`.
+
+        Args:
+            record_meta: The field in the data pack for the type record
+                that needs to be filled in for consistency checking.
+        """
+        record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"}
diff --git a/requirements.txt b/requirements.txt
index c3516774..fc0f8c81 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,3 +26,11 @@ git+https://git@github.com/asyml/forte-wrappers.git#egg=forte.huggingface&subdir
 dataclasses~=0.8; python_version < '3.7'
 setuptools~=57.0.0
 transformers~=4.2.2
+
+# spacy>=2.3.0, <=2.3.5  # will be installed by forte.spacy
+cython>=0.25
+pytest
+# If neuralcoref and spaCy are installed at the same time, neuralcoref
+# fails to build with a "Cython failed" error.
+# Therefore, neuralcoref must be installed after spaCy.
+# git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a63a10fb..d654ac34 100644
--- a/setup.py
+++ b/setup.py
@@ -1,28 +1,51 @@
 import sys
 from pathlib import Path
 import setuptools
+import subprocess
+import os
 
 long_description = (Path(__file__).parent / "README.md").read_text()
 
 if sys.version_info < (3, 6):
-    sys.exit('Python>=3.6 is required by forte-medical.')
+    sys.exit("Python>=3.6 is required by forte-medical.")
+
+# If neuralcoref is installed at the same time as Cython and spaCy,
+# its build fails with a "Cython failed" error, because neuralcoref
+# does not declare them as build dependencies properly.
+# Therefore, Cython and spaCy must be installed before neuralcoref.
+p = subprocess.call(
+    [
+        sys.executable,
+        "-m",
+        "pip",
+        "install",
+        "forte.spacy",  # TODO: version
+        "cython>=0.25",
+    ],
+    env=os.environ,
+)
+if p != 0:
+    raise RuntimeError("Installing NeuralCoref dependencies failed.")
 
 setuptools.setup(
     name="forte.health",
-    version='0.1.0',
+    version="0.1.0",
     url="https://github.com/asyml/ForteHealth",
     description="NLP pipeline framework for biomedical and clinical domains",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    license='Apache License Version 2.0',
+    license="Apache License Version 2.0",
     packages=setuptools.find_namespace_packages(
-        include=['fortex.health', 'ftx.*'],
-        exclude=["scripts*", "examples*", "tests*"]
+        include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"]
     ),
     namespace_packages=["fortex"],
     install_requires=[
-        'forte~=0.2.0',
+        "forte~=0.2.0",
+        "forte.spacy",  # TODO: version
+        "cython>=0.25",
     ],
     extras_require={
         "test": [
@@ -30,8 +53,16 @@
             "testfixtures",
             "transformers==4.2.2",
             "protobuf==3.19.4",
+            "pytest",
+            "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref",
+        ],
+        "icd_coding": [
+            "transformers",
+        ],
+        "coreference": [
+            "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref",
         ],
     },
     include_package_data=True,
-    python_requires='>=3.6'
+    python_requires=">=3.6",
 )
diff --git a/tests/fortex/health/processors/coreference_processor_test.py b/tests/fortex/health/processors/coreference_processor_test.py
new file mode 100644
index 00000000..62d38c38
--- /dev/null
+++ b/tests/fortex/health/processors/coreference_processor_test.py
@@ -0,0 +1,156 @@
+# Copyright 2022 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unit tests for CoreferenceProcessor
+"""
+
+import unittest
+from ddt import data, ddt, unpack
+
+from forte.data.data_pack import DataPack
+from forte.data.readers import StringReader
+from forte.pipeline import Pipeline
+from forte.utils import get_class
+
+from ft.onto.base_ontology import CoreferenceGroup
+from fortex.health.processors.coreference_processor import (
+    CoreferenceProcessor,
+)
+from fortex.spacy import SpacyProcessor
+
+
+@ddt
+class TestCoreferenceProcessor(unittest.TestCase):
+    @data(
+        (
+            "ADDENDUM:\n"
+            "RADIOLOGIC STUDIES: Radiologic studies also included "
+            "a chest CT, which confirmed cavitary lesions "
+            "in the left lung apex consistent with infectious process/tuberculosis.\n"
+            "This also moderate-sized left pleural effusion.\n"
+            "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, "
+            "but old infarction consistent with past medical history.\n"
+            "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum "
+            "most likely secondary to steoporosis.\n"
+            "These can be followed by repeat imaging as an outpatient.",
+            [["HEAD CT", "Head CT", "Abdominal CT"]],
+            "ft.onto.base_ontology.Document",
+        ),
+        (
+            "My sister has a dog. She loves him.",
+            [["My sister", "She"], ["a dog", "him"]],
+            "ft.onto.base_ontology.Document",
+        ),
+        (
+            "My sister loves her dog. My aunt also loves him.",
+            [["My sister", "her"], ["My aunt", "him"]],
+            "ft.onto.base_ontology.Sentence",
+            # Sentence-level coref resolution.
+        ),
+        (
+            "My sister loves her dog. My aunt also loves him.",
+            [["My sister", "her"], ["her dog", "him"]],
+            "ft.onto.base_ontology.Document",
+            # Document-level coref is different from sentence-level.
+        ),
+    )
+    @unpack
+    def test_inputs_and_entry_types(self, input_data, check_list, entry_type):
+        self.pl = Pipeline[DataPack](enforce_consistency=True)
+        self.pl.set_reader(StringReader())
+        self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
+        self.pl.add(
+            CoreferenceProcessor(),
+            {
+                "entry_type": entry_type,
+                "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention",
+                "lang": "en_core_web_sm",
+                "cfg_inference": {
+                    "greedyness": 0.5,
+                    "max_dist": 50,
+                    "max_dist_match": 500,
+                    "blacklist": True,
+                    "conv_dict": None,
+                },
+            },
+        )
+
+        self.pl.initialize()
+
+        entry_type = get_class(entry_type)
+
+        for pack in self.pl.process_dataset(input_data):
+            output_list = []
+
+            for entry in pack.get(entry_type):
+                for group in entry.get(CoreferenceGroup):
+                    members = sorted(group.get_members(), key=lambda x: x.begin)
+
+                    mention_texts = [member.text for member in members]
+                    output_list.append(mention_texts)
+
+            self.assertEqual(output_list, check_list, f"input: {input_data}")
+
+    @data(
+        (
+            "Deepika has a dog. She loves him. The movie star has always been fond of animals",
+            [["Deepika", "She", "him", "The movie star"]],
+            {},
+        ),
+        (
+            "Deepika has a dog. She loves him. The movie star has always been fond of animals",
+            [["Deepika", "She", "The movie star"], ["a dog", "him"]],
+            {"Deepika": ["woman", "actress"]},
+        ),
+    )
+    @unpack
+    def test_conv_dict(self, input_data, check_list, conv_dict):
+        entry_type = "ft.onto.base_ontology.Document"
+
+        self.pl = Pipeline[DataPack](enforce_consistency=True)
+        self.pl.set_reader(StringReader())
+        self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
+        self.pl.add(
+            CoreferenceProcessor(),
+            {
+                "entry_type": entry_type,
+                "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention",
+                "lang": "en_core_web_sm",
+                "cfg_inference": {
+                    "greedyness": 0.5,
+                    "max_dist": 50,
+                    "max_dist_match": 500,
+                    "blacklist": True,
+                    "conv_dict": conv_dict,
+                },
+            },
+        )
+
+        self.pl.initialize()
+
+        entry_type = get_class(entry_type)
+
+        for pack in self.pl.process_dataset(input_data):
+            output_list = []
+
+            for entry in pack.get(entry_type):
+                for group in entry.get(CoreferenceGroup):
+                    members = sorted(group.get_members(), key=lambda x: x.begin)
+
+                    mention_texts = [member.text for member in members]
+                    output_list.append(mention_texts)
+
+            self.assertEqual(output_list, check_list, f"input: {input_data}")
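
For reference, a minimal usage sketch of the new processor, mirroring coreference_processor_test.py above. It assumes `en_core_web_sm` is downloaded and NeuralCoref is built from source as described in the processor docstring; the input string and the printed clusters are illustrative only, and omitting the processor config falls back to `default_configs()`.

from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from ft.onto.base_ontology import CoreferenceGroup, Document
from fortex.spacy import SpacyProcessor
from fortex.health.processors.coreference_processor import (
    CoreferenceProcessor,
)

pl = Pipeline[DataPack]()
pl.set_reader(StringReader())
# SpacyProcessor performs the preprocessing that NeuralCoref relies on.
pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
# Without an explicit config, default_configs() applies: document-level
# resolution with MedicalEntityMention members and the default
# NeuralCoref inference settings.
pl.add(CoreferenceProcessor())
pl.initialize()

for pack in pl.process_dataset("My sister has a dog. She loves him."):
    for doc in pack.get(Document):
        for group in doc.get(CoreferenceGroup):
            # Expected clusters: ["My sister", "She"] and ["a dog", "him"].
            print([member.text for member in group.get_members()])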