From 880329f52d80042af4054ad44045958eecd2bee7 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Wed, 15 Jun 2022 09:39:10 +0800 Subject: [PATCH 01/63] add empty coref processor --- .../processors/coreference_processor.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 fortex/health/processors/coreference_processor.py diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py new file mode 100644 index 00000000..65cc1b81 --- /dev/null +++ b/fortex/health/processors/coreference_processor.py @@ -0,0 +1,90 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Coreference Processor +""" +import os +import re +from typing import Dict, List, Set + +from forte.common import Resources +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor + +# from ft.onto.base_ontology import TODO +# from ftx.medical.clinical_ontology import TODO + +__all__ = [ + "CoreferenceProcessor", +] + + +class CoreferenceProcessor(PackProcessor): + r""" + TODO: Add docstring + """ + + def __init__(self): + super().__init__() + # TODO + + def set_up(self, configs: Config): + pass + # TODO + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.set_up(configs) + + def _process(self, input_pack: DataPack): + r""" + TODO: Add docstring + """ + pass + # TODO + + @classmethod + def default_configs(cls): + r""" + TODO: Add docstring + """ + return { + # TODO + } + + def expected_types_and_attributes(self): + r""" + Method to add user specified expected type which would be checked + before running the processor if the pipeline is initialized with + `enforce_consistency=True` or + :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for + the pipeline. + """ + return { + # TODO + } + + def record(self, record_meta: Dict[str, Set[str]]): + r""" + Method to add output type record of `CoreferenceProcessor` which + is `"ftx.onto.clinical.TODO"` with attribute + `TODO` + to :attr:`forte.data.data_pack.Meta.record`. + + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + # TODO From d9b3f71cdc134440ca370b0d3ec4d936140f69c6 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Wed, 15 Jun 2022 17:13:06 +0800 Subject: [PATCH 02/63] add process method. 
TODO: correct span --- .../processors/coreference_processor.py | 67 ++++++++++++++++--- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 65cc1b81..76e1f579 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,14 +17,15 @@ import os import re from typing import Dict, List, Set +import importlib -from forte.common import Resources +from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -# from ft.onto.base_ontology import TODO -# from ftx.medical.clinical_ontology import TODO +from ft.onto.base_ontology import CoreferenceGroup, Token +from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle __all__ = [ "CoreferenceProcessor", @@ -39,10 +40,19 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() # TODO + self.coref = None # TODO: add type + self.spacy_nlp = None # TODO: find an elegant way to set this. def set_up(self, configs: Config): - pass - # TODO + import neuralcoref + self.spacy_nlp = self.resources.get('spacy_processor').nlp + if self.spacy_nlp is None: + raise ProcessExecutionException( + "The SpaCy pipeline is not initialized, maybe you " + "haven't called the initialization function." + ) + kwargs = {} # TODO + neuralcoref.add_to_pipe(self.spacy_nlp) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -52,8 +62,38 @@ def _process(self, input_pack: DataPack): r""" TODO: Add docstring """ - pass - # TODO + path_str, module_str = self.configs.entry_type.rsplit(".", 1) + # By default, path_str would be ft.onto.base_ontology and module_str would be Document # TODO: check + + mod = importlib.import_module(path_str) + entry = getattr(mod, module_str) + for entry_specified in input_pack.get(entry_type=entry): + result = self.spacy_nlp(entry_specified.text) + tokens = [(token.text, token.pos) for token in input_pack.get(Token, entry_specified)] + + article = MedicalArticle( + pack=input_pack, + begin=entry_specified.span.begin, + end=entry_specified.span.end, + ) + + if not result._.has_coref: + article.has_coref = False + article.coref_groups = [] + else: + article.has_coref = True + article.coref_groups = [] + for cluster in result._.coref_clusters: + + mentions = [] + for mention in cluster.mentions: + mention = MedicalEntityMention(input_pack, mention.start, mention.end) + mentions.append(mention) + + group = CoreferenceGroup(input_pack) + group.add_members(mentions) + + article.coref_groups.append(group) @classmethod def default_configs(cls): @@ -61,7 +101,8 @@ def default_configs(cls): TODO: Add docstring """ return { - # TODO + # TODO: remove unnecessaries + "entry_type": "ft.onto.base_ontology.Document", } def expected_types_and_attributes(self): @@ -79,12 +120,16 @@ def expected_types_and_attributes(self): def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which - is `"ftx.onto.clinical.TODO"` with attribute - `TODO` + is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute + `coref_clusters` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
- """ + """ # TODO: check docstring # TODO + record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { + "coref_groups", + "has_coref" + } From da363726f1935b724ffa98e59d9fed90c015bcfa Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Thu, 16 Jun 2022 14:48:31 +0800 Subject: [PATCH 03/63] fix output span --- .../processors/coreference_processor.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 76e1f579..cf4dc52e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -40,18 +40,19 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() # TODO - self.coref = None # TODO: add type - self.spacy_nlp = None # TODO: find an elegant way to set this. + self.coref = None # TODO: add type + self.spacy_nlp = None # TODO: find an elegant way to set this. def set_up(self, configs: Config): import neuralcoref - self.spacy_nlp = self.resources.get('spacy_processor').nlp + + self.spacy_nlp = self.resources.get("spacy_processor").nlp if self.spacy_nlp is None: raise ProcessExecutionException( "The SpaCy pipeline is not initialized, maybe you " "haven't called the initialization function." ) - kwargs = {} # TODO + kwargs = {} # TODO neuralcoref.add_to_pipe(self.spacy_nlp) def initialize(self, resources: Resources, configs: Config): @@ -69,7 +70,7 @@ def _process(self, input_pack: DataPack): entry = getattr(mod, module_str) for entry_specified in input_pack.get(entry_type=entry): result = self.spacy_nlp(entry_specified.text) - tokens = [(token.text, token.pos) for token in input_pack.get(Token, entry_specified)] + tokens = [(token) for token in input_pack.get(Token, entry_specified)] article = MedicalArticle( pack=input_pack, @@ -84,13 +85,17 @@ def _process(self, input_pack: DataPack): article.has_coref = True article.coref_groups = [] for cluster in result._.coref_clusters: - + mentions = [] for mention in cluster.mentions: - mention = MedicalEntityMention(input_pack, mention.start, mention.end) + mention = MedicalEntityMention( + input_pack, + tokens[mention.start].begin, + tokens[mention.end - 1].end, + ) mentions.append(mention) - group = CoreferenceGroup(input_pack) + group = CoreferenceGroup(input_pack) group.add_members(mentions) article.coref_groups.append(group) @@ -127,9 +132,9 @@ def record(self, record_meta: Dict[str, Set[str]]): Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
- """ # TODO: check docstring + """ # TODO: check docstring # TODO record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", - "has_coref" - } + "has_coref", + } From 6dafabebead99dc8dbf689c74eb6080b4979523c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Thu, 16 Jun 2022 18:01:26 +0800 Subject: [PATCH 04/63] add more default configs and comment --- .../processors/coreference_processor.py | 96 +++++++++++++++---- 1 file changed, 77 insertions(+), 19 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cf4dc52e..d434aa4d 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,11 @@ """ Coreference Processor """ -import os -import re -from typing import Dict, List, Set +from typing import Dict, Optional, Set import importlib +from boto import config + +from spacy.language import Language from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config @@ -34,14 +35,17 @@ class CoreferenceProcessor(PackProcessor): r""" - TODO: Add docstring + Implementation of this CoreferenceProcessor has been based on huggingface + NeuralCoref. Note that official released NeuralCoref uses a dated spaCy + version (2.1), which can cause segmentation fault with the spaCy we use (2.3). + Please install NeuralCoref by building from source: + + https://github.com/huggingface/neuralcoref """ def __init__(self): super().__init__() - # TODO - self.coref = None # TODO: add type - self.spacy_nlp = None # TODO: find an elegant way to set this. + self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way def set_up(self, configs: Config): import neuralcoref @@ -52,8 +56,19 @@ def set_up(self, configs: Config): "The SpaCy pipeline is not initialized, maybe you " "haven't called the initialization function." ) - kwargs = {} # TODO - neuralcoref.add_to_pipe(self.spacy_nlp) + + model = configs.model + cfg_inference = { + "greedyness": configs.greedyness, + "max_dist": configs.max_dist, + "max_dist_match": configs.max_dist_match, + "blacklist": configs.blacklist, + "store_scores": configs.store_scores, + "conv_dict": configs.conv_dict, + } + neuralcoref.add_to_pipe( + self.spacy_nlp, model=model, cfg_inference=cfg_inference + ) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -64,11 +79,12 @@ def _process(self, input_pack: DataPack): TODO: Add docstring """ path_str, module_str = self.configs.entry_type.rsplit(".", 1) - # By default, path_str would be ft.onto.base_ontology and module_str would be Document # TODO: check + # By default, path_str would be ft.onto.base_ontology + # and module_str would be Document # TODO: check mod = importlib.import_module(path_str) - entry = getattr(mod, module_str) - for entry_specified in input_pack.get(entry_type=entry): + entry_type = getattr(mod, module_str) + for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) tokens = [(token) for token in input_pack.get(Token, entry_specified)] @@ -100,14 +116,59 @@ def _process(self, input_pack: DataPack): article.coref_groups.append(group) + # @classmethod + # def default_configs(cls): + # r""" + # This defines a basic config structure for `CoreferenceProcessor`. 
+ + # Following are the keys for this dictionary: + # - `entry_type`: input entry type, + # - `model`: the neural net model to be used by NeuralCoref. If set to True + # (default), a new instance will be created with `NeuralCoref.Model()` + # in NeuralCoref.from_disk() or NeuralCoref.from_bytes(). + # - `cfg_inference`: A dict of configuration of inference. If set to an empty + # dict, the default configuration in NeuralCoref will be used. Available + # entries: `greedyness` (default 0.5), `max_dist` (default 50), + # `max_dist_match` (default 500), `blacklist` (default True), + # `store_scores` (default True), `conv_dict` (default None), + + # Returns: A dictionary with the default config for this processor. + # """ + # return { + # # TODO: remove unnecessaries + # "entry_type": "ft.onto.base_ontology.Document", + # "model": True, + # "cfg_inference": {}, + # } + @classmethod def default_configs(cls): r""" - TODO: Add docstring + This defines a basic config structure for `CoreferenceProcessor`. + + Following are the keys for this dictionary: + - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. + - `model`: the neural net model to be used by NeuralCoref. If set to `True` + (default), a new instance will be created with `NeuralCoref.Model()` + in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. + - `greedyness`: TODO. Default `0.5`. + - `max_dist`: TODO. Default `50`. + - `max_dist_match`: TODO. Default `500`. + - `blacklist`: TODO. Default `True`. + - `store_scores`: TODO. Default `True` + - `conv_dict`: TODO. Default `None`. + + Returns: A dictionary with the default config for this processor. """ return { - # TODO: remove unnecessaries "entry_type": "ft.onto.base_ontology.Document", + "model": True, + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, } def expected_types_and_attributes(self): @@ -118,22 +179,19 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. """ - return { - # TODO - } + return {"ft.onto.base_ontology.Document": set()} def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_clusters` + `coref_groups` and `has_coref` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
""" # TODO: check docstring - # TODO record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", From 0e18a8f370e24a442e5486d0ac9b478b78258e99 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:10:00 +0800 Subject: [PATCH 05/63] change nlp pipeline --- .../processors/coreference_processor.py | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index d434aa4d..7399a65c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,12 @@ """ Coreference Processor """ +from lib2to3.pgen2 import token from typing import Dict, Optional, Set import importlib -from boto import config +from numpy import append +import spacy from spacy.language import Language from forte.common import Resources, ProcessExecutionException @@ -25,7 +27,7 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token +from ft.onto.base_ontology import CoreferenceGroup, Token, EntityMention from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle __all__ = [ @@ -50,7 +52,11 @@ def __init__(self): def set_up(self, configs: Config): import neuralcoref - self.spacy_nlp = self.resources.get("spacy_processor").nlp + # TODO: remove these comments + # TODO: a more elegant way + # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor + self.spacy_nlp = spacy.load(configs.lang) + if self.spacy_nlp is None: raise ProcessExecutionException( "The SpaCy pipeline is not initialized, maybe you " @@ -76,17 +82,35 @@ def initialize(self, resources: Resources, configs: Config): def _process(self, input_pack: DataPack): r""" - TODO: Add docstring + Coreference resolution is done by + a spaCy pipeline with `NeuralCoref` in it. 
+ + We translate the output to `CoreferenceGroup` and + `MedicalEntityMention` """ - path_str, module_str = self.configs.entry_type.rsplit(".", 1) - # By default, path_str would be ft.onto.base_ontology - # and module_str would be Document # TODO: check - mod = importlib.import_module(path_str) - entry_type = getattr(mod, module_str) + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + # Default: Document + entry_type = load_module(self.configs.entry_type) + + # Default: MedicalEntityMention + mention_type = load_module(self.configs.mention_type) + for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - tokens = [(token) for token in input_pack.get(Token, entry_specified)] + + # TODO: remove these comments + # Marker155326 + # When tokenization is different from SpacyProcessor, this will be a bug: + token_begins = [] + token_ends = [] + for token in input_pack.get(Token, entry_specified): + token_begins.append(token.begin) + token_ends.append(token.end) article = MedicalArticle( pack=input_pack, @@ -104,10 +128,10 @@ def _process(self, input_pack: DataPack): mentions = [] for mention in cluster.mentions: - mention = MedicalEntityMention( + mention = mention_type( input_pack, - tokens[mention.start].begin, - tokens[mention.end - 1].end, + token_begins[mention.start], + token_ends[mention.end - 1], ) mentions.append(mention) @@ -116,31 +140,6 @@ def _process(self, input_pack: DataPack): article.coref_groups.append(group) - # @classmethod - # def default_configs(cls): - # r""" - # This defines a basic config structure for `CoreferenceProcessor`. - - # Following are the keys for this dictionary: - # - `entry_type`: input entry type, - # - `model`: the neural net model to be used by NeuralCoref. If set to True - # (default), a new instance will be created with `NeuralCoref.Model()` - # in NeuralCoref.from_disk() or NeuralCoref.from_bytes(). - # - `cfg_inference`: A dict of configuration of inference. If set to an empty - # dict, the default configuration in NeuralCoref will be used. Available - # entries: `greedyness` (default 0.5), `max_dist` (default 50), - # `max_dist_match` (default 500), `blacklist` (default True), - # `store_scores` (default True), `conv_dict` (default None), - - # Returns: A dictionary with the default config for this processor. - # """ - # return { - # # TODO: remove unnecessaries - # "entry_type": "ft.onto.base_ontology.Document", - # "model": True, - # "cfg_inference": {}, - # } - @classmethod def default_configs(cls): r""" @@ -148,6 +147,8 @@ def default_configs(cls): Following are the keys for this dictionary: - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. + - `mention_type`: Output mention type. Default `ftx.medical.clinical_ontology.MedicalEntityMention`. + It can also be set to `ft.onto.base_ontology.EntityMention`. - `model`: the neural net model to be used by NeuralCoref. If set to `True` (default), a new instance will be created with `NeuralCoref.Model()` in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. 
@@ -162,6 +163,8 @@ def default_configs(cls): """ return { "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", "model": True, "greedyness": 0.5, "max_dist": 50, From fcafb22468ee905f3aca4076917a634a894e8dc6 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:31:48 +0800 Subject: [PATCH 06/63] add more entries to MedicalArticle; add comment --- .../processors/coreference_processor.py | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 7399a65c..92f58950 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -121,9 +121,13 @@ def load_module(string): if not result._.has_coref: article.has_coref = False article.coref_groups = [] + article.coref_resolved = result._.coref_resolved + article.coref_scores = {} else: article.has_coref = True article.coref_groups = [] + article.coref_resolved = result._.coref_resolved + article.coref_scores = result._.coref_scores for cluster in result._.coref_clusters: mentions = [] @@ -146,18 +150,32 @@ def default_configs(cls): This defines a basic config structure for `CoreferenceProcessor`. Following are the keys for this dictionary: - - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. Default `ftx.medical.clinical_ontology.MedicalEntityMention`. - It can also be set to `ft.onto.base_ontology.EntityMention`. - - `model`: the neural net model to be used by NeuralCoref. If set to `True` - (default), a new instance will be created with `NeuralCoref.Model()` + - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: Output mention type. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `model`: the neural net model to be used by NeuralCoref. If set to `True`, + a new instance will be created with `NeuralCoref.Model()`. Default: `True`. in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. - - `greedyness`: TODO. Default `0.5`. - - `max_dist`: TODO. Default `50`. - - `max_dist_match`: TODO. Default `500`. - - `blacklist`: TODO. Default `True`. - - `store_scores`: TODO. Default `True` - - `conv_dict`: TODO. Default `None`. + - `greedyness` (`float`): A number between 0 and 1 determining how greedy + the model is about making coreference decisions + (more greedy means more coreference links). Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when considering possible + antecedents of the current mention. Decreasing the value will cause + the system to run faster but less accurately. Default: `50`. + - `max_dist_match` (`int`): The system will consider linking the current mention + to a preceding one further than max_dist away if they share a noun or + proper noun. In this case, it looks max_dist_match away instead. Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the + following list: ["i", "me", "my", "you", "your"]. Default `True`. + - `store_scores` (`bool`): Should the system store the scores for the coreferences + in annotations. 
Default: `True` + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use + to replace the embeddings of rare words (keys) by an average of the embeddings + of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the embeddings for the more + common woman and girl instead of the embedding of Angela. + This currently only works for single words (not for words groups). Default: `None`. Returns: A dictionary with the default config for this processor. """ @@ -188,7 +206,7 @@ def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_groups` and `has_coref` + `coref_groups`, `has_coref`, `coref_scores`, and `coref_resolved` to :attr:`forte.data.data_pack.Meta.record`. Args: @@ -198,4 +216,6 @@ def record(self, record_meta: Dict[str, Set[str]]): record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", + "coref_scores", + "coref_resolved", } From 0b675aa63b65ec213c4d0c9f191a88add15102c3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:36:33 +0800 Subject: [PATCH 07/63] fixed some comments --- fortex/health/processors/coreference_processor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 92f58950..ae7a685f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -38,10 +38,13 @@ class CoreferenceProcessor(PackProcessor): r""" Implementation of this CoreferenceProcessor has been based on huggingface - NeuralCoref. Note that official released NeuralCoref uses a dated spaCy + NeuralCoref. You can find more details in the original repo. + + Note that official released NeuralCoref uses a dated spaCy version (2.1), which can cause segmentation fault with the spaCy we use (2.3). - Please install NeuralCoref by building from source: + Please install NeuralCoref by building from source. + Referred repository link: https://github.com/huggingface/neuralcoref """ @@ -83,9 +86,9 @@ def initialize(self, resources: Resources, configs: Config): def _process(self, input_pack: DataPack): r""" Coreference resolution is done by - a spaCy pipeline with `NeuralCoref` in it. + a spaCy pipeline with `NeuralCoref` added. - We translate the output to `CoreferenceGroup` and + Then we translate the output to `CoreferenceGroup` and `MedicalEntityMention` """ From d0812eee84e8fe4eece88a6f5c0412751fb5b411 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:51:38 +0800 Subject: [PATCH 08/63] fix comments and format files --- fortex/health/processors/coreference_processor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index ae7a685f..c8c80e85 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -39,8 +39,8 @@ class CoreferenceProcessor(PackProcessor): r""" Implementation of this CoreferenceProcessor has been based on huggingface NeuralCoref. You can find more details in the original repo. 
- - Note that official released NeuralCoref uses a dated spaCy + + Note that the NeuralCoref package from PyPI uses a dated spaCy version (2.1), which can cause segmentation fault with the spaCy we use (2.3). Please install NeuralCoref by building from source. @@ -154,17 +154,17 @@ def default_configs(cls): Following are the keys for this dictionary: - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. + - `mention_type`: Output mention type. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. - `model`: the neural net model to be used by NeuralCoref. If set to `True`, a new instance will be created with `NeuralCoref.Model()`. Default: `True`. in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. - - `greedyness` (`float`): A number between 0 and 1 determining how greedy - the model is about making coreference decisions + - `greedyness` (`float`): A number between 0 and 1 determining how greedy + the model is about making coreference decisions (more greedy means more coreference links). Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when considering possible - antecedents of the current mention. Decreasing the value will cause + - `max_dist` (`int`): How many mentions back to look when considering possible + antecedents of the current mention. Decreasing the value will cause the system to run faster but less accurately. Default: `50`. - `max_dist_match` (`int`): The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or From 07694534df6a0e284400d7b9996da56b567cacce Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:08:00 +0800 Subject: [PATCH 09/63] update requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index c3516774..19b1db42 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,5 @@ git+https://git@github.com/asyml/forte-wrappers.git#egg=forte.huggingface&subdir dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 +# neuralcoref (build from source) for CoreferenceProcessor +git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref From 5a55b419fbc4659e772959ff2243896cea8db975 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:18:10 +0800 Subject: [PATCH 10/63] update setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 44e57751..5d5a5f20 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ From fdc343e714aa18a49fdb133b721b55acb4fa6ea1 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:50:46 +0800 Subject: [PATCH 11/63] add unit test --- .../processors/coreference_processor_test.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 tests/forte_medical/processors/coreference_processor_test.py diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py new file mode 100644 index 00000000..ee2b3320 --- /dev/null +++ b/tests/forte_medical/processors/coreference_processor_test.py 
@@ -0,0 +1,121 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for CoreferenceProcessor +""" + +import unittest +from ddt import data, ddt + +from forte.data.data_pack import DataPack +from forte.data.readers import StringReader +from forte.pipeline import Pipeline + +from ftx.medical.clinical_ontology import MedicalArticle +from ft.onto.base_ontology import ( + Token, +) + +from fortex.spacy import SpacyProcessor +from fortex.health.processors.coreference_processor import ( + CoreferenceProcessor, +) + + +class TestCoreferenceProcessor(unittest.TestCase): + def setUp(self): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add( + SpacyProcessor(), + {"processors": ["sentence", "tokenize"], "lang": "en_core_web_sm"}, + ) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": True, + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, + ) + + self.pl.initialize() + + @data("My sister has a dog. She loves him.") + def test_daily_language(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["My sister", "She"], ["a dog", "him"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list + + @data("My sister has a dog. She loves him.") + def test_daily_language(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["My sister", "She"], ["a dog", "him"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list + + @data( + """ADDENDUM: +RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. +This also moderate-sized left pleural effusion. +HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, but old infarction consistent with past medical history. +ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum most likely secondary to steoporosis. 
+These can be followed by repeat imaging as an outpatient.""" + ) + def test_medical_notes(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["HEAD CT", "Head CT", "Abdominal CT"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list From 85bf9bb2526d50e297f78bdf3021965dff4e2d08 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:51:53 +0800 Subject: [PATCH 12/63] remove duplicated definition --- .../processors/coreference_processor_test.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index ee2b3320..5bf70b35 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,7 +16,7 @@ """ import unittest -from ddt import data, ddt +from ddt import data from forte.data.data_pack import DataPack from forte.data.readers import StringReader @@ -77,24 +77,6 @@ def test_daily_language(self, input_data): output_list.append(mention_texts) assert output_list == check_list - @data("My sister has a dog. She loves him.") - def test_daily_language(self, input_data): - for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref == True - - coref_groups = article.coref_groups - output_list = [] - check_list = [["My sister", "She"], ["a dog", "him"]] - for group in coref_groups: - members = [member for member in group.get_members()] - members = sorted(members, key=lambda x: x.begin) - - mention_texts = [member.text for member in members] - output_list.append(mention_texts) - assert output_list == check_list - @data( """ADDENDUM: RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. 
From 957992303601f459de57610cc7a86964b28e3aaf Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:56:18 +0800 Subject: [PATCH 13/63] fix ddt --- tests/forte_medical/processors/coreference_processor_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 5bf70b35..d8544975 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,7 +16,7 @@ """ import unittest -from ddt import data +from ddt import data, ddt from forte.data.data_pack import DataPack from forte.data.readers import StringReader @@ -32,7 +32,7 @@ CoreferenceProcessor, ) - +@ddt class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): self.pl = Pipeline[DataPack](enforce_consistency=True) From 4e41b349b26d6268ad51c57eaa481bf44fcdac90 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:57:49 +0800 Subject: [PATCH 14/63] fix import --- fortex/health/processors/coreference_processor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index c8c80e85..6b6fbebd 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,14 +14,14 @@ """ Coreference Processor """ -from lib2to3.pgen2 import token from typing import Dict, Optional, Set import importlib -from numpy import append import spacy from spacy.language import Language +import neuralcoref + from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config from forte.data.data_pack import DataPack @@ -53,8 +53,6 @@ def __init__(self): self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way def set_up(self, configs: Config): - import neuralcoref - # TODO: remove these comments # TODO: a more elegant way # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor From 443f886d1094514f541551288aa257c1e4932925 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:04:50 +0800 Subject: [PATCH 15/63] formatting --- .../processors/coreference_processor_test.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index d8544975..0ed8858e 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -32,6 +32,7 @@ CoreferenceProcessor, ) + @ddt class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): @@ -78,12 +79,16 @@ def test_daily_language(self, input_data): assert output_list == check_list @data( - """ADDENDUM: -RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. -This also moderate-sized left pleural effusion. -HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, but old infarction consistent with past medical history. -ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum most likely secondary to steoporosis. 
-These can be followed by repeat imaging as an outpatient.""" + "ADDENDUM:\n", + "RADIOLOGIC STUDIES: Radiologic studies also included ", + "a chest CT, which confirmed cavitary lesions ", + "in the left lung apex consistent with infectious process/tuberculosis.\n", + "This also moderate-sized left pleural effusion.\n", + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, ", + "but old infarction consistent with past medical history.\n", + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum ", + "most likely secondary to steoporosis.\n", + "These can be followed by repeat imaging as an outpatient.", ) def test_medical_notes(self, input_data): for pack in self.pl.process_dataset(input_data): From 5f3cf34e0552f3ef6230330e3c092cfaff8bb94b Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:10:07 +0800 Subject: [PATCH 16/63] remove long lines --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6b6fbebd..8f9d28f7 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -55,7 +55,8 @@ def __init__(self): def set_up(self, configs: Config): # TODO: remove these comments # TODO: a more elegant way - # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor + # borrow nlp from SpacyProcessor + # self.spacy_nlp = self.resources.get("spacy_processor").nlp self.spacy_nlp = spacy.load(configs.lang) if self.spacy_nlp is None: From a11f2090b8310a5dac805105ef279b9598897b00 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:13:39 +0800 Subject: [PATCH 17/63] remove unused import --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 8f9d28f7..4659402f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -27,8 +27,8 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token, EntityMention -from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle +from ft.onto.base_ontology import CoreferenceGroup, Token +from ftx.medical.clinical_ontology import MedicalArticle __all__ = [ "CoreferenceProcessor", From 07dbb02d2233816b7e9ed8fb83cc49abd438bbc4 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:31:20 +0800 Subject: [PATCH 18/63] add cython to dependency --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 19b1db42..bb8ee956 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ setuptools~=57.0.0 transformers~=4.2.2 # neuralcoref (build from source) for CoreferenceProcessor git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref +cython>=0.25 diff --git a/setup.py b/setup.py index 5d5a5f20..cab3d825 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "cython>=0.25", "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], 
extras_require={ From 008eb1b79addbc5a5b0971f405a2acddec19f70d Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:12:23 +0800 Subject: [PATCH 19/63] delay the installation of neuralcoref --- requirements.txt | 8 +++++--- setup.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index bb8ee956..8f6ed340 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,6 +26,8 @@ git+https://git@github.com/asyml/forte-wrappers.git#egg=forte.huggingface&subdir dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 -# neuralcoref (build from source) for CoreferenceProcessor -git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref -cython>=0.25 + +# It is annoying that if we install neuralcoref and spacy at the same +# time, neuralcoref will throw "Cython failed" during building. +# Therefore, we must install neuralcoref after spacy is installed. +# git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref \ No newline at end of file diff --git a/setup.py b/setup.py index cab3d825..eba440f4 100644 --- a/setup.py +++ b/setup.py @@ -41,8 +41,6 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - "cython>=0.25", - "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ @@ -50,6 +48,10 @@ "testfixtures", "transformers==4.2.2", "protobuf==3.19.4", + # It is annoying that if we install neuralcoref and spacy at the same + # time, neuralcoref will throw "Cython failed" during building. + # Therefore, we must install neuralcoref after spacy is installed. + # "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], }, entry_points={ From ce2fd6f23d5b7b7c982f47c422b08b519a1459b0 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:14:15 +0800 Subject: [PATCH 20/63] put installation of neuralcoref in workflow --- .github/workflows/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3f31b46a..5b5866e8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -89,6 +89,10 @@ jobs: cd forte-wrappers pip install src/spacy + - name: Install NeuralCoref + run: | + git install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref + - name: Test with pytest and run coverage run: | coverage run -m pytest tests/ From e832289f3b9a92a17415361697aae31fb7597b31 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:28:03 +0800 Subject: [PATCH 21/63] fix typo --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5b5866e8..89bd3a6e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -91,7 +91,7 @@ jobs: - name: Install NeuralCoref run: | - git install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref + pip install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref - name: Test with pytest and run coverage run: | From c18f7043ffd048a7e98f434c9e7b3610f07da62f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:29:03 +0800 Subject: [PATCH 22/63] skip mypy's None is not callable bug --- fortex/health/processors/coreference_processor.py | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4659402f..cc5a609c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,11 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Optional, Set +from typing import Dict, Set #, Optional import importlib import spacy -from spacy.language import Language +# from spacy.language import Language import neuralcoref @@ -50,7 +50,7 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() - self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way + self.spacy_nlp = None # TODO: a more elegant way def set_up(self, configs: Config): # TODO: remove these comments From a9cf38d98a87e001bae64fe55c11dd5d657f29dc Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:32:42 +0800 Subject: [PATCH 23/63] black format --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cc5a609c..6d12f8c2 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Set #, Optional +from typing import Dict, Set # , Optional import importlib import spacy + # from spacy.language import Language import neuralcoref From 2c346fb6a757d21ac1bc0ef4170f6660ee26a753 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:38:49 +0800 Subject: [PATCH 24/63] add spacy --- requirements.txt | 1 + setup.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8f6ed340..49661324 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 +spacy>=2.3.0, <=2.3.5 # It is annoying that if we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building. # Therefore, we must install neuralcoref after spacy is installed. 
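The install-order note above is why NeuralCoref is kept out of the static requirements and installed in a dedicated CI step instead (patch 20/63 above). For reference, a minimal stand-alone sketch of the NeuralCoref surface that `CoreferenceProcessor` consumes once the package is built from source, assuming the `en_core_web_sm` model is available: the `doc._` extension attributes and the cluster/mention objects mirror the calls made in `_process`, the sample sentence is the one from the unit test, and the bare `add_to_pipe(nlp)` call is a simplification of the configured call made in `set_up`.

    import spacy
    import neuralcoref  # built from source, per the comment above

    nlp = spacy.load("en_core_web_sm")
    # set_up() additionally forwards the model and inference options here
    neuralcoref.add_to_pipe(nlp)

    doc = nlp("My sister has a dog. She loves him.")
    if doc._.has_coref:
        for cluster in doc._.coref_clusters:
            # each cluster groups the spaCy spans of one coreference chain
            print([mention.text for mention in cluster.mentions])
        # text with each mention replaced by its most representative mention
        print(doc._.coref_resolved)
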
diff --git a/setup.py b/setup.py index eba440f4..ce1e862b 100644 --- a/setup.py +++ b/setup.py @@ -6,23 +6,22 @@ long_description = (Path(__file__).parent / "README.md").read_text() if sys.version_info < (3, 6): - sys.exit('Python>=3.6 is required by forte-medical.') + sys.exit("Python>=3.6 is required by forte-medical.") setuptools.setup( name="forte.health", - version='0.1.0', + version="0.1.0", url="https://github.com/asyml/ForteHealth", description="NLP pipeline framework for biomedical and clinical domains", long_description=long_description, long_description_content_type="text/markdown", - license='Apache License Version 2.0', + license="Apache License Version 2.0", packages=setuptools.find_namespace_packages( - include=['fortex.health', 'ftx.*'], - exclude=["scripts*", "examples*", "tests*"] + include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], install_requires=[ - 'forte~=0.2.0', + "forte~=0.2.0", "sortedcontainers==2.1.0", "numpy>=1.16.6", "jsonpickle==1.4", @@ -41,6 +40,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "spacy>=2.3.0, <=2.3.5", ], extras_require={ "test": [ @@ -55,12 +55,12 @@ ], }, entry_points={ - 'console_scripts': [ + "console_scripts": [ "forte-medical-train=forte_medical_cli.train:main", "forte-medical-process=forte_medical_cli.process:main", "forte-medical-evaluate=forte_medical_cli.evaluate:main", ] }, include_package_data=True, - python_requires='>=3.6' + python_requires=">=3.6", ) From 037f33bc81d420243c50c246953927f62031d9f3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:58:44 +0800 Subject: [PATCH 25/63] add cython and pytest --- requirements.txt | 2 ++ setup.py | 2 ++ tests/forte_medical/processors/coreference_processor_test.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 49661324..ee79c660 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,8 @@ setuptools~=57.0.0 transformers~=4.2.2 spacy>=2.3.0, <=2.3.5 +cython>=0.25 +pytest # It is annoying that if we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building. # Therefore, we must install neuralcoref after spacy is installed. 
diff --git a/setup.py b/setup.py index ce1e862b..8ff7e22b 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ "fastapi==0.65.2", "uvicorn==0.14.0", "spacy>=2.3.0, <=2.3.5", + "cython>=0.25", + "pytest", ], extras_require={ "test": [ diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 0ed8858e..b20e3050 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -65,7 +65,7 @@ def test_daily_language(self, input_data): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref - assert has_coref == True + assert has_coref is True coref_groups = article.coref_groups output_list = [] @@ -94,7 +94,7 @@ def test_medical_notes(self, input_data): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref - assert has_coref == True + assert has_coref is True coref_groups = article.coref_groups output_list = [] From 5856d15b4520dcf5aae64b14a32a04009667718c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 15:00:30 +0800 Subject: [PATCH 26/63] remove commented code --- fortex/health/processors/coreference_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6d12f8c2..4d9d4ad1 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,13 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Set # , Optional +from typing import Dict, Set import importlib import spacy -# from spacy.language import Language - import neuralcoref from forte.common import Resources, ProcessExecutionException From 03b6d69eea50bdb2d0cb882fea252b9ec70bbd87 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 15:06:07 +0800 Subject: [PATCH 27/63] fix unit test data --- .../processors/coreference_processor_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index b20e3050..b8d68cda 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -80,14 +80,14 @@ def test_daily_language(self, input_data): @data( "ADDENDUM:\n", - "RADIOLOGIC STUDIES: Radiologic studies also included ", - "a chest CT, which confirmed cavitary lesions ", - "in the left lung apex consistent with infectious process/tuberculosis.\n", - "This also moderate-sized left pleural effusion.\n", - "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, ", - "but old infarction consistent with past medical history.\n", - "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum ", - "most likely secondary to steoporosis.\n", + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no 
lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" "These can be followed by repeat imaging as an outpatient.", ) def test_medical_notes(self, input_data): From 9e5521160e0a44289239410b09e0e14c61dc0eba Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 19:10:57 +0800 Subject: [PATCH 28/63] fix unit test data 2 --- tests/forte_medical/processors/coreference_processor_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index b8d68cda..7f78de83 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -79,7 +79,7 @@ def test_daily_language(self, input_data): assert output_list == check_list @data( - "ADDENDUM:\n", + "ADDENDUM:\n" "RADIOLOGIC STUDIES: Radiologic studies also included " "a chest CT, which confirmed cavitary lesions " "in the left lung apex consistent with infectious process/tuberculosis.\n" From 2730b885d57bd1b4d719dbf38176d2ec9dc1ebaf Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:40:35 +0800 Subject: [PATCH 29/63] remove the dependency of SpacyProcessor --- .../processors/coreference_processor.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4d9d4ad1..cb2c011e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -26,7 +26,7 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token +from ft.onto.base_ontology import CoreferenceGroup from ftx.medical.clinical_ontology import MedicalArticle __all__ = [ @@ -49,13 +49,9 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() - self.spacy_nlp = None # TODO: a more elegant way + self.spacy_nlp = None def set_up(self, configs: Config): - # TODO: remove these comments - # TODO: a more elegant way - # borrow nlp from SpacyProcessor - # self.spacy_nlp = self.resources.get("spacy_processor").nlp self.spacy_nlp = spacy.load(configs.lang) if self.spacy_nlp is None: @@ -104,15 +100,6 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - # TODO: remove these comments - # Marker155326 - # When tokenization is different from SpacyProcessor, this will be a bug: - token_begins = [] - token_ends = [] - for token in input_pack.get(Token, entry_specified): - token_begins.append(token.begin) - token_ends.append(token.end) - article = MedicalArticle( pack=input_pack, begin=entry_specified.span.begin, @@ -135,8 +122,8 @@ def load_module(string): for mention in cluster.mentions: mention = mention_type( input_pack, - token_begins[mention.start], - token_ends[mention.end - 1], + mention.start_char, + mention.end_char, ) mentions.append(mention) From 5f6d0242e4f168c536b66b70b984afb767b1be72 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:43:05 +0800 Subject: [PATCH 30/63] update unit test --- .../forte_medical/processors/coreference_processor_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py 
b/tests/forte_medical/processors/coreference_processor_test.py index 7f78de83..93ad1cfa 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -26,8 +26,6 @@ from ft.onto.base_ontology import ( Token, ) - -from fortex.spacy import SpacyProcessor from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -38,10 +36,6 @@ class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): self.pl = Pipeline[DataPack](enforce_consistency=True) self.pl.set_reader(StringReader()) - self.pl.add( - SpacyProcessor(), - {"processors": ["sentence", "tokenize"], "lang": "en_core_web_sm"}, - ) self.pl.add( CoreferenceProcessor(), { From c9d56a412802c86e5728e7f3c07a9b7acab3e7df Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:43:53 +0800 Subject: [PATCH 31/63] remove TODO --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cb2c011e..e2e8423d 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -200,7 +200,7 @@ def record(self, record_meta: Dict[str, Set[str]]): Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. - """ # TODO: check docstring + """ record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", From 09114ad6fc1bbab71abf12d59c07e0ecc4278e84 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:55:22 +0800 Subject: [PATCH 32/63] add load_lang_model --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index e2e8423d..36e9b8c0 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,7 +17,7 @@ from typing import Dict, Set import importlib -import spacy +from fortex.spacy.spacy_processors import load_lang_model import neuralcoref @@ -52,7 +52,7 @@ def __init__(self): self.spacy_nlp = None def set_up(self, configs: Config): - self.spacy_nlp = spacy.load(configs.lang) + self.spacy_nlp = load_lang_model(configs.lang) if self.spacy_nlp is None: raise ProcessExecutionException( From cb676c38f4adb7eb819d6fcf21f7be27a8efa59b Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:59:59 +0800 Subject: [PATCH 33/63] remove spacy from requirements and setup --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ee79c660..fc0f8c81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,7 @@ dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 -spacy>=2.3.0, <=2.3.5 +# spacy>=2.3.0, <=2.3.5 # will be installed by forte.spacy cython>=0.25 pytest # It is annoying that if we install neuralcoref and spacy at the same diff --git a/setup.py b/setup.py index 8ff7e22b..ed849bfa 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - "spacy>=2.3.0, <=2.3.5", + # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy "cython>=0.25", 
"pytest", ], From e5ef675970c5bbd869166d95fd9f4c36382c9cb3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 21:01:55 +0800 Subject: [PATCH 34/63] change import order --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 36e9b8c0..6c89ef1c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,8 +17,6 @@ from typing import Dict, Set import importlib -from fortex.spacy.spacy_processors import load_lang_model - import neuralcoref from forte.common import Resources, ProcessExecutionException @@ -29,6 +27,8 @@ from ft.onto.base_ontology import CoreferenceGroup from ftx.medical.clinical_ontology import MedicalArticle +from fortex.spacy.spacy_processors import load_lang_model + __all__ = [ "CoreferenceProcessor", ] From 8b18e6740dc81c16c9495b743cc0b2a63ea5cc8f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 12:02:12 +0800 Subject: [PATCH 35/63] use ddt data and unpack --- .../processors/coreference_processor_test.py | 54 +++++++------------ 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 93ad1cfa..d5eb3cbd 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,16 +16,13 @@ """ import unittest -from ddt import data, ddt +from ddt import data, ddt, unpack from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline from ftx.medical.clinical_ontology import MedicalArticle -from ft.onto.base_ontology import ( - Token, -) from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -54,37 +51,27 @@ def setUp(self): self.pl.initialize() - @data("My sister has a dog. 
She loves him.") - def test_daily_language(self, input_data): - for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref is True - - coref_groups = article.coref_groups - output_list = [] - check_list = [["My sister", "She"], ["a dog", "him"]] - for group in coref_groups: - members = [member for member in group.get_members()] - members = sorted(members, key=lambda x: x.begin) - - mention_texts = [member.text for member in members] - output_list.append(mention_texts) - assert output_list == check_list - @data( - "ADDENDUM:\n" - "RADIOLOGIC STUDIES: Radiologic studies also included " - "a chest CT, which confirmed cavitary lesions " - "in the left lung apex consistent with infectious process/tuberculosis.\n" - "This also moderate-sized left pleural effusion.\n" - "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " - "but old infarction consistent with past medical history.\n" - "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " - "most likely secondary to steoporosis.\n" - "These can be followed by repeat imaging as an outpatient.", + ( + "ADDENDUM:\n" + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" + "These can be followed by repeat imaging as an outpatient.", + [["HEAD CT", "Head CT", "Abdominal CT"]], + ), + ( + "My sister has a dog. She loves him.", + [["My sister", "She"], ["a dog", "him"]], + ), ) - def test_medical_notes(self, input_data): + @unpack + def test_medical_notes(self, input_data, check_list): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref @@ -92,7 +79,6 @@ def test_medical_notes(self, input_data): coref_groups = article.coref_groups output_list = [] - check_list = [["HEAD CT", "Head CT", "Abdominal CT"]] for group in coref_groups: members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) From 299d6d3421b01387ba47bbbd03502264380459d1 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 16:37:14 +0800 Subject: [PATCH 36/63] update config structure --- .../processors/coreference_processor.py | 121 +++++++++--------- 1 file changed, 62 insertions(+), 59 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6c89ef1c..3680ad73 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -25,7 +25,6 @@ from forte.processors.base import PackProcessor from ft.onto.base_ontology import CoreferenceGroup -from ftx.medical.clinical_ontology import MedicalArticle from fortex.spacy.spacy_processors import load_lang_model @@ -60,15 +59,8 @@ def set_up(self, configs: Config): "haven't called the initialization function." 
) - model = configs.model - cfg_inference = { - "greedyness": configs.greedyness, - "max_dist": configs.max_dist, - "max_dist_match": configs.max_dist_match, - "blacklist": configs.blacklist, - "store_scores": configs.store_scores, - "conv_dict": configs.conv_dict, - } + model = configs.model if configs.model != "use_default_model" else True + cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( self.spacy_nlp, model=model, cfg_inference=cfg_inference ) @@ -82,8 +74,7 @@ def _process(self, input_pack: DataPack): Coreference resolution is done by a spaCy pipeline with `NeuralCoref` added. - Then we translate the output to `CoreferenceGroup` and - `MedicalEntityMention` + Then we translate the output to `CoreferenceGroup`. """ def load_module(string): @@ -100,22 +91,9 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - article = MedicalArticle( - pack=input_pack, - begin=entry_specified.span.begin, - end=entry_specified.span.end, - ) - if not result._.has_coref: - article.has_coref = False - article.coref_groups = [] - article.coref_resolved = result._.coref_resolved - article.coref_scores = {} + continue else: - article.has_coref = True - article.coref_groups = [] - article.coref_resolved = result._.coref_resolved - article.coref_scores = result._.coref_scores for cluster in result._.coref_clusters: mentions = [] @@ -130,48 +108,79 @@ def load_module(string): group = CoreferenceGroup(input_pack) group.add_members(mentions) - article.coref_groups.append(group) - @classmethod def default_configs(cls): r""" This defines a basic config structure for `CoreferenceProcessor`. Following are the keys for this dictionary: - - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. - Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - It can also be set to `"ft.onto.base_ontology.EntityMention"`. - - `model`: the neural net model to be used by NeuralCoref. If set to `True`, - a new instance will be created with `NeuralCoref.Model()`. Default: `True`. - in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. + - `entry_type`: Input entry type. You can change the context of + coreference resolution by setting this parameter. For example, + if you want to do coreference resolution within documents, set + it to `"ft.onto.base_ontology.Document"`. If you want to do + coreference resolution within sentences, set it to + `"ft.onto.base_ontology.Sentence"`. + Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: The type of members in `CoreferenceGroup`. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `model`: the neural net model to be used by NeuralCoref. If set to + `"use_default_model"`, a pre-trained neural net will be downloaded and cached. + If set to your customized model, the model needs to be a tuple containing a + `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in + https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx + for reference of how the default model is defined. + Default: `"use_default_model"`. + - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See + `get_default_cfg_inference` for default values, and see + https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters + for the meaing of these parameters. 
+ + Returns: A dictionary with the default config for this processor. + """ + return { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": cls.get_default_cfg_inference(), + } + + @classmethod + def get_default_cfg_inference(cls): + """ + This defines the default inference config of NeuralCoref. + + Following are the keys for this dictionary: - `greedyness` (`float`): A number between 0 and 1 determining how greedy the model is about making coreference decisions - (more greedy means more coreference links). Default: `0.5`. + (more greedy means more coreference links). + Default: `0.5`. - `max_dist` (`int`): How many mentions back to look when considering possible antecedents of the current mention. Decreasing the value will cause - the system to run faster but less accurately. Default: `50`. + the system to run faster but less accurately. + Default: `50`. - `max_dist_match` (`int`): The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or - proper noun. In this case, it looks max_dist_match away instead. Default: `500`. + proper noun. In this case, it looks max_dist_match away instead. + Default: `500`. - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the - following list: ["i", "me", "my", "you", "your"]. Default `True`. + following list: ["i", "me", "my", "you", "your"]. + Default `True`. - `store_scores` (`bool`): Should the system store the scores for the coreferences - in annotations. Default: `True` + in annotations. + Default: `True` - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use to replace the embeddings of rare words (keys) by an average of the embeddings - of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the embeddings for the more - common woman and girl instead of the embedding of Angela. - This currently only works for single words (not for words groups). Default: `None`. + of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the embeddings for the more + common woman and girl instead of the embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. - Returns: A dictionary with the default config for this processor. + Returns: A dictionary with the default inference config of NeuralCoref. """ return { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "model": True, "greedyness": 0.5, "max_dist": 50, "max_dist_match": 500, @@ -188,22 +197,16 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. """ - return {"ft.onto.base_ontology.Document": set()} + return {self.configs.entry_type: set("text")} def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which - is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_groups`, `has_coref`, `coref_scores`, and `coref_resolved` - to :attr:`forte.data.data_pack.Meta.record`. 
+ is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute + `members` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. """ - record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { - "coref_groups", - "has_coref", - "coref_scores", - "coref_resolved", - } + record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} From 1a0e23926df2466a711790d530991d0beb540b2c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 16:53:57 +0800 Subject: [PATCH 37/63] add comment for lang --- fortex/health/processors/coreference_processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 3680ad73..f14ba9c8 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -124,6 +124,9 @@ def default_configs(cls): - `mention_type`: The type of members in `CoreferenceGroup`. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing + steps for NeuralCoref. + Default: `"en_core_web_sm"`. - `model`: the neural net model to be used by NeuralCoref. If set to `"use_default_model"`, a pre-trained neural net will be downloaded and cached. If set to your customized model, the model needs to be a tuple containing a From 36bcabade1d160eafaf6291e427657874ef38a6a Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 17:10:19 +0800 Subject: [PATCH 38/63] fix set() bug --- .../processors/coreference_processor.py | 2 +- .../processors/coreference_processor_test.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index f14ba9c8..317b9208 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -200,7 +200,7 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. 
""" - return {self.configs.entry_type: set("text")} + return {self.configs.entry_type: {"text"}} def record(self, record_meta: Dict[str, Set[str]]): r""" diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index d5eb3cbd..293ddae5 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -22,7 +22,7 @@ from forte.data.readers import StringReader from forte.pipeline import Pipeline -from ftx.medical.clinical_ontology import MedicalArticle +from ft.onto.base_ontology import Document, CoreferenceGroup from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -39,13 +39,15 @@ def setUp(self): "entry_type": "ft.onto.base_ontology.Document", "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": True, - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "store_scores": True, - "conv_dict": None, + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, }, ) @@ -73,13 +75,9 @@ def setUp(self): @unpack def test_medical_notes(self, input_data, check_list): for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref is True - - coref_groups = article.coref_groups + for document in pack.get(Document): output_list = [] - for group in coref_groups: + for group in pack.get(CoreferenceGroup, document): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) From 3a20b8dbad909ff6879e8e3c18e139d6b51f1aeb Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:25:32 +0800 Subject: [PATCH 39/63] add offset calculation assertion --- fortex/health/processors/coreference_processor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 317b9208..f9b56130 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -98,11 +98,16 @@ def load_module(string): mentions = [] for mention in cluster.mentions: + mention_text = mention.text mention = mention_type( input_pack, - mention.start_char, - mention.end_char, + mention.start_char + entry_specified.begin, + mention.end_char + entry_specified.begin, ) + assert (mention.text == mention_text, # TODO: remove assertion? + f"The processor extracted mention {mention.text}" + f" which is different from the original mention {mention_text}." + f"The offeset calculation is wrong.") mentions.append(mention) group = CoreferenceGroup(input_pack) @@ -200,7 +205,8 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. 
""" - return {self.configs.entry_type: {"text"}} + # return {self.configs.entry_type: {"text"}} # TODO: fix this + return {self.configs.entry_type: set()} def record(self, record_meta: Dict[str, Set[str]]): r""" From 26efabd216699311bcb67b12b0d498feb50a0839 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:32:35 +0800 Subject: [PATCH 40/63] formatting --- fortex/health/processors/coreference_processor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index f9b56130..08d78219 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -104,10 +104,12 @@ def load_module(string): mention.start_char + entry_specified.begin, mention.end_char + entry_specified.begin, ) - assert (mention.text == mention_text, # TODO: remove assertion? - f"The processor extracted mention {mention.text}" - f" which is different from the original mention {mention_text}." - f"The offeset calculation is wrong.") + assert ( + mention.text == mention_text, # TODO: remove assertion? + f"The processor extracted mention {mention.text}" + f" which is different from the original mention {mention_text}." + f"The offeset calculation is wrong.", + ) mentions.append(mention) group = CoreferenceGroup(input_pack) From f7db0245aefdcffc2034f8b4e780e11f6663e649 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:34:12 +0800 Subject: [PATCH 41/63] udpate test --- .../processors/coreference_processor_test.py | 99 ++++++++++++++----- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 293ddae5..71a892c3 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -15,6 +15,7 @@ Unit tests for CoreferenceProcessor """ +import importlib import unittest from ddt import data, ddt, unpack @@ -26,32 +27,33 @@ from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) +from fortex.spacy import SpacyProcessor @ddt class TestCoreferenceProcessor(unittest.TestCase): - def setUp(self): - self.pl = Pipeline[DataPack](enforce_consistency=True) - self.pl.set_reader(StringReader()) - self.pl.add( - CoreferenceProcessor(), - { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "model": "use_default_model", - "cfg_inference": { - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "store_scores": True, - "conv_dict": None, - }, - }, - ) + # def setUp(self): + # self.pl = Pipeline[DataPack](enforce_consistency=True) + # self.pl.set_reader(StringReader()) + # self.pl.add( + # CoreferenceProcessor(), + # { + # "entry_type": "ft.onto.base_ontology.Document", + # "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + # "lang": "en_core_web_sm", + # "model": "use_default_model", + # "cfg_inference": { + # "greedyness": 0.5, + # "max_dist": 50, + # "max_dist_match": 500, + # "blacklist": True, + # "store_scores": True, + # "conv_dict": None, + # }, + # }, + # ) - self.pl.initialize() + # self.pl.initialize() @data( ( @@ -66,21 +68,66 @@ def setUp(self): "most likely secondary to steoporosis.\n" "These can be 
followed by repeat imaging as an outpatient.", [["HEAD CT", "Head CT", "Abdominal CT"]], + "ft.onto.base_ontology.Document", ), ( "My sister has a dog. She loves him.", [["My sister", "She"], ["a dog", "him"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["My aunt", "him"]], + "ft.onto.base_ontology.Sentence", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["her dog", "him"]], + "ft.onto.base_ontology.Document", + # Document-level coref is different from sentence-level. ), ) @unpack - def test_medical_notes(self, input_data, check_list): + def test_medical_notes(self, input_data, check_list, entry_type): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, + }, + ) + + self.pl.initialize() + + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + entry_type = load_module(entry_type) + for pack in self.pl.process_dataset(input_data): - for document in pack.get(Document): - output_list = [] - for group in pack.get(CoreferenceGroup, document): + output_list = [] + + for document in pack.get(entry_type): + for group in document.get(CoreferenceGroup): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) mention_texts = [member.text for member in members] output_list.append(mention_texts) - assert output_list == check_list + + self.assertEqual(output_list, check_list, f"input: {document.text}") From f453e60f94db9ccdd9b45132f0de26479e0eec40 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:44:37 +0800 Subject: [PATCH 42/63] shorten comment --- .../processors/coreference_processor.py | 96 +++++++++++-------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 08d78219..d591fbbe 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -35,12 +35,12 @@ class CoreferenceProcessor(PackProcessor): r""" - Implementation of this CoreferenceProcessor has been based on huggingface - NeuralCoref. You can find more details in the original repo. + Implementation of this CoreferenceProcessor has been based on + huggingface NeuralCoref. You can find more details in the original repo. Note that the NeuralCoref package from PyPI uses a dated spaCy - version (2.1), which can cause segmentation fault with the spaCy we use (2.3). - Please install NeuralCoref by building from source. + version (2.1), which can cause segmentation fault with the spaCy + we use (2.3). Please install NeuralCoref by building from source. Referred repository link: https://github.com/huggingface/neuralcoref @@ -59,7 +59,10 @@ def set_up(self, configs: Config): "haven't called the initialization function." 
) - model = configs.model if configs.model != "use_default_model" else True + if configs.model != "use_default_model": + model = configs.model + else: + model = True cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( self.spacy_nlp, model=model, cfg_inference=cfg_inference @@ -104,11 +107,14 @@ def load_module(string): mention.start_char + entry_specified.begin, mention.end_char + entry_specified.begin, ) + + # TODO: remove assertion? assert ( - mention.text == mention_text, # TODO: remove assertion? + mention.text == mention_text, f"The processor extracted mention {mention.text}" - f" which is different from the original mention {mention_text}." - f"The offeset calculation is wrong.", + f" which is different from the original mention" + f" {mention_text}. The offeset calculation" + f" is wrong.", ) mentions.append(mention) @@ -131,18 +137,20 @@ def default_configs(cls): - `mention_type`: The type of members in `CoreferenceGroup`. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. - - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing - steps for NeuralCoref. + - `lang`: The SpaCy pipeline to be used. The pipeline does the + preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`. - - `model`: the neural net model to be used by NeuralCoref. If set to - `"use_default_model"`, a pre-trained neural net will be downloaded and cached. - If set to your customized model, the model needs to be a tuple containing a - `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in + - `model`: the neural net model to be used by NeuralCoref. If set + to `"use_default_model"`, a pre-trained neural net will be + downloaded and cached. + If set to your customized model, the model needs to be a tuple + containing a `single_model` and a `pairs_model`. + See `NeuralCoref.Model` method in https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx for reference of how the default model is defined. Default: `"use_default_model"`. - - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See - `get_default_cfg_inference` for default values, and see + - `cfg_inference`: A dict containing the inference configs of + NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters for the meaing of these parameters. @@ -162,31 +170,37 @@ def get_default_cfg_inference(cls): This defines the default inference config of NeuralCoref. Following are the keys for this dictionary: - - `greedyness` (`float`): A number between 0 and 1 determining how greedy - the model is about making coreference decisions - (more greedy means more coreference links). - Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when considering possible - antecedents of the current mention. Decreasing the value will cause - the system to run faster but less accurately. - Default: `50`. - - `max_dist_match` (`int`): The system will consider linking the current mention - to a preceding one further than max_dist away if they share a noun or - proper noun. In this case, it looks max_dist_match away instead. - Default: `500`. - - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the - following list: ["i", "me", "my", "you", "your"]. - Default `True`. - - `store_scores` (`bool`): Should the system store the scores for the coreferences - in annotations. 
- Default: `True` - - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use - to replace the embeddings of rare words (keys) by an average of the embeddings - of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the embeddings for the more - common woman and girl instead of the embedding of Angela. - This currently only works for single words (not for words groups). - Default: `None`. + - `greedyness` (`float`): A number between 0 and 1 determining + how greedy the model is about making coreference decisions + (more greedy means more coreference links). + Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when + considering possible antecedents of the current mention. + Decreasing the value will cause the system to run faster + but less accurately. + Default: `50`. + - `max_dist_match` (`int`): The system will consider linking + the current mention + to a preceding one further than max_dist away if they share + a noun or proper noun. In this case, it looks max_dist_match + away instead. + Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences + for pronouns in the following list: ["i", "me", "my", "you", "your"]. + Default `True`. + - `store_scores` (`bool`): Should the system store the scores + for the coreferences in annotations. + Default: `True` + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary + that you can use + to replace the embeddings of rare words (keys) by an average + of the embeddings of a list of common words (values). + Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the + embeddings for the more common woman and girl instead of the + embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. Returns: A dictionary with the default inference config of NeuralCoref. """ From 1d006fdb4e2e9ec2d63f71ae3332733f84b3d287 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:45:55 +0800 Subject: [PATCH 43/63] remove store_scores --- fortex/health/processors/coreference_processor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index d591fbbe..bc0bbe33 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -188,9 +188,6 @@ def get_default_cfg_inference(cls): - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the following list: ["i", "me", "my", "you", "your"]. Default `True`. - - `store_scores` (`bool`): Should the system store the scores - for the coreferences in annotations. 
- Default: `True` - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use to replace the embeddings of rare words (keys) by an average @@ -209,7 +206,6 @@ def get_default_cfg_inference(cls): "max_dist": 50, "max_dist_match": 500, "blacklist": True, - "store_scores": True, "conv_dict": None, } From dcec89faaeb95083f5f542e9d9ac8a8d89b45767 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:51:35 +0800 Subject: [PATCH 44/63] fix assertion --- fortex/health/processors/coreference_processor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index bc0bbe33..438a476f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -110,12 +110,10 @@ def load_module(string): # TODO: remove assertion? assert ( - mention.text == mention_text, - f"The processor extracted mention {mention.text}" - f" which is different from the original mention" - f" {mention_text}. The offeset calculation" - f" is wrong.", - ) + mention.text == mention_text + ), f"The processor extracted mention {mention.text}" + f" which is different from the original mention" + f" {mention_text}. The offeset calculation is wrong." mentions.append(mention) group = CoreferenceGroup(input_pack) From 465341d6c762c4119836fecd3dab301cf471163e Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:07:46 +0800 Subject: [PATCH 45/63] remove store_scores in test --- .../processors/coreference_processor_test.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 71a892c3..5e250b9d 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -32,29 +32,6 @@ @ddt class TestCoreferenceProcessor(unittest.TestCase): - # def setUp(self): - # self.pl = Pipeline[DataPack](enforce_consistency=True) - # self.pl.set_reader(StringReader()) - # self.pl.add( - # CoreferenceProcessor(), - # { - # "entry_type": "ft.onto.base_ontology.Document", - # "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - # "lang": "en_core_web_sm", - # "model": "use_default_model", - # "cfg_inference": { - # "greedyness": 0.5, - # "max_dist": 50, - # "max_dist_match": 500, - # "blacklist": True, - # "store_scores": True, - # "conv_dict": None, - # }, - # }, - # ) - - # self.pl.initialize() - @data( ( "ADDENDUM:\n" @@ -104,7 +81,6 @@ def test_medical_notes(self, input_data, check_list, entry_type): "max_dist": 50, "max_dist_match": 500, "blacklist": True, - "store_scores": True, "conv_dict": None, }, }, From 3ee2f7bbba7be938ec1b6320bc8718f348cc408c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:08:45 +0800 Subject: [PATCH 46/63] rename document to entry --- .../processors/coreference_processor_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 5e250b9d..f72e18cf 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -23,7 +23,7 @@ from forte.data.readers import StringReader from 
forte.pipeline import Pipeline -from ft.onto.base_ontology import Document, CoreferenceGroup +from ft.onto.base_ontology import CoreferenceGroup from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -98,12 +98,12 @@ def load_module(string): for pack in self.pl.process_dataset(input_data): output_list = [] - for document in pack.get(entry_type): - for group in document.get(CoreferenceGroup): + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) mention_texts = [member.text for member in members] output_list.append(mention_texts) - self.assertEqual(output_list, check_list, f"input: {document.text}") + self.assertEqual(output_list, check_list, f"input: {entry.text}") From 1d44ff738dcccf11e437bec422bf533ed04c879f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:36:47 +0800 Subject: [PATCH 47/63] fix cfg_inference kwargs --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 438a476f..b1b11f81 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -65,7 +65,7 @@ def set_up(self, configs: Config): model = True cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( - self.spacy_nlp, model=model, cfg_inference=cfg_inference + self.spacy_nlp, model=model, **cfg_inference ) def initialize(self, resources: Resources, configs: Config): From 0ad95d8339ac3db82cef669ce4230a7deee03a4d Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:37:49 +0800 Subject: [PATCH 48/63] add conv_dict test --- .../processors/coreference_processor_test.py | 61 ++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index f72e18cf..e8639c5e 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -56,6 +56,7 @@ class TestCoreferenceProcessor(unittest.TestCase): "My sister loves her dog. My aunt also loves him.", [["My sister", "her"], ["My aunt", "him"]], "ft.onto.base_ontology.Sentence", + # Sentence-level coref resolution. ), ( "My sister loves her dog. My aunt also loves him.", @@ -65,7 +66,7 @@ class TestCoreferenceProcessor(unittest.TestCase): ), ) @unpack - def test_medical_notes(self, input_data, check_list, entry_type): + def test_inputs_and_entry_types(self, input_data, check_list, entry_type): self.pl = Pipeline[DataPack](enforce_consistency=True) self.pl.set_reader(StringReader()) self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) @@ -107,3 +108,61 @@ def load_module(string): output_list.append(mention_texts) self.assertEqual(output_list, check_list, f"input: {entry.text}") + + @data( + ( + "Deepika has a dog. She loves him. The movie star has always been fond of animals", + [["Deepika", "She", "him", "The movie star"]], + {}, + ), + ( + "Deepika has a dog. She loves him. 
The movie star has always been fond of animals", + [["Deepika", "She", "The movie star"], ["a dog", "him"]], + {"Deepika": ["woman", "actress"]}, + ), + ) + @unpack + def test_conv_dict(self, input_data, check_list, conv_dict): + entry_type = "ft.onto.base_ontology.Document" + + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": conv_dict, + }, + }, + ) + + self.pl.initialize() + + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + entry_type = load_module(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}") From 87b099686da89009160fb32120500942f108281f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:42:53 +0800 Subject: [PATCH 49/63] black reformat --- fortex/health/processors/coreference_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index b1b11f81..4afc679e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -64,9 +64,7 @@ def set_up(self, configs: Config): else: model = True cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe( - self.spacy_nlp, model=model, **cfg_inference - ) + neuralcoref.add_to_pipe(self.spacy_nlp, model=model, **cfg_inference) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) From 148e91891cfb36e433515ff7928d8485b695523e Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:53:41 +0800 Subject: [PATCH 50/63] fix pylint --- fortex/health/processors/coreference_processor.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4afc679e..507e9614 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -92,9 +92,7 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - if not result._.has_coref: - continue - else: + if result._.has_coref: for cluster in result._.coref_clusters: mentions = [] @@ -107,11 +105,11 @@ def load_module(string): ) # TODO: remove assertion? - assert ( - mention.text == mention_text - ), f"The processor extracted mention {mention.text}" - f" which is different from the original mention" - f" {mention_text}. The offeset calculation is wrong." 
+ assert mention.text == mention_text, ( + f"The processor extracted mention {mention.text}" + " which is different from the original mention" + f" {mention_text}. The offset calculation is wrong." + ) mentions.append(mention) group = CoreferenceGroup(input_pack) From 2fefe18cc1615e38297ea9fef24e524912717956 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 20:51:19 +0800 Subject: [PATCH 51/63] update comment --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 507e9614..529e0b55 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -141,7 +141,7 @@ def default_configs(cls): containing a `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx - for reference of how the default model is defined. + for reference of how to define such a model. Default: `"use_default_model"`. - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters for the meaning of these parameters. From 662b32428de0d30bb5506adc48287e6dcd22f8b9 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 20:59:35 +0800 Subject: [PATCH 52/63] update comment --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 529e0b55..dd38989c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -129,8 +129,9 @@ def default_configs(cls): `"ft.onto.base_ontology.Sentence"`. Default: `"ft.onto.base_ontology.Document"`. - `mention_type`: The type of members in `CoreferenceGroup`. + It can be set to `"ft.onto.base_ontology.EntityMention"` or + its subclasses. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - It can also be set to `"ft.onto.base_ontology.EntityMention"`. - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`.
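Usage sketch — a minimal pipeline assembled from the unit tests in this series, not part of any patch. It exercises the `default_configs` keys documented in PATCH 51/52 above (`entry_type`, `mention_type`, `lang`, `cfg_inference`) and assumes that forte, forte.spacy, forte.health and NeuralCoref are installed as the setup.py patches describe, and that the `en_core_web_sm` spaCy model is available locally.

    from forte.data.data_pack import DataPack
    from forte.data.readers import StringReader
    from forte.pipeline import Pipeline
    from ft.onto.base_ontology import CoreferenceGroup, Document

    from fortex.spacy import SpacyProcessor
    from fortex.health.processors.coreference_processor import CoreferenceProcessor

    # Build the pipeline the same way the tests do: SpacyProcessor supplies the
    # basic annotations for the pack, while CoreferenceProcessor loads its own
    # spaCy model (configs.lang) and adds NeuralCoref to that model's pipe.
    pl = Pipeline[DataPack](enforce_consistency=True)
    pl.set_reader(StringReader())
    pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
    pl.add(
        CoreferenceProcessor(),
        {
            "entry_type": "ft.onto.base_ontology.Document",
            "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention",
            "lang": "en_core_web_sm",
            "cfg_inference": {
                "greedyness": 0.5,
                "max_dist": 50,
                "max_dist_match": 500,
                "blacklist": True,
                "conv_dict": None,
            },
        },
    )
    pl.initialize()

    # Each CoreferenceGroup collects the mentions NeuralCoref put in one cluster;
    # with the config above the members are MedicalEntityMention annotations.
    for pack in pl.process_dataset("My sister has a dog. She loves him."):
        for doc in pack.get(Document):
            for group in doc.get(CoreferenceGroup):
                members = sorted(group.get_members(), key=lambda m: m.begin)
                print([m.text for m in members])
                # expected per the tests: ['My sister', 'She'] and ['a dog', 'him']

As the conv_dict test in PATCH 48 above illustrates, passing a non-empty `conv_dict` (e.g. `{"Deepika": ["woman", "actress"]}`) changes how rare names are resolved: that test expects the clusters to split into [["Deepika", "She", "The movie star"], ["a dog", "him"]] instead of the single merged cluster produced with an empty `conv_dict`.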
From 6bcc58c195b9689480713383a2166b7fac4ca9f2 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 15:01:55 +0400 Subject: [PATCH 53/63] try one --- setup.py | 62 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index ed849bfa..9040a9c1 100644 --- a/setup.py +++ b/setup.py @@ -20,29 +20,61 @@ include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], + setup_requires=[ + "forte.spacy", # TODO: version + "cython>=0.25", + "pytest", + ], install_requires=[ + "enum34==1.1.10;python_version<'3.4'", + # "sortedcontainers>=2.1.0",` + # "numpy>=1.16.6", + # "jsonpickle>=1.4", + # "pyyaml>=5.4", + # "smart-open>=1.8.4", + # "typed_astunparse>=2.1.4", + # "funcsigs>=1.0.2", + # "typed_ast>=1.5.0", + # "jsonschema>=3.0.2", + # 'typing>=3.7.4;python_version<"3.5"', + # "typing-inspect>=0.6.0", + # 'dataclasses~=0.7;python_version<"3.7"', + # 'importlib-resources>=5.1.4;python`_version<"3.7"', + "asyml-utilities", + + "forte~=0.2.0", - "sortedcontainers==2.1.0", - "numpy>=1.16.6", - "jsonpickle==1.4", - "pyyaml==5.4", - "smart-open>=1.8.4", - "typed_astunparse==2.1.4", - "funcsigs==1.0.2", + # "sortedcontainers==2.1.0", + # "numpy>=1.16.6", + # "jsonpickle==1.4", + # "pyyaml==5.4", + # "smart-open>=1.8.4", + # "typed_astunparse==2.1.4", + # "funcsigs==1.0.2", "mypy_extensions==0.4.3", - "typed_ast>=1.4.3", - "jsonschema==3.0.2", + # "typed_ast>=1.4.3", + # "jsonschema==3.0.2", "texar-pytorch", - 'typing>=3.7.4;python_version<"3.5"', - "typing-inspect>=0.6.0", - 'dataclasses~=0.7;python_version<"3.7"', - 'importlib-resources==5.1.4;python_version<"3.7"', - 'dataclasses~=0.7;python_version<"3.7"', + # 'typing>=3.7.4;python_version<"3.5"', + # "typing-inspect>=0.6.0", + # 'dataclasses~=0.7;python_version<"3.7"', + # 'importlib-resources==5.1.4;python_version<"3.7"',S + # 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy + # # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy + "forte.spacy", # TODO: version "cython>=0.25", "pytest", + + "ddt", + "testfixtures", + "transformers==4.2.2", + "protobuf==3.19.4", + # It is annoying that if we install neuralcoref and spacy at the same + # time, neuralcoref will throw "Cython failed" during building. + # Therefore, we must install neuralcoref after spacy is installed. + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ From ae31363978ac1945c435a0ca4cc194777c781d51 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:00:54 +0400 Subject: [PATCH 54/63] use subprocess to install cython --- setup.py | 76 ++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/setup.py b/setup.py index 9040a9c1..6d60718d 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ import sys from pathlib import Path import setuptools +import subprocess +import os long_description = (Path(__file__).parent / "README.md").read_text() @@ -8,6 +10,25 @@ if sys.version_info < (3, 6): sys.exit("Python>=3.6 is required by forte-medical.") +# If we install neuralcoref and spacy at the same +# time, neuralcoref will throw "Cython failed" during building, +# which is because neuralcoref does not set cython as dependency +# properly. 
+# Therefore, we must install neuralcoref after cython and spacy +# are installed. +p = subprocess.call( + [ + sys.executable, + "-m", + "pip", + "install", + "cython>=0.25", + ], + env=os.environ, +) +if p != 0: + raise RuntimeError("Installing NeuralCoref dependencies failed.") + setuptools.setup( name="forte.health", version="0.1.0", @@ -20,61 +41,14 @@ include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], - setup_requires=[ - "forte.spacy", # TODO: version - "cython>=0.25", - "pytest", - ], install_requires=[ - "enum34==1.1.10;python_version<'3.4'", - # "sortedcontainers>=2.1.0",` - # "numpy>=1.16.6", - # "jsonpickle>=1.4", - # "pyyaml>=5.4", - # "smart-open>=1.8.4", - # "typed_astunparse>=2.1.4", - # "funcsigs>=1.0.2", - # "typed_ast>=1.5.0", - # "jsonschema>=3.0.2", - # 'typing>=3.7.4;python_version<"3.5"', - # "typing-inspect>=0.6.0", - # 'dataclasses~=0.7;python_version<"3.7"', - # 'importlib-resources>=5.1.4;python`_version<"3.7"', - "asyml-utilities", - - "forte~=0.2.0", - # "sortedcontainers==2.1.0", - # "numpy>=1.16.6", - # "jsonpickle==1.4", - # "pyyaml==5.4", - # "smart-open>=1.8.4", - # "typed_astunparse==2.1.4", - # "funcsigs==1.0.2", "mypy_extensions==0.4.3", - # "typed_ast>=1.4.3", - # "jsonschema==3.0.2", "texar-pytorch", - # 'typing>=3.7.4;python_version<"3.5"', - # "typing-inspect>=0.6.0", - # 'dataclasses~=0.7;python_version<"3.7"', - # 'importlib-resources==5.1.4;python_version<"3.7"',S - # 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - # # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy - "forte.spacy", # TODO: version + "forte.spacy", # TODO: version "cython>=0.25", - "pytest", - - "ddt", - "testfixtures", - "transformers==4.2.2", - "protobuf==3.19.4", - # It is annoying that if we install neuralcoref and spacy at the same - # time, neuralcoref will throw "Cython failed" during building. - # Therefore, we must install neuralcoref after spacy is installed. - "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ @@ -82,10 +56,8 @@ "testfixtures", "transformers==4.2.2", "protobuf==3.19.4", - # It is annoying that if we install neuralcoref and spacy at the same - # time, neuralcoref will throw "Cython failed" during building. - # Therefore, we must install neuralcoref after spacy is installed. - # "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", + "pytest", + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], }, entry_points={ From 74ffad4719123adde254c3809bd6e39571181867 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:04:37 +0400 Subject: [PATCH 55/63] use subprocess to install cython and spacy --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6d60718d..a43a1b6b 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ # If we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building, -# which is because neuralcoref does not set cython as dependency +# which is because neuralcoref does not set them as dependencies # properly. # Therefore, we must install neuralcoref after cython and spacy # are installed. 
@@ -22,6 +22,7 @@ "-m", "pip", "install", + "forte.spacy", # TODO: version "cython>=0.25", ], env=os.environ, From f3dc9e3cf0d455f62d8243fc8a70ae26fa50c50d Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:15:31 +0400 Subject: [PATCH 56/63] add extras_require for icd and coref --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index a43a1b6b..bc63e247 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,12 @@ "pytest", "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], + "icd_coding": [ + "transformers", + ], + "coreference": [ + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", + ], }, entry_points={ "console_scripts": [ From 2581edbd2cd362bb22e308f1449544bb79601818 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:27:10 +0400 Subject: [PATCH 57/63] remove spacy and neuralcoref stage from main.yml --- .github/workflows/main.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 89bd3a6e..1d4181a7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -83,16 +83,6 @@ jobs: run: | pip install --use-feature=2020-resolver --progress-bar off .[test] - - name: Install Forte-wrappers-spacy - run: | - git clone https://github.com/asyml/forte-wrappers.git - cd forte-wrappers - pip install src/spacy - - - name: Install NeuralCoref - run: | - pip install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref - - name: Test with pytest and run coverage run: | coverage run -m pytest tests/ From e09dfe72875e8466dfef455011a4c0b4129a91a3 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:41:52 +0400 Subject: [PATCH 58/63] replace load_module with get_class --- fortex/health/processors/coreference_processor.py | 10 +++------- .../processors/coreference_processor_test.py | 15 +++------------ 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index dd38989c..78be9aba 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -23,6 +23,7 @@ from forte.common.configuration import Config from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor +from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup @@ -78,16 +79,11 @@ def _process(self, input_pack: DataPack): Then we translate the output to `CoreferenceGroup`. 
""" - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - # Default: Document - entry_type = load_module(self.configs.entry_type) + entry_type = get_class(self.configs.entry_type) # Default: MedicalEntityMention - mention_type = load_module(self.configs.mention_type) + mention_type = get_class(self.configs.mention_type) for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index e8639c5e..31d885e1 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -22,6 +22,7 @@ from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline +from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup from fortex.health.processors.coreference_processor import ( @@ -89,12 +90,7 @@ def test_inputs_and_entry_types(self, input_data, check_list, entry_type): self.pl.initialize() - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - - entry_type = load_module(entry_type) + entry_type = get_class(entry_type) for pack in self.pl.process_dataset(input_data): output_list = [] @@ -147,12 +143,7 @@ def test_conv_dict(self, input_data, check_list, conv_dict): self.pl.initialize() - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - - entry_type = load_module(entry_type) + entry_type = get_class(entry_type) for pack in self.pl.process_dataset(input_data): output_list = [] From c925f5bf1285632636759bac2e4decac3d2ca35c Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:44:16 +0400 Subject: [PATCH 59/63] remove 'model' argument --- .../health/processors/coreference_processor.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 78be9aba..5a046676 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -60,12 +60,8 @@ def set_up(self, configs: Config): "haven't called the initialization function." ) - if configs.model != "use_default_model": - model = configs.model - else: - model = True cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe(self.spacy_nlp, model=model, **cfg_inference) + neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -131,15 +127,6 @@ def default_configs(cls): - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`. - - `model`: the neural net model to be used by NeuralCoref. If set - to `"use_default_model"`, a pre-trained neural net will be - downloaded and cached. - If set to your customized model, the model needs to be a tuple - containing a `single_model` and a `pairs_model`. 
- See `NeuralCoref.Model` method in - https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx - for reference of how to define such a model. - Default: `"use_default_model"`. - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters @@ -151,7 +138,6 @@ def default_configs(cls): "entry_type": "ft.onto.base_ontology.Document", "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": cls.get_default_cfg_inference(), } From f6c1a8b6bf5899b3d045a92c006b8fa6c3dd3e9f Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 17:38:32 +0400 Subject: [PATCH 60/63] fix rebundunt import and args --- fortex/health/processors/coreference_processor.py | 1 - tests/forte_medical/processors/coreference_processor_test.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 5a046676..89c0dc51 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -15,7 +15,6 @@ Coreference Processor """ from typing import Dict, Set -import importlib import neuralcoref diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 31d885e1..62d38c38 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -15,7 +15,6 @@ Unit tests for CoreferenceProcessor """ -import importlib import unittest from ddt import data, ddt, unpack @@ -77,7 +76,6 @@ def test_inputs_and_entry_types(self, input_data, check_list, entry_type): "entry_type": entry_type, "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": { "greedyness": 0.5, "max_dist": 50, @@ -130,7 +128,6 @@ def test_conv_dict(self, input_data, check_list, conv_dict): "entry_type": entry_type, "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": { "greedyness": 0.5, "max_dist": 50, From 7b31bebc9eb69750b9fdb25207bf54e2b8dabdab Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:07:43 +0400 Subject: [PATCH 61/63] fix merge conflict --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7c064ccb..d654ac34 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ ), namespace_packages=["fortex"], install_requires=[ + "forte~=0.2.0", "forte.spacy", # TODO: version "cython>=0.25", ], From 5ef774a97f746cd20492404ffc212a276a725f87 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:16:43 +0400 Subject: [PATCH 62/63] fix merge conflict --- .../processors/coreference_processor_test.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 tests/fortex/health/processors/coreference_processor_test.py diff --git a/tests/fortex/health/processors/coreference_processor_test.py b/tests/fortex/health/processors/coreference_processor_test.py new file mode 100644 index 00000000..89c0dc51 --- /dev/null +++ b/tests/fortex/health/processors/coreference_processor_test.py @@ -0,0 +1,209 @@ +# 
Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Coreference Processor +""" +from typing import Dict, Set + +import neuralcoref + +from forte.common import Resources, ProcessExecutionException +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor +from forte.utils import get_class + +from ft.onto.base_ontology import CoreferenceGroup + +from fortex.spacy.spacy_processors import load_lang_model + +__all__ = [ + "CoreferenceProcessor", +] + + +class CoreferenceProcessor(PackProcessor): + r""" + Implementation of this CoreferenceProcessor has been based on + huggingface NeuralCoref. You can find more details in the original repo. + + Note that the NeuralCoref package from PyPI uses a dated spaCy + version (2.1), which can cause segmentation fault with the spaCy + we use (2.3). Please install NeuralCoref by building from source. + + Referred repository link: + https://github.com/huggingface/neuralcoref + """ + + def __init__(self): + super().__init__() + self.spacy_nlp = None + + def set_up(self, configs: Config): + self.spacy_nlp = load_lang_model(configs.lang) + + if self.spacy_nlp is None: + raise ProcessExecutionException( + "The SpaCy pipeline is not initialized, maybe you " + "haven't called the initialization function." + ) + + cfg_inference = configs.cfg_inference + neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.set_up(configs) + + def _process(self, input_pack: DataPack): + r""" + Coreference resolution is done by + a spaCy pipeline with `NeuralCoref` added. + + Then we translate the output to `CoreferenceGroup`. + """ + + # Default: Document + entry_type = get_class(self.configs.entry_type) + + # Default: MedicalEntityMention + mention_type = get_class(self.configs.mention_type) + + for entry_specified in input_pack.get(entry_type=entry_type): + result = self.spacy_nlp(entry_specified.text) + + if result._.has_coref: + for cluster in result._.coref_clusters: + + mentions = [] + for mention in cluster.mentions: + mention_text = mention.text + mention = mention_type( + input_pack, + mention.start_char + entry_specified.begin, + mention.end_char + entry_specified.begin, + ) + + # TODO: remove assertion? + assert mention.text == mention_text, ( + f"The processor extracted mention {mention.text}" + " which is different from the original mention" + f" {mention_text}. The offeset calculation is wrong." + ) + mentions.append(mention) + + group = CoreferenceGroup(input_pack) + group.add_members(mentions) + + @classmethod + def default_configs(cls): + r""" + This defines a basic config structure for `CoreferenceProcessor`. + + Following are the keys for this dictionary: + - `entry_type`: Input entry type. You can change the context of + coreference resolution by setting this parameter. 
For example, + if you want to do coreference resolution within documents, set + it to `"ft.onto.base_ontology.Document"`. If you want to do + coreference resolution within sentences, set it to + `"ft.onto.base_ontology.Sentence"`. + Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: The type of members in `CoreferenceGroup`. + It can be set to `"ft.onto.base_ontology.EntityMention"` or + its subclasses. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + - `lang`: The SpaCy pipeline to be used. The pipeline does the + preprocessing steps for NeuralCoref. + Default: `"en_core_web_sm"`. + - `cfg_inference`: A dict containing the inference configs of + NeuralCoref. See `get_default_cfg_inference` for default values, and see + https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters + for the meaing of these parameters. + + Returns: A dictionary with the default config for this processor. + """ + return { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": cls.get_default_cfg_inference(), + } + + @classmethod + def get_default_cfg_inference(cls): + """ + This defines the default inference config of NeuralCoref. + + Following are the keys for this dictionary: + - `greedyness` (`float`): A number between 0 and 1 determining + how greedy the model is about making coreference decisions + (more greedy means more coreference links). + Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when + considering possible antecedents of the current mention. + Decreasing the value will cause the system to run faster + but less accurately. + Default: `50`. + - `max_dist_match` (`int`): The system will consider linking + the current mention + to a preceding one further than max_dist away if they share + a noun or proper noun. In this case, it looks max_dist_match + away instead. + Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences + for pronouns in the following list: ["i", "me", "my", "you", "your"]. + Default `True`. + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary + that you can use + to replace the embeddings of rare words (keys) by an average + of the embeddings of a list of common words (values). + Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the + embeddings for the more common woman and girl instead of the + embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. + + Returns: A dictionary with the default inference config of NeuralCoref. + """ + return { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": None, + } + + def expected_types_and_attributes(self): + r""" + Method to add user specified expected type which would be checked + before running the processor if the pipeline is initialized with + `enforce_consistency=True` or + :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for + the pipeline. + """ + # return {self.configs.entry_type: {"text"}} # TODO: fix this + return {self.configs.entry_type: set()} + + def record(self, record_meta: Dict[str, Set[str]]): + r""" + Method to add output type record of `CoreferenceProcessor` which + is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute + `members` to :attr:`forte.data.data_pack.Meta.record`. 
+ + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} From ffe88f64d2401e1a747caa1843d4e900c9066154 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:48:52 +0400 Subject: [PATCH 63/63] fix merge conflict: restore coref test --- .../processors/coreference_processor_test.py | 323 ++++++++---------- 1 file changed, 135 insertions(+), 188 deletions(-) diff --git a/tests/fortex/health/processors/coreference_processor_test.py b/tests/fortex/health/processors/coreference_processor_test.py index 89c0dc51..62d38c38 100644 --- a/tests/fortex/health/processors/coreference_processor_test.py +++ b/tests/fortex/health/processors/coreference_processor_test.py @@ -12,198 +12,145 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Coreference Processor +Unit tests for CoreferenceProcessor """ -from typing import Dict, Set -import neuralcoref +import unittest +from ddt import data, ddt, unpack -from forte.common import Resources, ProcessExecutionException -from forte.common.configuration import Config from forte.data.data_pack import DataPack -from forte.processors.base import PackProcessor +from forte.data.readers import StringReader +from forte.pipeline import Pipeline from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup - -from fortex.spacy.spacy_processors import load_lang_model - -__all__ = [ - "CoreferenceProcessor", -] - - -class CoreferenceProcessor(PackProcessor): - r""" - Implementation of this CoreferenceProcessor has been based on - huggingface NeuralCoref. You can find more details in the original repo. - - Note that the NeuralCoref package from PyPI uses a dated spaCy - version (2.1), which can cause segmentation fault with the spaCy - we use (2.3). Please install NeuralCoref by building from source. - - Referred repository link: - https://github.com/huggingface/neuralcoref - """ - - def __init__(self): - super().__init__() - self.spacy_nlp = None - - def set_up(self, configs: Config): - self.spacy_nlp = load_lang_model(configs.lang) - - if self.spacy_nlp is None: - raise ProcessExecutionException( - "The SpaCy pipeline is not initialized, maybe you " - "haven't called the initialization function." - ) - - cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) - - def initialize(self, resources: Resources, configs: Config): - super().initialize(resources, configs) - self.set_up(configs) - - def _process(self, input_pack: DataPack): - r""" - Coreference resolution is done by - a spaCy pipeline with `NeuralCoref` added. - - Then we translate the output to `CoreferenceGroup`. - """ - - # Default: Document - entry_type = get_class(self.configs.entry_type) - - # Default: MedicalEntityMention - mention_type = get_class(self.configs.mention_type) - - for entry_specified in input_pack.get(entry_type=entry_type): - result = self.spacy_nlp(entry_specified.text) - - if result._.has_coref: - for cluster in result._.coref_clusters: - - mentions = [] - for mention in cluster.mentions: - mention_text = mention.text - mention = mention_type( - input_pack, - mention.start_char + entry_specified.begin, - mention.end_char + entry_specified.begin, - ) - - # TODO: remove assertion? 
- assert mention.text == mention_text, ( - f"The processor extracted mention {mention.text}" - " which is different from the original mention" - f" {mention_text}. The offeset calculation is wrong." - ) - mentions.append(mention) - - group = CoreferenceGroup(input_pack) - group.add_members(mentions) - - @classmethod - def default_configs(cls): - r""" - This defines a basic config structure for `CoreferenceProcessor`. - - Following are the keys for this dictionary: - - `entry_type`: Input entry type. You can change the context of - coreference resolution by setting this parameter. For example, - if you want to do coreference resolution within documents, set - it to `"ft.onto.base_ontology.Document"`. If you want to do - coreference resolution within sentences, set it to - `"ft.onto.base_ontology.Sentence"`. - Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: The type of members in `CoreferenceGroup`. - It can be set to `"ft.onto.base_ontology.EntityMention"` or - its subclasses. - Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - - `lang`: The SpaCy pipeline to be used. The pipeline does the - preprocessing steps for NeuralCoref. - Default: `"en_core_web_sm"`. - - `cfg_inference`: A dict containing the inference configs of - NeuralCoref. See `get_default_cfg_inference` for default values, and see - https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters - for the meaing of these parameters. - - Returns: A dictionary with the default config for this processor. - """ - return { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "cfg_inference": cls.get_default_cfg_inference(), - } - - @classmethod - def get_default_cfg_inference(cls): - """ - This defines the default inference config of NeuralCoref. - - Following are the keys for this dictionary: - - `greedyness` (`float`): A number between 0 and 1 determining - how greedy the model is about making coreference decisions - (more greedy means more coreference links). - Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when - considering possible antecedents of the current mention. - Decreasing the value will cause the system to run faster - but less accurately. - Default: `50`. - - `max_dist_match` (`int`): The system will consider linking - the current mention - to a preceding one further than max_dist away if they share - a noun or proper noun. In this case, it looks max_dist_match - away instead. - Default: `500`. - - `blacklist` (`bool`): Should the system resolve coreferences - for pronouns in the following list: ["i", "me", "my", "you", "your"]. - Default `True`. - - `conv_dict` (`dict(str, list(str))`): A conversion dictionary - that you can use - to replace the embeddings of rare words (keys) by an average - of the embeddings of a list of common words (values). - Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the - embeddings for the more common woman and girl instead of the - embedding of Angela. - This currently only works for single words (not for words groups). - Default: `None`. - - Returns: A dictionary with the default inference config of NeuralCoref. 
- """ - return { - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "conv_dict": None, - } - - def expected_types_and_attributes(self): - r""" - Method to add user specified expected type which would be checked - before running the processor if the pipeline is initialized with - `enforce_consistency=True` or - :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for - the pipeline. - """ - # return {self.configs.entry_type: {"text"}} # TODO: fix this - return {self.configs.entry_type: set()} - - def record(self, record_meta: Dict[str, Set[str]]): - r""" - Method to add output type record of `CoreferenceProcessor` which - is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute - `members` to :attr:`forte.data.data_pack.Meta.record`. - - Args: - record_meta: the field in the datapack for type record that need to - fill in for consistency checking. - """ - record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} +from fortex.health.processors.coreference_processor import ( + CoreferenceProcessor, +) +from fortex.spacy import SpacyProcessor + + +@ddt +class TestCoreferenceProcessor(unittest.TestCase): + @data( + ( + "ADDENDUM:\n" + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" + "These can be followed by repeat imaging as an outpatient.", + [["HEAD CT", "Head CT", "Abdominal CT"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister has a dog. She loves him.", + [["My sister", "She"], ["a dog", "him"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["My aunt", "him"]], + "ft.onto.base_ontology.Sentence", + # Sentence-level coref resolution. + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["her dog", "him"]], + "ft.onto.base_ontology.Document", + # Document-level coref is different from sentence-level. + ), + ) + @unpack + def test_inputs_and_entry_types(self, input_data, check_list, entry_type): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": None, + }, + }, + ) + + self.pl.initialize() + + entry_type = get_class(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}") + + @data( + ( + "Deepika has a dog. She loves him. 
The movie star has always been fond of animals", + [["Deepika", "She", "him", "The movie star"]], + {}, + ), + ( + "Deepika has a dog. She loves him. The movie star has always been fond of animals", + [["Deepika", "She", "The movie star"], ["a dog", "him"]], + {"Deepika": ["woman", "actress"]}, + ), + ) + @unpack + def test_conv_dict(self, input_data, check_list, conv_dict): + entry_type = "ft.onto.base_ontology.Document" + + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": conv_dict, + }, + }, + ) + + self.pl.initialize() + + entry_type = get_class(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}")
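(Usage sketch distilled from the tests above; a guess at typical wiring rather than part of the patch series. It assumes forte, forte.spacy, and the "coreference" extra with NeuralCoref are installed as arranged in setup.py, and it reuses only classes and calls that appear in the test file.)

# Minimal pipeline mirroring the tests: read a string, run spaCy preprocessing,
# then CoreferenceProcessor with its default (document-level) configs.
from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from ft.onto.base_ontology import CoreferenceGroup
from fortex.spacy import SpacyProcessor
from fortex.health.processors.coreference_processor import CoreferenceProcessor

pl = Pipeline[DataPack]()
pl.set_reader(StringReader())
pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
pl.add(CoreferenceProcessor())  # defaults: Document context, MedicalEntityMention members
pl.initialize()

for pack in pl.process_dataset("My sister has a dog. She loves him."):
    # Each CoreferenceGroup holds the mentions that corefer, e.g.
    # ["My sister", "She"] and ["a dog", "him"] for the sentence above.
    for group in pack.get(CoreferenceGroup):
        print([member.text for member in group.get_members()])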