From 880329f52d80042af4054ad44045958eecd2bee7 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Wed, 15 Jun 2022 09:39:10 +0800 Subject: [PATCH 01/63] add empty coref processor --- .../processors/coreference_processor.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 fortex/health/processors/coreference_processor.py diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py new file mode 100644 index 00000000..65cc1b81 --- /dev/null +++ b/fortex/health/processors/coreference_processor.py @@ -0,0 +1,90 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Coreference Processor +""" +import os +import re +from typing import Dict, List, Set + +from forte.common import Resources +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor + +# from ft.onto.base_ontology import TODO +# from ftx.medical.clinical_ontology import TODO + +__all__ = [ + "CoreferenceProcessor", +] + + +class CoreferenceProcessor(PackProcessor): + r""" + TODO: Add docstring + """ + + def __init__(self): + super().__init__() + # TODO + + def set_up(self, configs: Config): + pass + # TODO + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.set_up(configs) + + def _process(self, input_pack: DataPack): + r""" + TODO: Add docstring + """ + pass + # TODO + + @classmethod + def default_configs(cls): + r""" + TODO: Add docstring + """ + return { + # TODO + } + + def expected_types_and_attributes(self): + r""" + Method to add user specified expected type which would be checked + before running the processor if the pipeline is initialized with + `enforce_consistency=True` or + :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for + the pipeline. + """ + return { + # TODO + } + + def record(self, record_meta: Dict[str, Set[str]]): + r""" + Method to add output type record of `CoreferenceProcessor` which + is `"ftx.onto.clinical.TODO"` with attribute + `TODO` + to :attr:`forte.data.data_pack.Meta.record`. + + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + # TODO From d9b3f71cdc134440ca370b0d3ec4d936140f69c6 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Wed, 15 Jun 2022 17:13:06 +0800 Subject: [PATCH 02/63] add process method. 
TODO: correct span --- .../processors/coreference_processor.py | 67 ++++++++++++++++--- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 65cc1b81..76e1f579 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,14 +17,15 @@ import os import re from typing import Dict, List, Set +import importlib -from forte.common import Resources +from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -# from ft.onto.base_ontology import TODO -# from ftx.medical.clinical_ontology import TODO +from ft.onto.base_ontology import CoreferenceGroup, Token +from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle __all__ = [ "CoreferenceProcessor", @@ -39,10 +40,19 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() # TODO + self.coref = None # TODO: add type + self.spacy_nlp = None # TODO: find an elegant way to set this. def set_up(self, configs: Config): - pass - # TODO + import neuralcoref + self.spacy_nlp = self.resources.get('spacy_processor').nlp + if self.spacy_nlp is None: + raise ProcessExecutionException( + "The SpaCy pipeline is not initialized, maybe you " + "haven't called the initialization function." + ) + kwargs = {} # TODO + neuralcoref.add_to_pipe(self.spacy_nlp) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -52,8 +62,38 @@ def _process(self, input_pack: DataPack): r""" TODO: Add docstring """ - pass - # TODO + path_str, module_str = self.configs.entry_type.rsplit(".", 1) + # By default, path_str would be ft.onto.base_ontology and module_str would be Document # TODO: check + + mod = importlib.import_module(path_str) + entry = getattr(mod, module_str) + for entry_specified in input_pack.get(entry_type=entry): + result = self.spacy_nlp(entry_specified.text) + tokens = [(token.text, token.pos) for token in input_pack.get(Token, entry_specified)] + + article = MedicalArticle( + pack=input_pack, + begin=entry_specified.span.begin, + end=entry_specified.span.end, + ) + + if not result._.has_coref: + article.has_coref = False + article.coref_groups = [] + else: + article.has_coref = True + article.coref_groups = [] + for cluster in result._.coref_clusters: + + mentions = [] + for mention in cluster.mentions: + mention = MedicalEntityMention(input_pack, mention.start, mention.end) + mentions.append(mention) + + group = CoreferenceGroup(input_pack) + group.add_members(mentions) + + article.coref_groups.append(group) @classmethod def default_configs(cls): @@ -61,7 +101,8 @@ def default_configs(cls): TODO: Add docstring """ return { - # TODO + # TODO: remove unnecessaries + "entry_type": "ft.onto.base_ontology.Document", } def expected_types_and_attributes(self): @@ -79,12 +120,16 @@ def expected_types_and_attributes(self): def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which - is `"ftx.onto.clinical.TODO"` with attribute - `TODO` + is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute + `coref_clusters` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
- """ + """ # TODO: check docstring # TODO + record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { + "coref_groups", + "has_coref" + } From da363726f1935b724ffa98e59d9fed90c015bcfa Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Thu, 16 Jun 2022 14:48:31 +0800 Subject: [PATCH 03/63] fix output span --- .../processors/coreference_processor.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 76e1f579..cf4dc52e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -40,18 +40,19 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() # TODO - self.coref = None # TODO: add type - self.spacy_nlp = None # TODO: find an elegant way to set this. + self.coref = None # TODO: add type + self.spacy_nlp = None # TODO: find an elegant way to set this. def set_up(self, configs: Config): import neuralcoref - self.spacy_nlp = self.resources.get('spacy_processor').nlp + + self.spacy_nlp = self.resources.get("spacy_processor").nlp if self.spacy_nlp is None: raise ProcessExecutionException( "The SpaCy pipeline is not initialized, maybe you " "haven't called the initialization function." ) - kwargs = {} # TODO + kwargs = {} # TODO neuralcoref.add_to_pipe(self.spacy_nlp) def initialize(self, resources: Resources, configs: Config): @@ -69,7 +70,7 @@ def _process(self, input_pack: DataPack): entry = getattr(mod, module_str) for entry_specified in input_pack.get(entry_type=entry): result = self.spacy_nlp(entry_specified.text) - tokens = [(token.text, token.pos) for token in input_pack.get(Token, entry_specified)] + tokens = [(token) for token in input_pack.get(Token, entry_specified)] article = MedicalArticle( pack=input_pack, @@ -84,13 +85,17 @@ def _process(self, input_pack: DataPack): article.has_coref = True article.coref_groups = [] for cluster in result._.coref_clusters: - + mentions = [] for mention in cluster.mentions: - mention = MedicalEntityMention(input_pack, mention.start, mention.end) + mention = MedicalEntityMention( + input_pack, + tokens[mention.start].begin, + tokens[mention.end - 1].end, + ) mentions.append(mention) - group = CoreferenceGroup(input_pack) + group = CoreferenceGroup(input_pack) group.add_members(mentions) article.coref_groups.append(group) @@ -127,9 +132,9 @@ def record(self, record_meta: Dict[str, Set[str]]): Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
- """ # TODO: check docstring + """ # TODO: check docstring # TODO record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", - "has_coref" - } + "has_coref", + } From 6dafabebead99dc8dbf689c74eb6080b4979523c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Thu, 16 Jun 2022 18:01:26 +0800 Subject: [PATCH 04/63] add more default configs and comment --- .../processors/coreference_processor.py | 96 +++++++++++++++---- 1 file changed, 77 insertions(+), 19 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cf4dc52e..d434aa4d 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,11 @@ """ Coreference Processor """ -import os -import re -from typing import Dict, List, Set +from typing import Dict, Optional, Set import importlib +from boto import config + +from spacy.language import Language from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config @@ -34,14 +35,17 @@ class CoreferenceProcessor(PackProcessor): r""" - TODO: Add docstring + Implementation of this CoreferenceProcessor has been based on huggingface + NeuralCoref. Note that official released NeuralCoref uses a dated spaCy + version (2.1), which can cause segmentation fault with the spaCy we use (2.3). + Please install NeuralCoref by building from source: + + https://github.com/huggingface/neuralcoref """ def __init__(self): super().__init__() - # TODO - self.coref = None # TODO: add type - self.spacy_nlp = None # TODO: find an elegant way to set this. + self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way def set_up(self, configs: Config): import neuralcoref @@ -52,8 +56,19 @@ def set_up(self, configs: Config): "The SpaCy pipeline is not initialized, maybe you " "haven't called the initialization function." ) - kwargs = {} # TODO - neuralcoref.add_to_pipe(self.spacy_nlp) + + model = configs.model + cfg_inference = { + "greedyness": configs.greedyness, + "max_dist": configs.max_dist, + "max_dist_match": configs.max_dist_match, + "blacklist": configs.blacklist, + "store_scores": configs.store_scores, + "conv_dict": configs.conv_dict, + } + neuralcoref.add_to_pipe( + self.spacy_nlp, model=model, cfg_inference=cfg_inference + ) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -64,11 +79,12 @@ def _process(self, input_pack: DataPack): TODO: Add docstring """ path_str, module_str = self.configs.entry_type.rsplit(".", 1) - # By default, path_str would be ft.onto.base_ontology and module_str would be Document # TODO: check + # By default, path_str would be ft.onto.base_ontology + # and module_str would be Document # TODO: check mod = importlib.import_module(path_str) - entry = getattr(mod, module_str) - for entry_specified in input_pack.get(entry_type=entry): + entry_type = getattr(mod, module_str) + for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) tokens = [(token) for token in input_pack.get(Token, entry_specified)] @@ -100,14 +116,59 @@ def _process(self, input_pack: DataPack): article.coref_groups.append(group) + # @classmethod + # def default_configs(cls): + # r""" + # This defines a basic config structure for `CoreferenceProcessor`. 
+ + # Following are the keys for this dictionary: + # - `entry_type`: input entry type, + # - `model`: the neural net model to be used by NeuralCoref. If set to True + # (default), a new instance will be created with `NeuralCoref.Model()` + # in NeuralCoref.from_disk() or NeuralCoref.from_bytes(). + # - `cfg_inference`: A dict of configuration of inference. If set to an empty + # dict, the default configuration in NeuralCoref will be used. Available + # entries: `greedyness` (default 0.5), `max_dist` (default 50), + # `max_dist_match` (default 500), `blacklist` (default True), + # `store_scores` (default True), `conv_dict` (default None), + + # Returns: A dictionary with the default config for this processor. + # """ + # return { + # # TODO: remove unnecessaries + # "entry_type": "ft.onto.base_ontology.Document", + # "model": True, + # "cfg_inference": {}, + # } + @classmethod def default_configs(cls): r""" - TODO: Add docstring + This defines a basic config structure for `CoreferenceProcessor`. + + Following are the keys for this dictionary: + - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. + - `model`: the neural net model to be used by NeuralCoref. If set to `True` + (default), a new instance will be created with `NeuralCoref.Model()` + in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. + - `greedyness`: TODO. Default `0.5`. + - `max_dist`: TODO. Default `50`. + - `max_dist_match`: TODO. Default `500`. + - `blacklist`: TODO. Default `True`. + - `store_scores`: TODO. Default `True` + - `conv_dict`: TODO. Default `None`. + + Returns: A dictionary with the default config for this processor. """ return { - # TODO: remove unnecessaries "entry_type": "ft.onto.base_ontology.Document", + "model": True, + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, } def expected_types_and_attributes(self): @@ -118,22 +179,19 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. """ - return { - # TODO - } + return {"ft.onto.base_ontology.Document": set()} def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_clusters` + `coref_groups` and `has_coref` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. 
""" # TODO: check docstring - # TODO record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", From 0e18a8f370e24a442e5486d0ac9b478b78258e99 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:10:00 +0800 Subject: [PATCH 05/63] change nlp pipeline --- .../processors/coreference_processor.py | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index d434aa4d..7399a65c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,12 @@ """ Coreference Processor """ +from lib2to3.pgen2 import token from typing import Dict, Optional, Set import importlib -from boto import config +from numpy import append +import spacy from spacy.language import Language from forte.common import Resources, ProcessExecutionException @@ -25,7 +27,7 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token +from ft.onto.base_ontology import CoreferenceGroup, Token, EntityMention from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle __all__ = [ @@ -50,7 +52,11 @@ def __init__(self): def set_up(self, configs: Config): import neuralcoref - self.spacy_nlp = self.resources.get("spacy_processor").nlp + # TODO: remove these comments + # TODO: a more elegant way + # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor + self.spacy_nlp = spacy.load(configs.lang) + if self.spacy_nlp is None: raise ProcessExecutionException( "The SpaCy pipeline is not initialized, maybe you " @@ -76,17 +82,35 @@ def initialize(self, resources: Resources, configs: Config): def _process(self, input_pack: DataPack): r""" - TODO: Add docstring + Coreference resolution is done by + a spaCy pipeline with `NeuralCoref` in it. 
+ + We translate the output to `CoreferenceGroup` and + `MedicalEntityMention` """ - path_str, module_str = self.configs.entry_type.rsplit(".", 1) - # By default, path_str would be ft.onto.base_ontology - # and module_str would be Document # TODO: check - mod = importlib.import_module(path_str) - entry_type = getattr(mod, module_str) + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + # Default: Document + entry_type = load_module(self.configs.entry_type) + + # Default: MedicalEntityMention + mention_type = load_module(self.configs.mention_type) + for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - tokens = [(token) for token in input_pack.get(Token, entry_specified)] + + # TODO: remove these comments + # Marker155326 + # When tokenization is different from SpacyProcessor, this will be a bug: + token_begins = [] + token_ends = [] + for token in input_pack.get(Token, entry_specified): + token_begins.append(token.begin) + token_ends.append(token.end) article = MedicalArticle( pack=input_pack, @@ -104,10 +128,10 @@ def _process(self, input_pack: DataPack): mentions = [] for mention in cluster.mentions: - mention = MedicalEntityMention( + mention = mention_type( input_pack, - tokens[mention.start].begin, - tokens[mention.end - 1].end, + token_begins[mention.start], + token_ends[mention.end - 1], ) mentions.append(mention) @@ -116,31 +140,6 @@ def _process(self, input_pack: DataPack): article.coref_groups.append(group) - # @classmethod - # def default_configs(cls): - # r""" - # This defines a basic config structure for `CoreferenceProcessor`. - - # Following are the keys for this dictionary: - # - `entry_type`: input entry type, - # - `model`: the neural net model to be used by NeuralCoref. If set to True - # (default), a new instance will be created with `NeuralCoref.Model()` - # in NeuralCoref.from_disk() or NeuralCoref.from_bytes(). - # - `cfg_inference`: A dict of configuration of inference. If set to an empty - # dict, the default configuration in NeuralCoref will be used. Available - # entries: `greedyness` (default 0.5), `max_dist` (default 50), - # `max_dist_match` (default 500), `blacklist` (default True), - # `store_scores` (default True), `conv_dict` (default None), - - # Returns: A dictionary with the default config for this processor. - # """ - # return { - # # TODO: remove unnecessaries - # "entry_type": "ft.onto.base_ontology.Document", - # "model": True, - # "cfg_inference": {}, - # } - @classmethod def default_configs(cls): r""" @@ -148,6 +147,8 @@ def default_configs(cls): Following are the keys for this dictionary: - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. + - `mention_type`: Output mention type. Default `ftx.medical.clinical_ontology.MedicalEntityMention`. + It can also be set to `ft.onto.base_ontology.EntityMention`. - `model`: the neural net model to be used by NeuralCoref. If set to `True` (default), a new instance will be created with `NeuralCoref.Model()` in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. 
@@ -162,6 +163,8 @@ def default_configs(cls): """ return { "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", "model": True, "greedyness": 0.5, "max_dist": 50, From fcafb22468ee905f3aca4076917a634a894e8dc6 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:31:48 +0800 Subject: [PATCH 06/63] add more entries to MedicalArticle; add comment --- .../processors/coreference_processor.py | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 7399a65c..92f58950 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -121,9 +121,13 @@ def load_module(string): if not result._.has_coref: article.has_coref = False article.coref_groups = [] + article.coref_resolved = result._.coref_resolved + article.coref_scores = {} else: article.has_coref = True article.coref_groups = [] + article.coref_resolved = result._.coref_resolved + article.coref_scores = result._.coref_scores for cluster in result._.coref_clusters: mentions = [] @@ -146,18 +150,32 @@ def default_configs(cls): This defines a basic config structure for `CoreferenceProcessor`. Following are the keys for this dictionary: - - `entry_type`: Input entry type. Default `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. Default `ftx.medical.clinical_ontology.MedicalEntityMention`. - It can also be set to `ft.onto.base_ontology.EntityMention`. - - `model`: the neural net model to be used by NeuralCoref. If set to `True` - (default), a new instance will be created with `NeuralCoref.Model()` + - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: Output mention type. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `model`: the neural net model to be used by NeuralCoref. If set to `True`, + a new instance will be created with `NeuralCoref.Model()`. Default: `True`. in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. - - `greedyness`: TODO. Default `0.5`. - - `max_dist`: TODO. Default `50`. - - `max_dist_match`: TODO. Default `500`. - - `blacklist`: TODO. Default `True`. - - `store_scores`: TODO. Default `True` - - `conv_dict`: TODO. Default `None`. + - `greedyness` (`float`): A number between 0 and 1 determining how greedy + the model is about making coreference decisions + (more greedy means more coreference links). Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when considering possible + antecedents of the current mention. Decreasing the value will cause + the system to run faster but less accurately. Default: `50`. + - `max_dist_match` (`int`): The system will consider linking the current mention + to a preceding one further than max_dist away if they share a noun or + proper noun. In this case, it looks max_dist_match away instead. Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the + following list: ["i", "me", "my", "you", "your"]. Default `True`. + - `store_scores` (`bool`): Should the system store the scores for the coreferences + in annotations. 
Default: `True` + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use + to replace the embeddings of rare words (keys) by an average of the embeddings + of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the embeddings for the more + common woman and girl instead of the embedding of Angela. + This currently only works for single words (not for words groups). Default: `None`. Returns: A dictionary with the default config for this processor. """ @@ -188,7 +206,7 @@ def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_groups` and `has_coref` + `coref_groups`, `has_coref`, `coref_scores`, and `coref_resolved` to :attr:`forte.data.data_pack.Meta.record`. Args: @@ -198,4 +216,6 @@ def record(self, record_meta: Dict[str, Set[str]]): record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", + "coref_scores", + "coref_resolved", } From 0b675aa63b65ec213c4d0c9f191a88add15102c3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:36:33 +0800 Subject: [PATCH 07/63] fixed some comments --- fortex/health/processors/coreference_processor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 92f58950..ae7a685f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -38,10 +38,13 @@ class CoreferenceProcessor(PackProcessor): r""" Implementation of this CoreferenceProcessor has been based on huggingface - NeuralCoref. Note that official released NeuralCoref uses a dated spaCy + NeuralCoref. You can find more details in the original repo. + + Note that official released NeuralCoref uses a dated spaCy version (2.1), which can cause segmentation fault with the spaCy we use (2.3). - Please install NeuralCoref by building from source: + Please install NeuralCoref by building from source. + Referred repository link: https://github.com/huggingface/neuralcoref """ @@ -83,9 +86,9 @@ def initialize(self, resources: Resources, configs: Config): def _process(self, input_pack: DataPack): r""" Coreference resolution is done by - a spaCy pipeline with `NeuralCoref` in it. + a spaCy pipeline with `NeuralCoref` added. - We translate the output to `CoreferenceGroup` and + Then we translate the output to `CoreferenceGroup` and `MedicalEntityMention` """ From d0812eee84e8fe4eece88a6f5c0412751fb5b411 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 11:51:38 +0800 Subject: [PATCH 08/63] fix comments and format files --- fortex/health/processors/coreference_processor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index ae7a685f..c8c80e85 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -39,8 +39,8 @@ class CoreferenceProcessor(PackProcessor): r""" Implementation of this CoreferenceProcessor has been based on huggingface NeuralCoref. You can find more details in the original repo. 
- - Note that official released NeuralCoref uses a dated spaCy + + Note that the NeuralCoref package from PyPI uses a dated spaCy version (2.1), which can cause segmentation fault with the spaCy we use (2.3). Please install NeuralCoref by building from source. @@ -154,17 +154,17 @@ def default_configs(cls): Following are the keys for this dictionary: - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. + - `mention_type`: Output mention type. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. - `model`: the neural net model to be used by NeuralCoref. If set to `True`, a new instance will be created with `NeuralCoref.Model()`. Default: `True`. in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. - - `greedyness` (`float`): A number between 0 and 1 determining how greedy - the model is about making coreference decisions + - `greedyness` (`float`): A number between 0 and 1 determining how greedy + the model is about making coreference decisions (more greedy means more coreference links). Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when considering possible - antecedents of the current mention. Decreasing the value will cause + - `max_dist` (`int`): How many mentions back to look when considering possible + antecedents of the current mention. Decreasing the value will cause the system to run faster but less accurately. Default: `50`. - `max_dist_match` (`int`): The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or From 07694534df6a0e284400d7b9996da56b567cacce Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:08:00 +0800 Subject: [PATCH 09/63] update requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index c3516774..19b1db42 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,5 @@ git+https://git@github.com/asyml/forte-wrappers.git#egg=forte.huggingface&subdir dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 +# neuralcoref (build from source) for CoreferenceProcessor +git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref From 5a55b419fbc4659e772959ff2243896cea8db975 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:18:10 +0800 Subject: [PATCH 10/63] update setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 44e57751..5d5a5f20 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ From fdc343e714aa18a49fdb133b721b55acb4fa6ea1 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:50:46 +0800 Subject: [PATCH 11/63] add unit test --- .../processors/coreference_processor_test.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 tests/forte_medical/processors/coreference_processor_test.py diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py new file mode 100644 index 00000000..ee2b3320 --- /dev/null +++ b/tests/forte_medical/processors/coreference_processor_test.py 
@@ -0,0 +1,121 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for CoreferenceProcessor +""" + +import unittest +from ddt import data, ddt + +from forte.data.data_pack import DataPack +from forte.data.readers import StringReader +from forte.pipeline import Pipeline + +from ftx.medical.clinical_ontology import MedicalArticle +from ft.onto.base_ontology import ( + Token, +) + +from fortex.spacy import SpacyProcessor +from fortex.health.processors.coreference_processor import ( + CoreferenceProcessor, +) + + +class TestCoreferenceProcessor(unittest.TestCase): + def setUp(self): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add( + SpacyProcessor(), + {"processors": ["sentence", "tokenize"], "lang": "en_core_web_sm"}, + ) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": True, + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, + ) + + self.pl.initialize() + + @data("My sister has a dog. She loves him.") + def test_daily_language(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["My sister", "She"], ["a dog", "him"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list + + @data("My sister has a dog. She loves him.") + def test_daily_language(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["My sister", "She"], ["a dog", "him"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list + + @data( + """ADDENDUM: +RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. +This also moderate-sized left pleural effusion. +HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, but old infarction consistent with past medical history. +ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum most likely secondary to steoporosis. 
+These can be followed by repeat imaging as an outpatient.""" + ) + def test_medical_notes(self, input_data): + for pack in self.pl.process_dataset(input_data): + for article in pack.get(MedicalArticle): + has_coref = article.has_coref + assert has_coref == True + + coref_groups = article.coref_groups + output_list = [] + check_list = [["HEAD CT", "Head CT", "Abdominal CT"]] + for group in coref_groups: + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + assert output_list == check_list From 85bf9bb2526d50e297f78bdf3021965dff4e2d08 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:51:53 +0800 Subject: [PATCH 12/63] remove duplicated definition --- .../processors/coreference_processor_test.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index ee2b3320..5bf70b35 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,7 +16,7 @@ """ import unittest -from ddt import data, ddt +from ddt import data from forte.data.data_pack import DataPack from forte.data.readers import StringReader @@ -77,24 +77,6 @@ def test_daily_language(self, input_data): output_list.append(mention_texts) assert output_list == check_list - @data("My sister has a dog. She loves him.") - def test_daily_language(self, input_data): - for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref == True - - coref_groups = article.coref_groups - output_list = [] - check_list = [["My sister", "She"], ["a dog", "him"]] - for group in coref_groups: - members = [member for member in group.get_members()] - members = sorted(members, key=lambda x: x.begin) - - mention_texts = [member.text for member in members] - output_list.append(mention_texts) - assert output_list == check_list - @data( """ADDENDUM: RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. 
From 957992303601f459de57610cc7a86964b28e3aaf Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:56:18 +0800 Subject: [PATCH 13/63] fix ddt --- tests/forte_medical/processors/coreference_processor_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 5bf70b35..d8544975 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,7 +16,7 @@ """ import unittest -from ddt import data +from ddt import data, ddt from forte.data.data_pack import DataPack from forte.data.readers import StringReader @@ -32,7 +32,7 @@ CoreferenceProcessor, ) - +@ddt class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): self.pl = Pipeline[DataPack](enforce_consistency=True) From 4e41b349b26d6268ad51c57eaa481bf44fcdac90 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 12:57:49 +0800 Subject: [PATCH 14/63] fix import --- fortex/health/processors/coreference_processor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index c8c80e85..6b6fbebd 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,14 +14,14 @@ """ Coreference Processor """ -from lib2to3.pgen2 import token from typing import Dict, Optional, Set import importlib -from numpy import append import spacy from spacy.language import Language +import neuralcoref + from forte.common import Resources, ProcessExecutionException from forte.common.configuration import Config from forte.data.data_pack import DataPack @@ -53,8 +53,6 @@ def __init__(self): self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way def set_up(self, configs: Config): - import neuralcoref - # TODO: remove these comments # TODO: a more elegant way # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor From 443f886d1094514f541551288aa257c1e4932925 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:04:50 +0800 Subject: [PATCH 15/63] formatting --- .../processors/coreference_processor_test.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index d8544975..0ed8858e 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -32,6 +32,7 @@ CoreferenceProcessor, ) + @ddt class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): @@ -78,12 +79,16 @@ def test_daily_language(self, input_data): assert output_list == check_list @data( - """ADDENDUM: -RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. -This also moderate-sized left pleural effusion. -HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, but old infarction consistent with past medical history. -ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum most likely secondary to steoporosis. 
-These can be followed by repeat imaging as an outpatient.""" + "ADDENDUM:\n", + "RADIOLOGIC STUDIES: Radiologic studies also included ", + "a chest CT, which confirmed cavitary lesions ", + "in the left lung apex consistent with infectious process/tuberculosis.\n", + "This also moderate-sized left pleural effusion.\n", + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, ", + "but old infarction consistent with past medical history.\n", + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum ", + "most likely secondary to steoporosis.\n", + "These can be followed by repeat imaging as an outpatient.", ) def test_medical_notes(self, input_data): for pack in self.pl.process_dataset(input_data): From 5f3cf34e0552f3ef6230330e3c092cfaff8bb94b Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:10:07 +0800 Subject: [PATCH 16/63] remove long lines --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6b6fbebd..8f9d28f7 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -55,7 +55,8 @@ def __init__(self): def set_up(self, configs: Config): # TODO: remove these comments # TODO: a more elegant way - # self.spacy_nlp = self.resources.get("spacy_processor").nlp # borrow nlp from SpacyProcessor + # borrow nlp from SpacyProcessor + # self.spacy_nlp = self.resources.get("spacy_processor").nlp self.spacy_nlp = spacy.load(configs.lang) if self.spacy_nlp is None: From a11f2090b8310a5dac805105ef279b9598897b00 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:13:39 +0800 Subject: [PATCH 17/63] remove unused import --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 8f9d28f7..4659402f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -27,8 +27,8 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token, EntityMention -from ftx.medical.clinical_ontology import MedicalEntityMention, MedicalArticle +from ft.onto.base_ontology import CoreferenceGroup, Token +from ftx.medical.clinical_ontology import MedicalArticle __all__ = [ "CoreferenceProcessor", From 07dbb02d2233816b7e9ed8fb83cc49abd438bbc4 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 13:31:20 +0800 Subject: [PATCH 18/63] add cython to dependency --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 19b1db42..bb8ee956 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ setuptools~=57.0.0 transformers~=4.2.2 # neuralcoref (build from source) for CoreferenceProcessor git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref +cython>=0.25 diff --git a/setup.py b/setup.py index 5d5a5f20..cab3d825 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "cython>=0.25", "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], 
extras_require={ From 008eb1b79addbc5a5b0971f405a2acddec19f70d Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:12:23 +0800 Subject: [PATCH 19/63] delay the installation of neuralcoref --- requirements.txt | 8 +++++--- setup.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index bb8ee956..8f6ed340 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,6 +26,8 @@ git+https://git@github.com/asyml/forte-wrappers.git#egg=forte.huggingface&subdir dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 -# neuralcoref (build from source) for CoreferenceProcessor -git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref -cython>=0.25 + +# It is annoying that if we install neuralcoref and spacy at the same +# time, neuralcoref will throw "Cython failed" during building. +# Therefore, we must install neuralcoref after spacy is installed. +# git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref \ No newline at end of file diff --git a/setup.py b/setup.py index cab3d825..eba440f4 100644 --- a/setup.py +++ b/setup.py @@ -41,8 +41,6 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - "cython>=0.25", - "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ @@ -50,6 +48,10 @@ "testfixtures", "transformers==4.2.2", "protobuf==3.19.4", + # It is annoying that if we install neuralcoref and spacy at the same + # time, neuralcoref will throw "Cython failed" during building. + # Therefore, we must install neuralcoref after spacy is installed. + # "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], }, entry_points={ From ce2fd6f23d5b7b7c982f47c422b08b519a1459b0 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:14:15 +0800 Subject: [PATCH 20/63] put installation of neuralcoref in workflow --- .github/workflows/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3f31b46a..5b5866e8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -89,6 +89,10 @@ jobs: cd forte-wrappers pip install src/spacy + - name: Install NeuralCoref + run: | + git install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref + - name: Test with pytest and run coverage run: | coverage run -m pytest tests/ From e832289f3b9a92a17415361697aae31fb7597b31 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:28:03 +0800 Subject: [PATCH 21/63] fix typo --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5b5866e8..89bd3a6e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -91,7 +91,7 @@ jobs: - name: Install NeuralCoref run: | - git install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref + pip install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref - name: Test with pytest and run coverage run: | From c18f7043ffd048a7e98f434c9e7b3610f07da62f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:29:03 +0800 Subject: [PATCH 22/63] skip mypy's None is not callable bug --- fortex/health/processors/coreference_processor.py | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4659402f..cc5a609c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,11 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Optional, Set +from typing import Dict, Set #, Optional import importlib import spacy -from spacy.language import Language +# from spacy.language import Language import neuralcoref @@ -50,7 +50,7 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() - self.spacy_nlp: Optional[Language] = None # TODO: a more elegant way + self.spacy_nlp = None # TODO: a more elegant way def set_up(self, configs: Config): # TODO: remove these comments From a9cf38d98a87e001bae64fe55c11dd5d657f29dc Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:32:42 +0800 Subject: [PATCH 23/63] black format --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cc5a609c..6d12f8c2 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,10 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Set #, Optional +from typing import Dict, Set # , Optional import importlib import spacy + # from spacy.language import Language import neuralcoref From 2c346fb6a757d21ac1bc0ef4170f6660ee26a753 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:38:49 +0800 Subject: [PATCH 24/63] add spacy --- requirements.txt | 1 + setup.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8f6ed340..49661324 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 +spacy>=2.3.0, <=2.3.5 # It is annoying that if we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building. # Therefore, we must install neuralcoref after spacy is installed. 
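The install-order note above is why NeuralCoref is kept out of the static requirements and installed in a dedicated CI step instead (patch 20/63 above). For reference, a minimal stand-alone sketch of the NeuralCoref surface that `CoreferenceProcessor` consumes once the package is built from source, assuming the `en_core_web_sm` model is available: the `doc._` extension attributes and the cluster/mention objects mirror the calls made in `_process`, the sample sentence is the one from the unit test, and the bare `add_to_pipe(nlp)` call is a simplification of the configured call made in `set_up`.

    import spacy
    import neuralcoref  # built from source, per the comment above

    nlp = spacy.load("en_core_web_sm")
    # set_up() additionally forwards the model and inference options here
    neuralcoref.add_to_pipe(nlp)

    doc = nlp("My sister has a dog. She loves him.")
    if doc._.has_coref:
        for cluster in doc._.coref_clusters:
            # each cluster groups the spaCy spans of one coreference chain
            print([mention.text for mention in cluster.mentions])
        # text with each mention replaced by its most representative mention
        print(doc._.coref_resolved)
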
diff --git a/setup.py b/setup.py index eba440f4..ce1e862b 100644 --- a/setup.py +++ b/setup.py @@ -6,23 +6,22 @@ long_description = (Path(__file__).parent / "README.md").read_text() if sys.version_info < (3, 6): - sys.exit('Python>=3.6 is required by forte-medical.') + sys.exit("Python>=3.6 is required by forte-medical.") setuptools.setup( name="forte.health", - version='0.1.0', + version="0.1.0", url="https://github.com/asyml/ForteHealth", description="NLP pipeline framework for biomedical and clinical domains", long_description=long_description, long_description_content_type="text/markdown", - license='Apache License Version 2.0', + license="Apache License Version 2.0", packages=setuptools.find_namespace_packages( - include=['fortex.health', 'ftx.*'], - exclude=["scripts*", "examples*", "tests*"] + include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], install_requires=[ - 'forte~=0.2.0', + "forte~=0.2.0", "sortedcontainers==2.1.0", "numpy>=1.16.6", "jsonpickle==1.4", @@ -41,6 +40,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", + "spacy>=2.3.0, <=2.3.5", ], extras_require={ "test": [ @@ -55,12 +55,12 @@ ], }, entry_points={ - 'console_scripts': [ + "console_scripts": [ "forte-medical-train=forte_medical_cli.train:main", "forte-medical-process=forte_medical_cli.process:main", "forte-medical-evaluate=forte_medical_cli.evaluate:main", ] }, include_package_data=True, - python_requires='>=3.6' + python_requires=">=3.6", ) From 037f33bc81d420243c50c246953927f62031d9f3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 14:58:44 +0800 Subject: [PATCH 25/63] add cython and pytest --- requirements.txt | 2 ++ setup.py | 2 ++ tests/forte_medical/processors/coreference_processor_test.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 49661324..ee79c660 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,8 @@ setuptools~=57.0.0 transformers~=4.2.2 spacy>=2.3.0, <=2.3.5 +cython>=0.25 +pytest # It is annoying that if we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building. # Therefore, we must install neuralcoref after spacy is installed. 
diff --git a/setup.py b/setup.py index ce1e862b..8ff7e22b 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ "fastapi==0.65.2", "uvicorn==0.14.0", "spacy>=2.3.0, <=2.3.5", + "cython>=0.25", + "pytest", ], extras_require={ "test": [ diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 0ed8858e..b20e3050 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -65,7 +65,7 @@ def test_daily_language(self, input_data): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref - assert has_coref == True + assert has_coref is True coref_groups = article.coref_groups output_list = [] @@ -94,7 +94,7 @@ def test_medical_notes(self, input_data): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref - assert has_coref == True + assert has_coref is True coref_groups = article.coref_groups output_list = [] From 5856d15b4520dcf5aae64b14a32a04009667718c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 15:00:30 +0800 Subject: [PATCH 26/63] remove commented code --- fortex/health/processors/coreference_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6d12f8c2..4d9d4ad1 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -14,13 +14,11 @@ """ Coreference Processor """ -from typing import Dict, Set # , Optional +from typing import Dict, Set import importlib import spacy -# from spacy.language import Language - import neuralcoref from forte.common import Resources, ProcessExecutionException From 03b6d69eea50bdb2d0cb882fea252b9ec70bbd87 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 15:06:07 +0800 Subject: [PATCH 27/63] fix unit test data --- .../processors/coreference_processor_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index b20e3050..b8d68cda 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -80,14 +80,14 @@ def test_daily_language(self, input_data): @data( "ADDENDUM:\n", - "RADIOLOGIC STUDIES: Radiologic studies also included ", - "a chest CT, which confirmed cavitary lesions ", - "in the left lung apex consistent with infectious process/tuberculosis.\n", - "This also moderate-sized left pleural effusion.\n", - "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, ", - "but old infarction consistent with past medical history.\n", - "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum ", - "most likely secondary to steoporosis.\n", + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no 
lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" "These can be followed by repeat imaging as an outpatient.", ) def test_medical_notes(self, input_data): From 9e5521160e0a44289239410b09e0e14c61dc0eba Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 19:10:57 +0800 Subject: [PATCH 28/63] fix unit test data 2 --- tests/forte_medical/processors/coreference_processor_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index b8d68cda..7f78de83 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -79,7 +79,7 @@ def test_daily_language(self, input_data): assert output_list == check_list @data( - "ADDENDUM:\n", + "ADDENDUM:\n" "RADIOLOGIC STUDIES: Radiologic studies also included " "a chest CT, which confirmed cavitary lesions " "in the left lung apex consistent with infectious process/tuberculosis.\n" From 2730b885d57bd1b4d719dbf38176d2ec9dc1ebaf Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:40:35 +0800 Subject: [PATCH 29/63] remove the dependency of SpacyProcessor --- .../processors/coreference_processor.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4d9d4ad1..cb2c011e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -26,7 +26,7 @@ from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor -from ft.onto.base_ontology import CoreferenceGroup, Token +from ft.onto.base_ontology import CoreferenceGroup from ftx.medical.clinical_ontology import MedicalArticle __all__ = [ @@ -49,13 +49,9 @@ class CoreferenceProcessor(PackProcessor): def __init__(self): super().__init__() - self.spacy_nlp = None # TODO: a more elegant way + self.spacy_nlp = None def set_up(self, configs: Config): - # TODO: remove these comments - # TODO: a more elegant way - # borrow nlp from SpacyProcessor - # self.spacy_nlp = self.resources.get("spacy_processor").nlp self.spacy_nlp = spacy.load(configs.lang) if self.spacy_nlp is None: @@ -104,15 +100,6 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - # TODO: remove these comments - # Marker155326 - # When tokenization is different from SpacyProcessor, this will be a bug: - token_begins = [] - token_ends = [] - for token in input_pack.get(Token, entry_specified): - token_begins.append(token.begin) - token_ends.append(token.end) - article = MedicalArticle( pack=input_pack, begin=entry_specified.span.begin, @@ -135,8 +122,8 @@ def load_module(string): for mention in cluster.mentions: mention = mention_type( input_pack, - token_begins[mention.start], - token_ends[mention.end - 1], + mention.start_char, + mention.end_char, ) mentions.append(mention) From 5f6d0242e4f168c536b66b70b984afb767b1be72 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:43:05 +0800 Subject: [PATCH 30/63] update unit test --- .../forte_medical/processors/coreference_processor_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py 
b/tests/forte_medical/processors/coreference_processor_test.py index 7f78de83..93ad1cfa 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -26,8 +26,6 @@ from ft.onto.base_ontology import ( Token, ) - -from fortex.spacy import SpacyProcessor from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -38,10 +36,6 @@ class TestCoreferenceProcessor(unittest.TestCase): def setUp(self): self.pl = Pipeline[DataPack](enforce_consistency=True) self.pl.set_reader(StringReader()) - self.pl.add( - SpacyProcessor(), - {"processors": ["sentence", "tokenize"], "lang": "en_core_web_sm"}, - ) self.pl.add( CoreferenceProcessor(), { From c9d56a412802c86e5728e7f3c07a9b7acab3e7df Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:43:53 +0800 Subject: [PATCH 31/63] remove TODO --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index cb2c011e..e2e8423d 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -200,7 +200,7 @@ def record(self, record_meta: Dict[str, Set[str]]): Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. - """ # TODO: check docstring + """ record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { "coref_groups", "has_coref", From 09114ad6fc1bbab71abf12d59c07e0ecc4278e84 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:55:22 +0800 Subject: [PATCH 32/63] add load_lang_model --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index e2e8423d..36e9b8c0 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,7 +17,7 @@ from typing import Dict, Set import importlib -import spacy +from fortex.spacy.spacy_processors import load_lang_model import neuralcoref @@ -52,7 +52,7 @@ def __init__(self): self.spacy_nlp = None def set_up(self, configs: Config): - self.spacy_nlp = spacy.load(configs.lang) + self.spacy_nlp = load_lang_model(configs.lang) if self.spacy_nlp is None: raise ProcessExecutionException( From cb676c38f4adb7eb819d6fcf21f7be27a8efa59b Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 20:59:59 +0800 Subject: [PATCH 33/63] remove spacy from requirements and setup --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ee79c660..fc0f8c81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,7 @@ dataclasses~=0.8; python_version < '3.7' setuptools~=57.0.0 transformers~=4.2.2 -spacy>=2.3.0, <=2.3.5 +# spacy>=2.3.0, <=2.3.5 # will be installed by forte.spacy cython>=0.25 pytest # It is annoying that if we install neuralcoref and spacy at the same diff --git a/setup.py b/setup.py index 8ff7e22b..ed849bfa 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - "spacy>=2.3.0, <=2.3.5", + # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy "cython>=0.25", 
"pytest", ], From e5ef675970c5bbd869166d95fd9f4c36382c9cb3 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 17 Jun 2022 21:01:55 +0800 Subject: [PATCH 34/63] change import order --- fortex/health/processors/coreference_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 36e9b8c0..6c89ef1c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -17,8 +17,6 @@ from typing import Dict, Set import importlib -from fortex.spacy.spacy_processors import load_lang_model - import neuralcoref from forte.common import Resources, ProcessExecutionException @@ -29,6 +27,8 @@ from ft.onto.base_ontology import CoreferenceGroup from ftx.medical.clinical_ontology import MedicalArticle +from fortex.spacy.spacy_processors import load_lang_model + __all__ = [ "CoreferenceProcessor", ] From 8b18e6740dc81c16c9495b743cc0b2a63ea5cc8f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 12:02:12 +0800 Subject: [PATCH 35/63] use ddt data and unpack --- .../processors/coreference_processor_test.py | 54 +++++++------------ 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 93ad1cfa..d5eb3cbd 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -16,16 +16,13 @@ """ import unittest -from ddt import data, ddt +from ddt import data, ddt, unpack from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline from ftx.medical.clinical_ontology import MedicalArticle -from ft.onto.base_ontology import ( - Token, -) from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -54,37 +51,27 @@ def setUp(self): self.pl.initialize() - @data("My sister has a dog. 
She loves him.") - def test_daily_language(self, input_data): - for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref is True - - coref_groups = article.coref_groups - output_list = [] - check_list = [["My sister", "She"], ["a dog", "him"]] - for group in coref_groups: - members = [member for member in group.get_members()] - members = sorted(members, key=lambda x: x.begin) - - mention_texts = [member.text for member in members] - output_list.append(mention_texts) - assert output_list == check_list - @data( - "ADDENDUM:\n" - "RADIOLOGIC STUDIES: Radiologic studies also included " - "a chest CT, which confirmed cavitary lesions " - "in the left lung apex consistent with infectious process/tuberculosis.\n" - "This also moderate-sized left pleural effusion.\n" - "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " - "but old infarction consistent with past medical history.\n" - "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " - "most likely secondary to steoporosis.\n" - "These can be followed by repeat imaging as an outpatient.", + ( + "ADDENDUM:\n" + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" + "These can be followed by repeat imaging as an outpatient.", + [["HEAD CT", "Head CT", "Abdominal CT"]], + ), + ( + "My sister has a dog. She loves him.", + [["My sister", "She"], ["a dog", "him"]], + ), ) - def test_medical_notes(self, input_data): + @unpack + def test_medical_notes(self, input_data, check_list): for pack in self.pl.process_dataset(input_data): for article in pack.get(MedicalArticle): has_coref = article.has_coref @@ -92,7 +79,6 @@ def test_medical_notes(self, input_data): coref_groups = article.coref_groups output_list = [] - check_list = [["HEAD CT", "Head CT", "Abdominal CT"]] for group in coref_groups: members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) From 299d6d3421b01387ba47bbbd03502264380459d1 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 16:37:14 +0800 Subject: [PATCH 36/63] update config structure --- .../processors/coreference_processor.py | 121 +++++++++--------- 1 file changed, 62 insertions(+), 59 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 6c89ef1c..3680ad73 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -25,7 +25,6 @@ from forte.processors.base import PackProcessor from ft.onto.base_ontology import CoreferenceGroup -from ftx.medical.clinical_ontology import MedicalArticle from fortex.spacy.spacy_processors import load_lang_model @@ -60,15 +59,8 @@ def set_up(self, configs: Config): "haven't called the initialization function." 
) - model = configs.model - cfg_inference = { - "greedyness": configs.greedyness, - "max_dist": configs.max_dist, - "max_dist_match": configs.max_dist_match, - "blacklist": configs.blacklist, - "store_scores": configs.store_scores, - "conv_dict": configs.conv_dict, - } + model = configs.model if configs.model != "use_default_model" else True + cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( self.spacy_nlp, model=model, cfg_inference=cfg_inference ) @@ -82,8 +74,7 @@ def _process(self, input_pack: DataPack): Coreference resolution is done by a spaCy pipeline with `NeuralCoref` added. - Then we translate the output to `CoreferenceGroup` and - `MedicalEntityMention` + Then we translate the output to `CoreferenceGroup`. """ def load_module(string): @@ -100,22 +91,9 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - article = MedicalArticle( - pack=input_pack, - begin=entry_specified.span.begin, - end=entry_specified.span.end, - ) - if not result._.has_coref: - article.has_coref = False - article.coref_groups = [] - article.coref_resolved = result._.coref_resolved - article.coref_scores = {} + continue else: - article.has_coref = True - article.coref_groups = [] - article.coref_resolved = result._.coref_resolved - article.coref_scores = result._.coref_scores for cluster in result._.coref_clusters: mentions = [] @@ -130,48 +108,79 @@ def load_module(string): group = CoreferenceGroup(input_pack) group.add_members(mentions) - article.coref_groups.append(group) - @classmethod def default_configs(cls): r""" This defines a basic config structure for `CoreferenceProcessor`. Following are the keys for this dictionary: - - `entry_type`: Input entry type. Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: Output mention type. - Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - It can also be set to `"ft.onto.base_ontology.EntityMention"`. - - `model`: the neural net model to be used by NeuralCoref. If set to `True`, - a new instance will be created with `NeuralCoref.Model()`. Default: `True`. - in `NeuralCoref.from_disk()` or `NeuralCoref.from_bytes()`. + - `entry_type`: Input entry type. You can change the context of + coreference resolution by setting this parameter. For example, + if you want to do coreference resolution within documents, set + it to `"ft.onto.base_ontology.Document"`. If you want to do + coreference resolution within sentences, set it to + `"ft.onto.base_ontology.Sentence"`. + Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: The type of members in `CoreferenceGroup`. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `model`: the neural net model to be used by NeuralCoref. If set to + `"use_default_model"`, a pre-trained neural net will be downloaded and cached. + If set to your customized model, the model needs to be a tuple containing a + `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in + https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx + for reference of how the default model is defined. + Default: `"use_default_model"`. + - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See + `get_default_cfg_inference` for default values, and see + https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters + for the meaing of these parameters. 
+ + Returns: A dictionary with the default config for this processor. + """ + return { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": cls.get_default_cfg_inference(), + } + + @classmethod + def get_default_cfg_inference(cls): + """ + This defines the default inference config of NeuralCoref. + + Following are the keys for this dictionary: - `greedyness` (`float`): A number between 0 and 1 determining how greedy the model is about making coreference decisions - (more greedy means more coreference links). Default: `0.5`. + (more greedy means more coreference links). + Default: `0.5`. - `max_dist` (`int`): How many mentions back to look when considering possible antecedents of the current mention. Decreasing the value will cause - the system to run faster but less accurately. Default: `50`. + the system to run faster but less accurately. + Default: `50`. - `max_dist_match` (`int`): The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or - proper noun. In this case, it looks max_dist_match away instead. Default: `500`. + proper noun. In this case, it looks max_dist_match away instead. + Default: `500`. - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the - following list: ["i", "me", "my", "you", "your"]. Default `True`. + following list: ["i", "me", "my", "you", "your"]. + Default `True`. - `store_scores` (`bool`): Should the system store the scores for the coreferences - in annotations. Default: `True` + in annotations. + Default: `True` - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use to replace the embeddings of rare words (keys) by an average of the embeddings - of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the embeddings for the more - common woman and girl instead of the embedding of Angela. - This currently only works for single words (not for words groups). Default: `None`. + of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the embeddings for the more + common woman and girl instead of the embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. - Returns: A dictionary with the default config for this processor. + Returns: A dictionary with the default inference config of NeuralCoref. """ return { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "model": True, "greedyness": 0.5, "max_dist": 50, "max_dist_match": 500, @@ -188,22 +197,16 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. """ - return {"ft.onto.base_ontology.Document": set()} + return {self.configs.entry_type: set("text")} def record(self, record_meta: Dict[str, Set[str]]): r""" Method to add output type record of `CoreferenceProcessor` which - is `"ftx.medical.clinical_ontology.MedicalArticle"` with attribute - `coref_groups`, `has_coref`, `coref_scores`, and `coref_resolved` - to :attr:`forte.data.data_pack.Meta.record`. 
+ is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute + `members` to :attr:`forte.data.data_pack.Meta.record`. Args: record_meta: the field in the datapack for type record that need to fill in for consistency checking. """ - record_meta["ftx.medical.clinical_ontology.MedicalArticle"] = { - "coref_groups", - "has_coref", - "coref_scores", - "coref_resolved", - } + record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} From 1a0e23926df2466a711790d530991d0beb540b2c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 16:53:57 +0800 Subject: [PATCH 37/63] add comment for lang --- fortex/health/processors/coreference_processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 3680ad73..f14ba9c8 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -124,6 +124,9 @@ def default_configs(cls): - `mention_type`: The type of members in `CoreferenceGroup`. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. + - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing + steps for NeuralCoref. + Default: `"en_core_web_sm"`. - `model`: the neural net model to be used by NeuralCoref. If set to `"use_default_model"`, a pre-trained neural net will be downloaded and cached. If set to your customized model, the model needs to be a tuple containing a From 36bcabade1d160eafaf6291e427657874ef38a6a Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 17:10:19 +0800 Subject: [PATCH 38/63] fix set() bug --- .../processors/coreference_processor.py | 2 +- .../processors/coreference_processor_test.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index f14ba9c8..317b9208 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -200,7 +200,7 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. 
""" - return {self.configs.entry_type: set("text")} + return {self.configs.entry_type: {"text"}} def record(self, record_meta: Dict[str, Set[str]]): r""" diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index d5eb3cbd..293ddae5 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -22,7 +22,7 @@ from forte.data.readers import StringReader from forte.pipeline import Pipeline -from ftx.medical.clinical_ontology import MedicalArticle +from ft.onto.base_ontology import Document, CoreferenceGroup from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -39,13 +39,15 @@ def setUp(self): "entry_type": "ft.onto.base_ontology.Document", "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": True, - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "store_scores": True, - "conv_dict": None, + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, }, ) @@ -73,13 +75,9 @@ def setUp(self): @unpack def test_medical_notes(self, input_data, check_list): for pack in self.pl.process_dataset(input_data): - for article in pack.get(MedicalArticle): - has_coref = article.has_coref - assert has_coref is True - - coref_groups = article.coref_groups + for document in pack.get(Document): output_list = [] - for group in coref_groups: + for group in pack.get(CoreferenceGroup, document): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) From 3a20b8dbad909ff6879e8e3c18e139d6b51f1aeb Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:25:32 +0800 Subject: [PATCH 39/63] add offset calculation assertion --- fortex/health/processors/coreference_processor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 317b9208..f9b56130 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -98,11 +98,16 @@ def load_module(string): mentions = [] for mention in cluster.mentions: + mention_text = mention.text mention = mention_type( input_pack, - mention.start_char, - mention.end_char, + mention.start_char + entry_specified.begin, + mention.end_char + entry_specified.begin, ) + assert (mention.text == mention_text, # TODO: remove assertion? + f"The processor extracted mention {mention.text}" + f" which is different from the original mention {mention_text}." + f"The offeset calculation is wrong.") mentions.append(mention) group = CoreferenceGroup(input_pack) @@ -200,7 +205,8 @@ def expected_types_and_attributes(self): :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. 
""" - return {self.configs.entry_type: {"text"}} + # return {self.configs.entry_type: {"text"}} # TODO: fix this + return {self.configs.entry_type: set()} def record(self, record_meta: Dict[str, Set[str]]): r""" From 26efabd216699311bcb67b12b0d498feb50a0839 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:32:35 +0800 Subject: [PATCH 40/63] formatting --- fortex/health/processors/coreference_processor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index f9b56130..08d78219 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -104,10 +104,12 @@ def load_module(string): mention.start_char + entry_specified.begin, mention.end_char + entry_specified.begin, ) - assert (mention.text == mention_text, # TODO: remove assertion? - f"The processor extracted mention {mention.text}" - f" which is different from the original mention {mention_text}." - f"The offeset calculation is wrong.") + assert ( + mention.text == mention_text, # TODO: remove assertion? + f"The processor extracted mention {mention.text}" + f" which is different from the original mention {mention_text}." + f"The offeset calculation is wrong.", + ) mentions.append(mention) group = CoreferenceGroup(input_pack) From f7db0245aefdcffc2034f8b4e780e11f6663e649 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:34:12 +0800 Subject: [PATCH 41/63] udpate test --- .../processors/coreference_processor_test.py | 99 ++++++++++++++----- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 293ddae5..71a892c3 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -15,6 +15,7 @@ Unit tests for CoreferenceProcessor """ +import importlib import unittest from ddt import data, ddt, unpack @@ -26,32 +27,33 @@ from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) +from fortex.spacy import SpacyProcessor @ddt class TestCoreferenceProcessor(unittest.TestCase): - def setUp(self): - self.pl = Pipeline[DataPack](enforce_consistency=True) - self.pl.set_reader(StringReader()) - self.pl.add( - CoreferenceProcessor(), - { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "model": "use_default_model", - "cfg_inference": { - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "store_scores": True, - "conv_dict": None, - }, - }, - ) + # def setUp(self): + # self.pl = Pipeline[DataPack](enforce_consistency=True) + # self.pl.set_reader(StringReader()) + # self.pl.add( + # CoreferenceProcessor(), + # { + # "entry_type": "ft.onto.base_ontology.Document", + # "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + # "lang": "en_core_web_sm", + # "model": "use_default_model", + # "cfg_inference": { + # "greedyness": 0.5, + # "max_dist": 50, + # "max_dist_match": 500, + # "blacklist": True, + # "store_scores": True, + # "conv_dict": None, + # }, + # }, + # ) - self.pl.initialize() + # self.pl.initialize() @data( ( @@ -66,21 +68,66 @@ def setUp(self): "most likely secondary to steoporosis.\n" "These can be 
followed by repeat imaging as an outpatient.", [["HEAD CT", "Head CT", "Abdominal CT"]], + "ft.onto.base_ontology.Document", ), ( "My sister has a dog. She loves him.", [["My sister", "She"], ["a dog", "him"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["My aunt", "him"]], + "ft.onto.base_ontology.Sentence", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["her dog", "him"]], + "ft.onto.base_ontology.Document", + # Document-level coref is different from sentence-level. ), ) @unpack - def test_medical_notes(self, input_data, check_list): + def test_medical_notes(self, input_data, check_list, entry_type): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "store_scores": True, + "conv_dict": None, + }, + }, + ) + + self.pl.initialize() + + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + entry_type = load_module(entry_type) + for pack in self.pl.process_dataset(input_data): - for document in pack.get(Document): - output_list = [] - for group in pack.get(CoreferenceGroup, document): + output_list = [] + + for document in pack.get(entry_type): + for group in document.get(CoreferenceGroup): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) mention_texts = [member.text for member in members] output_list.append(mention_texts) - assert output_list == check_list + + self.assertEqual(output_list, check_list, f"input: {document.text}") From f453e60f94db9ccdd9b45132f0de26479e0eec40 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:44:37 +0800 Subject: [PATCH 42/63] shorten comment --- .../processors/coreference_processor.py | 96 +++++++++++-------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 08d78219..d591fbbe 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -35,12 +35,12 @@ class CoreferenceProcessor(PackProcessor): r""" - Implementation of this CoreferenceProcessor has been based on huggingface - NeuralCoref. You can find more details in the original repo. + Implementation of this CoreferenceProcessor has been based on + huggingface NeuralCoref. You can find more details in the original repo. Note that the NeuralCoref package from PyPI uses a dated spaCy - version (2.1), which can cause segmentation fault with the spaCy we use (2.3). - Please install NeuralCoref by building from source. + version (2.1), which can cause segmentation fault with the spaCy + we use (2.3). Please install NeuralCoref by building from source. Referred repository link: https://github.com/huggingface/neuralcoref @@ -59,7 +59,10 @@ def set_up(self, configs: Config): "haven't called the initialization function." 
) - model = configs.model if configs.model != "use_default_model" else True + if configs.model != "use_default_model": + model = configs.model + else: + model = True cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( self.spacy_nlp, model=model, cfg_inference=cfg_inference @@ -104,11 +107,14 @@ def load_module(string): mention.start_char + entry_specified.begin, mention.end_char + entry_specified.begin, ) + + # TODO: remove assertion? assert ( - mention.text == mention_text, # TODO: remove assertion? + mention.text == mention_text, f"The processor extracted mention {mention.text}" - f" which is different from the original mention {mention_text}." - f"The offeset calculation is wrong.", + f" which is different from the original mention" + f" {mention_text}. The offeset calculation" + f" is wrong.", ) mentions.append(mention) @@ -131,18 +137,20 @@ def default_configs(cls): - `mention_type`: The type of members in `CoreferenceGroup`. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. It can also be set to `"ft.onto.base_ontology.EntityMention"`. - - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing - steps for NeuralCoref. + - `lang`: The SpaCy pipeline to be used. The pipeline does the + preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`. - - `model`: the neural net model to be used by NeuralCoref. If set to - `"use_default_model"`, a pre-trained neural net will be downloaded and cached. - If set to your customized model, the model needs to be a tuple containing a - `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in + - `model`: the neural net model to be used by NeuralCoref. If set + to `"use_default_model"`, a pre-trained neural net will be + downloaded and cached. + If set to your customized model, the model needs to be a tuple + containing a `single_model` and a `pairs_model`. + See `NeuralCoref.Model` method in https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx for reference of how the default model is defined. Default: `"use_default_model"`. - - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See - `get_default_cfg_inference` for default values, and see + - `cfg_inference`: A dict containing the inference configs of + NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters for the meaing of these parameters. @@ -162,31 +170,37 @@ def get_default_cfg_inference(cls): This defines the default inference config of NeuralCoref. Following are the keys for this dictionary: - - `greedyness` (`float`): A number between 0 and 1 determining how greedy - the model is about making coreference decisions - (more greedy means more coreference links). - Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when considering possible - antecedents of the current mention. Decreasing the value will cause - the system to run faster but less accurately. - Default: `50`. - - `max_dist_match` (`int`): The system will consider linking the current mention - to a preceding one further than max_dist away if they share a noun or - proper noun. In this case, it looks max_dist_match away instead. - Default: `500`. - - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the - following list: ["i", "me", "my", "you", "your"]. - Default `True`. - - `store_scores` (`bool`): Should the system store the scores for the coreferences - in annotations. 
- Default: `True` - - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use - to replace the embeddings of rare words (keys) by an average of the embeddings - of a list of common words (values). Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the embeddings for the more - common woman and girl instead of the embedding of Angela. - This currently only works for single words (not for words groups). - Default: `None`. + - `greedyness` (`float`): A number between 0 and 1 determining + how greedy the model is about making coreference decisions + (more greedy means more coreference links). + Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when + considering possible antecedents of the current mention. + Decreasing the value will cause the system to run faster + but less accurately. + Default: `50`. + - `max_dist_match` (`int`): The system will consider linking + the current mention + to a preceding one further than max_dist away if they share + a noun or proper noun. In this case, it looks max_dist_match + away instead. + Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences + for pronouns in the following list: ["i", "me", "my", "you", "your"]. + Default `True`. + - `store_scores` (`bool`): Should the system store the scores + for the coreferences in annotations. + Default: `True` + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary + that you can use + to replace the embeddings of rare words (keys) by an average + of the embeddings of a list of common words (values). + Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the + embeddings for the more common woman and girl instead of the + embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. Returns: A dictionary with the default inference config of NeuralCoref. """ From 1d006fdb4e2e9ec2d63f71ae3332733f84b3d287 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:45:55 +0800 Subject: [PATCH 43/63] remove store_scores --- fortex/health/processors/coreference_processor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index d591fbbe..bc0bbe33 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -188,9 +188,6 @@ def get_default_cfg_inference(cls): - `blacklist` (`bool`): Should the system resolve coreferences for pronouns in the following list: ["i", "me", "my", "you", "your"]. Default `True`. - - `store_scores` (`bool`): Should the system store the scores - for the coreferences in annotations. 
- Default: `True` - `conv_dict` (`dict(str, list(str))`): A conversion dictionary that you can use to replace the embeddings of rare words (keys) by an average @@ -209,7 +206,6 @@ def get_default_cfg_inference(cls): "max_dist": 50, "max_dist_match": 500, "blacklist": True, - "store_scores": True, "conv_dict": None, } From dcec89faaeb95083f5f542e9d9ac8a8d89b45767 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 18:51:35 +0800 Subject: [PATCH 44/63] fix assertion --- fortex/health/processors/coreference_processor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index bc0bbe33..438a476f 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -110,12 +110,10 @@ def load_module(string): # TODO: remove assertion? assert ( - mention.text == mention_text, - f"The processor extracted mention {mention.text}" - f" which is different from the original mention" - f" {mention_text}. The offeset calculation" - f" is wrong.", - ) + mention.text == mention_text + ), f"The processor extracted mention {mention.text}" + f" which is different from the original mention" + f" {mention_text}. The offeset calculation is wrong." mentions.append(mention) group = CoreferenceGroup(input_pack) From 465341d6c762c4119836fecd3dab301cf471163e Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:07:46 +0800 Subject: [PATCH 45/63] remove store_scores in test --- .../processors/coreference_processor_test.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 71a892c3..5e250b9d 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -32,29 +32,6 @@ @ddt class TestCoreferenceProcessor(unittest.TestCase): - # def setUp(self): - # self.pl = Pipeline[DataPack](enforce_consistency=True) - # self.pl.set_reader(StringReader()) - # self.pl.add( - # CoreferenceProcessor(), - # { - # "entry_type": "ft.onto.base_ontology.Document", - # "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - # "lang": "en_core_web_sm", - # "model": "use_default_model", - # "cfg_inference": { - # "greedyness": 0.5, - # "max_dist": 50, - # "max_dist_match": 500, - # "blacklist": True, - # "store_scores": True, - # "conv_dict": None, - # }, - # }, - # ) - - # self.pl.initialize() - @data( ( "ADDENDUM:\n" @@ -104,7 +81,6 @@ def test_medical_notes(self, input_data, check_list, entry_type): "max_dist": 50, "max_dist_match": 500, "blacklist": True, - "store_scores": True, "conv_dict": None, }, }, From 3ee2f7bbba7be938ec1b6320bc8718f348cc408c Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:08:45 +0800 Subject: [PATCH 46/63] rename document to entry --- .../processors/coreference_processor_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 5e250b9d..f72e18cf 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -23,7 +23,7 @@ from forte.data.readers import StringReader from 
forte.pipeline import Pipeline -from ft.onto.base_ontology import Document, CoreferenceGroup +from ft.onto.base_ontology import CoreferenceGroup from fortex.health.processors.coreference_processor import ( CoreferenceProcessor, ) @@ -98,12 +98,12 @@ def load_module(string): for pack in self.pl.process_dataset(input_data): output_list = [] - for document in pack.get(entry_type): - for group in document.get(CoreferenceGroup): + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): members = [member for member in group.get_members()] members = sorted(members, key=lambda x: x.begin) mention_texts = [member.text for member in members] output_list.append(mention_texts) - self.assertEqual(output_list, check_list, f"input: {document.text}") + self.assertEqual(output_list, check_list, f"input: {entry.text}") From 1d44ff738dcccf11e437bec422bf533ed04c879f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:36:47 +0800 Subject: [PATCH 47/63] fix cfg_inference kwargs --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 438a476f..b1b11f81 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -65,7 +65,7 @@ def set_up(self, configs: Config): model = True cfg_inference = configs.cfg_inference neuralcoref.add_to_pipe( - self.spacy_nlp, model=model, cfg_inference=cfg_inference + self.spacy_nlp, model=model, **cfg_inference ) def initialize(self, resources: Resources, configs: Config): From 0ad95d8339ac3db82cef669ce4230a7deee03a4d Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:37:49 +0800 Subject: [PATCH 48/63] add conv_dict test --- .../processors/coreference_processor_test.py | 61 ++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index f72e18cf..e8639c5e 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -56,6 +56,7 @@ class TestCoreferenceProcessor(unittest.TestCase): "My sister loves her dog. My aunt also loves him.", [["My sister", "her"], ["My aunt", "him"]], "ft.onto.base_ontology.Sentence", + # Sentence-level coref resolution. ), ( "My sister loves her dog. My aunt also loves him.", @@ -65,7 +66,7 @@ class TestCoreferenceProcessor(unittest.TestCase): ), ) @unpack - def test_medical_notes(self, input_data, check_list, entry_type): + def test_inputs_and_entry_types(self, input_data, check_list, entry_type): self.pl = Pipeline[DataPack](enforce_consistency=True) self.pl.set_reader(StringReader()) self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) @@ -107,3 +108,61 @@ def load_module(string): output_list.append(mention_texts) self.assertEqual(output_list, check_list, f"input: {entry.text}") + + @data( + ( + "Deepika has a dog. She loves him. The movie star has always been fond of animals", + [["Deepika", "She", "him", "The movie star"]], + {}, + ), + ( + "Deepika has a dog. She loves him. 
The movie star has always been fond of animals", + [["Deepika", "She", "The movie star"], ["a dog", "him"]], + {"Deepika": ["woman", "actress"]}, + ), + ) + @unpack + def test_conv_dict(self, input_data, check_list, conv_dict): + entry_type = "ft.onto.base_ontology.Document" + + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "model": "use_default_model", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": conv_dict, + }, + }, + ) + + self.pl.initialize() + + def load_module(string): + path_str, module_str = string.rsplit(".", 1) + mod = importlib.import_module(path_str) + return getattr(mod, module_str) + + entry_type = load_module(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}") From 87b099686da89009160fb32120500942f108281f Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:42:53 +0800 Subject: [PATCH 49/63] black reformat --- fortex/health/processors/coreference_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index b1b11f81..4afc679e 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -64,9 +64,7 @@ def set_up(self, configs: Config): else: model = True cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe( - self.spacy_nlp, model=model, **cfg_inference - ) + neuralcoref.add_to_pipe(self.spacy_nlp, model=model, **cfg_inference) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) From 148e91891cfb36e433515ff7928d8485b695523e Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 19:53:41 +0800 Subject: [PATCH 50/63] fix pylint --- fortex/health/processors/coreference_processor.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 4afc679e..507e9614 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -92,9 +92,7 @@ def load_module(string): for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) - if not result._.has_coref: - continue - else: + if result._.has_coref: for cluster in result._.coref_clusters: mentions = [] @@ -107,11 +105,11 @@ def load_module(string): ) # TODO: remove assertion? - assert ( - mention.text == mention_text - ), f"The processor extracted mention {mention.text}" - f" which is different from the original mention" - f" {mention_text}. The offeset calculation is wrong." 
+ assert mention.text == mention_text, ( + f"The processor extracted mention {mention.text}" + " which is different from the original mention" + f" {mention_text}. The offset calculation is wrong." + ) mentions.append(mention) group = CoreferenceGroup(input_pack) From 2fefe18cc1615e38297ea9fef24e524912717956 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 20:51:19 +0800 Subject: [PATCH 51/63] update comment --- fortex/health/processors/coreference_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 507e9614..529e0b55 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -141,7 +141,7 @@ def default_configs(cls): containing a `single_model` and a `pairs_model`. See `NeuralCoref.Model` method in https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx - for reference of how the default model is defined. + for reference of how to define such a model. Default: `"use_default_model"`. - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters for the meaning of these parameters. From 662b32428de0d30bb5506adc48287e6dcd22f8b9 Mon Sep 17 00:00:00 2001 From: KiaLAN <1139479308@qq.com> Date: Fri, 1 Jul 2022 20:59:35 +0800 Subject: [PATCH 52/63] update comment --- fortex/health/processors/coreference_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 529e0b55..dd38989c 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -129,8 +129,9 @@ def default_configs(cls): `"ft.onto.base_ontology.Sentence"`. Default: `"ft.onto.base_ontology.Document"`. - `mention_type`: The type of members in `CoreferenceGroup`. + It can be set to `"ft.onto.base_ontology.EntityMention"` or + its subclasses. Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - It can also be set to `"ft.onto.base_ontology.EntityMention"`. - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`.
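Usage sketch — a minimal pipeline assembled from the unit tests in this series, not part of any patch. It exercises the `default_configs` keys documented in PATCH 51/52 above (`entry_type`, `mention_type`, `lang`, `cfg_inference`) and assumes that forte, forte.spacy, forte.health and NeuralCoref are installed as the setup.py patches describe, and that the `en_core_web_sm` spaCy model is available locally.

    from forte.data.data_pack import DataPack
    from forte.data.readers import StringReader
    from forte.pipeline import Pipeline
    from ft.onto.base_ontology import CoreferenceGroup, Document

    from fortex.spacy import SpacyProcessor
    from fortex.health.processors.coreference_processor import CoreferenceProcessor

    # Build the pipeline the same way the tests do: SpacyProcessor supplies the
    # basic annotations for the pack, while CoreferenceProcessor loads its own
    # spaCy model (configs.lang) and adds NeuralCoref to that model's pipe.
    pl = Pipeline[DataPack](enforce_consistency=True)
    pl.set_reader(StringReader())
    pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
    pl.add(
        CoreferenceProcessor(),
        {
            "entry_type": "ft.onto.base_ontology.Document",
            "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention",
            "lang": "en_core_web_sm",
            "cfg_inference": {
                "greedyness": 0.5,
                "max_dist": 50,
                "max_dist_match": 500,
                "blacklist": True,
                "conv_dict": None,
            },
        },
    )
    pl.initialize()

    # Each CoreferenceGroup collects the mentions NeuralCoref put in one cluster;
    # with the config above the members are MedicalEntityMention annotations.
    for pack in pl.process_dataset("My sister has a dog. She loves him."):
        for doc in pack.get(Document):
            for group in doc.get(CoreferenceGroup):
                members = sorted(group.get_members(), key=lambda m: m.begin)
                print([m.text for m in members])
                # expected per the tests: ['My sister', 'She'] and ['a dog', 'him']

As the conv_dict test in PATCH 48 above illustrates, passing a non-empty `conv_dict` (e.g. `{"Deepika": ["woman", "actress"]}`) changes how rare names are resolved: that test expects the clusters to split into [["Deepika", "She", "The movie star"], ["a dog", "him"]] instead of the single merged cluster produced with an empty `conv_dict`.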
From 6bcc58c195b9689480713383a2166b7fac4ca9f2 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 15:01:55 +0400 Subject: [PATCH 53/63] try one --- setup.py | 62 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index ed849bfa..9040a9c1 100644 --- a/setup.py +++ b/setup.py @@ -20,29 +20,61 @@ include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], + setup_requires=[ + "forte.spacy", # TODO: version + "cython>=0.25", + "pytest", + ], install_requires=[ + "enum34==1.1.10;python_version<'3.4'", + # "sortedcontainers>=2.1.0",` + # "numpy>=1.16.6", + # "jsonpickle>=1.4", + # "pyyaml>=5.4", + # "smart-open>=1.8.4", + # "typed_astunparse>=2.1.4", + # "funcsigs>=1.0.2", + # "typed_ast>=1.5.0", + # "jsonschema>=3.0.2", + # 'typing>=3.7.4;python_version<"3.5"', + # "typing-inspect>=0.6.0", + # 'dataclasses~=0.7;python_version<"3.7"', + # 'importlib-resources>=5.1.4;python`_version<"3.7"', + "asyml-utilities", + + "forte~=0.2.0", - "sortedcontainers==2.1.0", - "numpy>=1.16.6", - "jsonpickle==1.4", - "pyyaml==5.4", - "smart-open>=1.8.4", - "typed_astunparse==2.1.4", - "funcsigs==1.0.2", + # "sortedcontainers==2.1.0", + # "numpy>=1.16.6", + # "jsonpickle==1.4", + # "pyyaml==5.4", + # "smart-open>=1.8.4", + # "typed_astunparse==2.1.4", + # "funcsigs==1.0.2", "mypy_extensions==0.4.3", - "typed_ast>=1.4.3", - "jsonschema==3.0.2", + # "typed_ast>=1.4.3", + # "jsonschema==3.0.2", "texar-pytorch", - 'typing>=3.7.4;python_version<"3.5"', - "typing-inspect>=0.6.0", - 'dataclasses~=0.7;python_version<"3.7"', - 'importlib-resources==5.1.4;python_version<"3.7"', - 'dataclasses~=0.7;python_version<"3.7"', + # 'typing>=3.7.4;python_version<"3.5"', + # "typing-inspect>=0.6.0", + # 'dataclasses~=0.7;python_version<"3.7"', + # 'importlib-resources==5.1.4;python_version<"3.7"',S + # 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy + # # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy + "forte.spacy", # TODO: version "cython>=0.25", "pytest", + + "ddt", + "testfixtures", + "transformers==4.2.2", + "protobuf==3.19.4", + # It is annoying that if we install neuralcoref and spacy at the same + # time, neuralcoref will throw "Cython failed" during building. + # Therefore, we must install neuralcoref after spacy is installed. + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ From ae31363978ac1945c435a0ca4cc194777c781d51 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:00:54 +0400 Subject: [PATCH 54/63] use subprocess to install cython --- setup.py | 76 ++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/setup.py b/setup.py index 9040a9c1..6d60718d 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ import sys from pathlib import Path import setuptools +import subprocess +import os long_description = (Path(__file__).parent / "README.md").read_text() @@ -8,6 +10,25 @@ if sys.version_info < (3, 6): sys.exit("Python>=3.6 is required by forte-medical.") +# If we install neuralcoref and spacy at the same +# time, neuralcoref will throw "Cython failed" during building, +# which is because neuralcoref does not set cython as dependency +# properly. 
+# Therefore, we must install neuralcoref after cython and spacy +# are installed. +p = subprocess.call( + [ + sys.executable, + "-m", + "pip", + "install", + "cython>=0.25", + ], + env=os.environ, +) +if p != 0: + raise RuntimeError("Installing NeuralCoref dependencies failed.") + setuptools.setup( name="forte.health", version="0.1.0", @@ -20,61 +41,14 @@ include=["fortex.health", "ftx.*"], exclude=["scripts*", "examples*", "tests*"] ), namespace_packages=["fortex"], - setup_requires=[ - "forte.spacy", # TODO: version - "cython>=0.25", - "pytest", - ], install_requires=[ - "enum34==1.1.10;python_version<'3.4'", - # "sortedcontainers>=2.1.0",` - # "numpy>=1.16.6", - # "jsonpickle>=1.4", - # "pyyaml>=5.4", - # "smart-open>=1.8.4", - # "typed_astunparse>=2.1.4", - # "funcsigs>=1.0.2", - # "typed_ast>=1.5.0", - # "jsonschema>=3.0.2", - # 'typing>=3.7.4;python_version<"3.5"', - # "typing-inspect>=0.6.0", - # 'dataclasses~=0.7;python_version<"3.7"', - # 'importlib-resources>=5.1.4;python`_version<"3.7"', - "asyml-utilities", - - "forte~=0.2.0", - # "sortedcontainers==2.1.0", - # "numpy>=1.16.6", - # "jsonpickle==1.4", - # "pyyaml==5.4", - # "smart-open>=1.8.4", - # "typed_astunparse==2.1.4", - # "funcsigs==1.0.2", "mypy_extensions==0.4.3", - # "typed_ast>=1.4.3", - # "jsonschema==3.0.2", "texar-pytorch", - # 'typing>=3.7.4;python_version<"3.5"', - # "typing-inspect>=0.6.0", - # 'dataclasses~=0.7;python_version<"3.7"', - # 'importlib-resources==5.1.4;python_version<"3.7"',S - # 'dataclasses~=0.7;python_version<"3.7"', "fastapi==0.65.2", "uvicorn==0.14.0", - # # "spacy>=2.3.0, <=2.3.5", # will be installed by forte.spacy - "forte.spacy", # TODO: version + "forte.spacy", # TODO: version "cython>=0.25", - "pytest", - - "ddt", - "testfixtures", - "transformers==4.2.2", - "protobuf==3.19.4", - # It is annoying that if we install neuralcoref and spacy at the same - # time, neuralcoref will throw "Cython failed" during building. - # Therefore, we must install neuralcoref after spacy is installed. - "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], extras_require={ "test": [ @@ -82,10 +56,8 @@ "testfixtures", "transformers==4.2.2", "protobuf==3.19.4", - # It is annoying that if we install neuralcoref and spacy at the same - # time, neuralcoref will throw "Cython failed" during building. - # Therefore, we must install neuralcoref after spacy is installed. - # "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", + "pytest", + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], }, entry_points={ From 74ffad4719123adde254c3809bd6e39571181867 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:04:37 +0400 Subject: [PATCH 55/63] use subprocess to install cython and spacy --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6d60718d..a43a1b6b 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ # If we install neuralcoref and spacy at the same # time, neuralcoref will throw "Cython failed" during building, -# which is because neuralcoref does not set cython as dependency +# which is because neuralcoref does not set them as dependencies # properly. # Therefore, we must install neuralcoref after cython and spacy # are installed. 
@@ -22,6 +22,7 @@ "-m", "pip", "install", + "forte.spacy", # TODO: version "cython>=0.25", ], env=os.environ, From f3dc9e3cf0d455f62d8243fc8a70ae26fa50c50d Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:15:31 +0400 Subject: [PATCH 56/63] add extras_require for icd and coref --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index a43a1b6b..bc63e247 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,12 @@ "pytest", "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", ], + "icd_coding": [ + "transformers", + ], + "coreference": [ + "neuralcoref @ git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref", + ], }, entry_points={ "console_scripts": [ From 2581edbd2cd362bb22e308f1449544bb79601818 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:27:10 +0400 Subject: [PATCH 57/63] remove spacy and neuralcoref stage from main.yml --- .github/workflows/main.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 89bd3a6e..1d4181a7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -83,16 +83,6 @@ jobs: run: | pip install --use-feature=2020-resolver --progress-bar off .[test] - - name: Install Forte-wrappers-spacy - run: | - git clone https://github.com/asyml/forte-wrappers.git - cd forte-wrappers - pip install src/spacy - - - name: Install NeuralCoref - run: | - pip install git+https://git@github.com/huggingface/neuralcoref.git@4.0.0#egg=neuralcoref - - name: Test with pytest and run coverage run: | coverage run -m pytest tests/ From e09dfe72875e8466dfef455011a4c0b4129a91a3 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:41:52 +0400 Subject: [PATCH 58/63] replace load_module with get_class --- fortex/health/processors/coreference_processor.py | 10 +++------- .../processors/coreference_processor_test.py | 15 +++------------ 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index dd38989c..78be9aba 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -23,6 +23,7 @@ from forte.common.configuration import Config from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor +from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup @@ -78,16 +79,11 @@ def _process(self, input_pack: DataPack): Then we translate the output to `CoreferenceGroup`. 
""" - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - # Default: Document - entry_type = load_module(self.configs.entry_type) + entry_type = get_class(self.configs.entry_type) # Default: MedicalEntityMention - mention_type = load_module(self.configs.mention_type) + mention_type = get_class(self.configs.mention_type) for entry_specified in input_pack.get(entry_type=entry_type): result = self.spacy_nlp(entry_specified.text) diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index e8639c5e..31d885e1 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -22,6 +22,7 @@ from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline +from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup from fortex.health.processors.coreference_processor import ( @@ -89,12 +90,7 @@ def test_inputs_and_entry_types(self, input_data, check_list, entry_type): self.pl.initialize() - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - - entry_type = load_module(entry_type) + entry_type = get_class(entry_type) for pack in self.pl.process_dataset(input_data): output_list = [] @@ -147,12 +143,7 @@ def test_conv_dict(self, input_data, check_list, conv_dict): self.pl.initialize() - def load_module(string): - path_str, module_str = string.rsplit(".", 1) - mod = importlib.import_module(path_str) - return getattr(mod, module_str) - - entry_type = load_module(entry_type) + entry_type = get_class(entry_type) for pack in self.pl.process_dataset(input_data): output_list = [] From c925f5bf1285632636759bac2e4decac3d2ca35c Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 16:44:16 +0400 Subject: [PATCH 59/63] remove 'model' argument --- .../health/processors/coreference_processor.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 78be9aba..5a046676 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -60,12 +60,8 @@ def set_up(self, configs: Config): "haven't called the initialization function." ) - if configs.model != "use_default_model": - model = configs.model - else: - model = True cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe(self.spacy_nlp, model=model, **cfg_inference) + neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -131,15 +127,6 @@ def default_configs(cls): - `lang`: The SpaCy pipeline to be used. The pipeline does the preprocessing steps for NeuralCoref. Default: `"en_core_web_sm"`. - - `model`: the neural net model to be used by NeuralCoref. If set - to `"use_default_model"`, a pre-trained neural net will be - downloaded and cached. - If set to your customized model, the model needs to be a tuple - containing a `single_model` and a `pairs_model`. 
- See `NeuralCoref.Model` method in - https://github.com/huggingface/neuralcoref/blob/master/neuralcoref/neuralcoref.pyx - for reference of how to define such a model. - Default: `"use_default_model"`. - `cfg_inference`: A dict containing the inference configs of NeuralCoref. See `get_default_cfg_inference` for default values, and see https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters @@ -151,7 +138,6 @@ def default_configs(cls): "entry_type": "ft.onto.base_ontology.Document", "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": cls.get_default_cfg_inference(), } From f6c1a8b6bf5899b3d045a92c006b8fa6c3dd3e9f Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 9 Jul 2022 17:38:32 +0400 Subject: [PATCH 60/63] fix rebundunt import and args --- fortex/health/processors/coreference_processor.py | 1 - tests/forte_medical/processors/coreference_processor_test.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/fortex/health/processors/coreference_processor.py b/fortex/health/processors/coreference_processor.py index 5a046676..89c0dc51 100644 --- a/fortex/health/processors/coreference_processor.py +++ b/fortex/health/processors/coreference_processor.py @@ -15,7 +15,6 @@ Coreference Processor """ from typing import Dict, Set -import importlib import neuralcoref diff --git a/tests/forte_medical/processors/coreference_processor_test.py b/tests/forte_medical/processors/coreference_processor_test.py index 31d885e1..62d38c38 100644 --- a/tests/forte_medical/processors/coreference_processor_test.py +++ b/tests/forte_medical/processors/coreference_processor_test.py @@ -15,7 +15,6 @@ Unit tests for CoreferenceProcessor """ -import importlib import unittest from ddt import data, ddt, unpack @@ -77,7 +76,6 @@ def test_inputs_and_entry_types(self, input_data, check_list, entry_type): "entry_type": entry_type, "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": { "greedyness": 0.5, "max_dist": 50, @@ -130,7 +128,6 @@ def test_conv_dict(self, input_data, check_list, conv_dict): "entry_type": entry_type, "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", "lang": "en_core_web_sm", - "model": "use_default_model", "cfg_inference": { "greedyness": 0.5, "max_dist": 50, From 7b31bebc9eb69750b9fdb25207bf54e2b8dabdab Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:07:43 +0400 Subject: [PATCH 61/63] fix merge conflict --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7c064ccb..d654ac34 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ ), namespace_packages=["fortex"], install_requires=[ + "forte~=0.2.0", "forte.spacy", # TODO: version "cython>=0.25", ], From 5ef774a97f746cd20492404ffc212a276a725f87 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:16:43 +0400 Subject: [PATCH 62/63] fix merge conflict --- .../processors/coreference_processor_test.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 tests/fortex/health/processors/coreference_processor_test.py diff --git a/tests/fortex/health/processors/coreference_processor_test.py b/tests/fortex/health/processors/coreference_processor_test.py new file mode 100644 index 00000000..89c0dc51 --- /dev/null +++ b/tests/fortex/health/processors/coreference_processor_test.py @@ -0,0 +1,209 @@ +# 
Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Coreference Processor +""" +from typing import Dict, Set + +import neuralcoref + +from forte.common import Resources, ProcessExecutionException +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor +from forte.utils import get_class + +from ft.onto.base_ontology import CoreferenceGroup + +from fortex.spacy.spacy_processors import load_lang_model + +__all__ = [ + "CoreferenceProcessor", +] + + +class CoreferenceProcessor(PackProcessor): + r""" + Implementation of this CoreferenceProcessor has been based on + huggingface NeuralCoref. You can find more details in the original repo. + + Note that the NeuralCoref package from PyPI uses a dated spaCy + version (2.1), which can cause segmentation fault with the spaCy + we use (2.3). Please install NeuralCoref by building from source. + + Referred repository link: + https://github.com/huggingface/neuralcoref + """ + + def __init__(self): + super().__init__() + self.spacy_nlp = None + + def set_up(self, configs: Config): + self.spacy_nlp = load_lang_model(configs.lang) + + if self.spacy_nlp is None: + raise ProcessExecutionException( + "The SpaCy pipeline is not initialized, maybe you " + "haven't called the initialization function." + ) + + cfg_inference = configs.cfg_inference + neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.set_up(configs) + + def _process(self, input_pack: DataPack): + r""" + Coreference resolution is done by + a spaCy pipeline with `NeuralCoref` added. + + Then we translate the output to `CoreferenceGroup`. + """ + + # Default: Document + entry_type = get_class(self.configs.entry_type) + + # Default: MedicalEntityMention + mention_type = get_class(self.configs.mention_type) + + for entry_specified in input_pack.get(entry_type=entry_type): + result = self.spacy_nlp(entry_specified.text) + + if result._.has_coref: + for cluster in result._.coref_clusters: + + mentions = [] + for mention in cluster.mentions: + mention_text = mention.text + mention = mention_type( + input_pack, + mention.start_char + entry_specified.begin, + mention.end_char + entry_specified.begin, + ) + + # TODO: remove assertion? + assert mention.text == mention_text, ( + f"The processor extracted mention {mention.text}" + " which is different from the original mention" + f" {mention_text}. The offeset calculation is wrong." + ) + mentions.append(mention) + + group = CoreferenceGroup(input_pack) + group.add_members(mentions) + + @classmethod + def default_configs(cls): + r""" + This defines a basic config structure for `CoreferenceProcessor`. + + Following are the keys for this dictionary: + - `entry_type`: Input entry type. You can change the context of + coreference resolution by setting this parameter. 
For example, + if you want to do coreference resolution within documents, set + it to `"ft.onto.base_ontology.Document"`. If you want to do + coreference resolution within sentences, set it to + `"ft.onto.base_ontology.Sentence"`. + Default: `"ft.onto.base_ontology.Document"`. + - `mention_type`: The type of members in `CoreferenceGroup`. + It can be set to `"ft.onto.base_ontology.EntityMention"` or + its subclasses. + Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. + - `lang`: The SpaCy pipeline to be used. The pipeline does the + preprocessing steps for NeuralCoref. + Default: `"en_core_web_sm"`. + - `cfg_inference`: A dict containing the inference configs of + NeuralCoref. See `get_default_cfg_inference` for default values, and see + https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters + for the meaing of these parameters. + + Returns: A dictionary with the default config for this processor. + """ + return { + "entry_type": "ft.onto.base_ontology.Document", + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": cls.get_default_cfg_inference(), + } + + @classmethod + def get_default_cfg_inference(cls): + """ + This defines the default inference config of NeuralCoref. + + Following are the keys for this dictionary: + - `greedyness` (`float`): A number between 0 and 1 determining + how greedy the model is about making coreference decisions + (more greedy means more coreference links). + Default: `0.5`. + - `max_dist` (`int`): How many mentions back to look when + considering possible antecedents of the current mention. + Decreasing the value will cause the system to run faster + but less accurately. + Default: `50`. + - `max_dist_match` (`int`): The system will consider linking + the current mention + to a preceding one further than max_dist away if they share + a noun or proper noun. In this case, it looks max_dist_match + away instead. + Default: `500`. + - `blacklist` (`bool`): Should the system resolve coreferences + for pronouns in the following list: ["i", "me", "my", "you", "your"]. + Default `True`. + - `conv_dict` (`dict(str, list(str))`): A conversion dictionary + that you can use + to replace the embeddings of rare words (keys) by an average + of the embeddings of a list of common words (values). + Ex: `conv_dict={"Angela": ["woman", "girl"]}` + will help resolving coreferences for Angela by using the + embeddings for the more common woman and girl instead of the + embedding of Angela. + This currently only works for single words (not for words groups). + Default: `None`. + + Returns: A dictionary with the default inference config of NeuralCoref. + """ + return { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": None, + } + + def expected_types_and_attributes(self): + r""" + Method to add user specified expected type which would be checked + before running the processor if the pipeline is initialized with + `enforce_consistency=True` or + :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for + the pipeline. + """ + # return {self.configs.entry_type: {"text"}} # TODO: fix this + return {self.configs.entry_type: set()} + + def record(self, record_meta: Dict[str, Set[str]]): + r""" + Method to add output type record of `CoreferenceProcessor` which + is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute + `members` to :attr:`forte.data.data_pack.Meta.record`. 
+ + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} From ffe88f64d2401e1a747caa1843d4e900c9066154 Mon Sep 17 00:00:00 2001 From: kialan <1139479308@qq.com> Date: Sat, 16 Jul 2022 12:48:52 +0400 Subject: [PATCH 63/63] fix merge conflict: restore coref test --- .../processors/coreference_processor_test.py | 323 ++++++++---------- 1 file changed, 135 insertions(+), 188 deletions(-) diff --git a/tests/fortex/health/processors/coreference_processor_test.py b/tests/fortex/health/processors/coreference_processor_test.py index 89c0dc51..62d38c38 100644 --- a/tests/fortex/health/processors/coreference_processor_test.py +++ b/tests/fortex/health/processors/coreference_processor_test.py @@ -12,198 +12,145 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Coreference Processor +Unit tests for CoreferenceProcessor """ -from typing import Dict, Set -import neuralcoref +import unittest +from ddt import data, ddt, unpack -from forte.common import Resources, ProcessExecutionException -from forte.common.configuration import Config from forte.data.data_pack import DataPack -from forte.processors.base import PackProcessor +from forte.data.readers import StringReader +from forte.pipeline import Pipeline from forte.utils import get_class from ft.onto.base_ontology import CoreferenceGroup - -from fortex.spacy.spacy_processors import load_lang_model - -__all__ = [ - "CoreferenceProcessor", -] - - -class CoreferenceProcessor(PackProcessor): - r""" - Implementation of this CoreferenceProcessor has been based on - huggingface NeuralCoref. You can find more details in the original repo. - - Note that the NeuralCoref package from PyPI uses a dated spaCy - version (2.1), which can cause segmentation fault with the spaCy - we use (2.3). Please install NeuralCoref by building from source. - - Referred repository link: - https://github.com/huggingface/neuralcoref - """ - - def __init__(self): - super().__init__() - self.spacy_nlp = None - - def set_up(self, configs: Config): - self.spacy_nlp = load_lang_model(configs.lang) - - if self.spacy_nlp is None: - raise ProcessExecutionException( - "The SpaCy pipeline is not initialized, maybe you " - "haven't called the initialization function." - ) - - cfg_inference = configs.cfg_inference - neuralcoref.add_to_pipe(self.spacy_nlp, model=True, **cfg_inference) - - def initialize(self, resources: Resources, configs: Config): - super().initialize(resources, configs) - self.set_up(configs) - - def _process(self, input_pack: DataPack): - r""" - Coreference resolution is done by - a spaCy pipeline with `NeuralCoref` added. - - Then we translate the output to `CoreferenceGroup`. - """ - - # Default: Document - entry_type = get_class(self.configs.entry_type) - - # Default: MedicalEntityMention - mention_type = get_class(self.configs.mention_type) - - for entry_specified in input_pack.get(entry_type=entry_type): - result = self.spacy_nlp(entry_specified.text) - - if result._.has_coref: - for cluster in result._.coref_clusters: - - mentions = [] - for mention in cluster.mentions: - mention_text = mention.text - mention = mention_type( - input_pack, - mention.start_char + entry_specified.begin, - mention.end_char + entry_specified.begin, - ) - - # TODO: remove assertion? 
- assert mention.text == mention_text, ( - f"The processor extracted mention {mention.text}" - " which is different from the original mention" - f" {mention_text}. The offeset calculation is wrong." - ) - mentions.append(mention) - - group = CoreferenceGroup(input_pack) - group.add_members(mentions) - - @classmethod - def default_configs(cls): - r""" - This defines a basic config structure for `CoreferenceProcessor`. - - Following are the keys for this dictionary: - - `entry_type`: Input entry type. You can change the context of - coreference resolution by setting this parameter. For example, - if you want to do coreference resolution within documents, set - it to `"ft.onto.base_ontology.Document"`. If you want to do - coreference resolution within sentences, set it to - `"ft.onto.base_ontology.Sentence"`. - Default: `"ft.onto.base_ontology.Document"`. - - `mention_type`: The type of members in `CoreferenceGroup`. - It can be set to `"ft.onto.base_ontology.EntityMention"` or - its subclasses. - Default: `"ftx.medical.clinical_ontology.MedicalEntityMention"`. - - `lang`: The SpaCy pipeline to be used. The pipeline does the - preprocessing steps for NeuralCoref. - Default: `"en_core_web_sm"`. - - `cfg_inference`: A dict containing the inference configs of - NeuralCoref. See `get_default_cfg_inference` for default values, and see - https://github.com/huggingface/neuralcoref/blob/master/README.md#parameters - for the meaing of these parameters. - - Returns: A dictionary with the default config for this processor. - """ - return { - "entry_type": "ft.onto.base_ontology.Document", - "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", - "lang": "en_core_web_sm", - "cfg_inference": cls.get_default_cfg_inference(), - } - - @classmethod - def get_default_cfg_inference(cls): - """ - This defines the default inference config of NeuralCoref. - - Following are the keys for this dictionary: - - `greedyness` (`float`): A number between 0 and 1 determining - how greedy the model is about making coreference decisions - (more greedy means more coreference links). - Default: `0.5`. - - `max_dist` (`int`): How many mentions back to look when - considering possible antecedents of the current mention. - Decreasing the value will cause the system to run faster - but less accurately. - Default: `50`. - - `max_dist_match` (`int`): The system will consider linking - the current mention - to a preceding one further than max_dist away if they share - a noun or proper noun. In this case, it looks max_dist_match - away instead. - Default: `500`. - - `blacklist` (`bool`): Should the system resolve coreferences - for pronouns in the following list: ["i", "me", "my", "you", "your"]. - Default `True`. - - `conv_dict` (`dict(str, list(str))`): A conversion dictionary - that you can use - to replace the embeddings of rare words (keys) by an average - of the embeddings of a list of common words (values). - Ex: `conv_dict={"Angela": ["woman", "girl"]}` - will help resolving coreferences for Angela by using the - embeddings for the more common woman and girl instead of the - embedding of Angela. - This currently only works for single words (not for words groups). - Default: `None`. - - Returns: A dictionary with the default inference config of NeuralCoref. 
- """ - return { - "greedyness": 0.5, - "max_dist": 50, - "max_dist_match": 500, - "blacklist": True, - "conv_dict": None, - } - - def expected_types_and_attributes(self): - r""" - Method to add user specified expected type which would be checked - before running the processor if the pipeline is initialized with - `enforce_consistency=True` or - :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for - the pipeline. - """ - # return {self.configs.entry_type: {"text"}} # TODO: fix this - return {self.configs.entry_type: set()} - - def record(self, record_meta: Dict[str, Set[str]]): - r""" - Method to add output type record of `CoreferenceProcessor` which - is `"ftx.medical.clinical_ontology.CoreferenceGroup"` with attribute - `members` to :attr:`forte.data.data_pack.Meta.record`. - - Args: - record_meta: the field in the datapack for type record that need to - fill in for consistency checking. - """ - record_meta["ft.onto.base_ontology.CoreferenceGroup"] = {"members"} +from fortex.health.processors.coreference_processor import ( + CoreferenceProcessor, +) +from fortex.spacy import SpacyProcessor + + +@ddt +class TestCoreferenceProcessor(unittest.TestCase): + @data( + ( + "ADDENDUM:\n" + "RADIOLOGIC STUDIES: Radiologic studies also included " + "a chest CT, which confirmed cavitary lesions " + "in the left lung apex consistent with infectious process/tuberculosis.\n" + "This also moderate-sized left pleural effusion.\n" + "HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, " + "but old infarction consistent with past medical history.\n" + "ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum " + "most likely secondary to steoporosis.\n" + "These can be followed by repeat imaging as an outpatient.", + [["HEAD CT", "Head CT", "Abdominal CT"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister has a dog. She loves him.", + [["My sister", "She"], ["a dog", "him"]], + "ft.onto.base_ontology.Document", + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["My aunt", "him"]], + "ft.onto.base_ontology.Sentence", + # Sentence-level coref resolution. + ), + ( + "My sister loves her dog. My aunt also loves him.", + [["My sister", "her"], ["her dog", "him"]], + "ft.onto.base_ontology.Document", + # Document-level coref is different from sentence-level. + ), + ) + @unpack + def test_inputs_and_entry_types(self, input_data, check_list, entry_type): + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": None, + }, + }, + ) + + self.pl.initialize() + + entry_type = get_class(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}") + + @data( + ( + "Deepika has a dog. She loves him. 
The movie star has always been fond of animals", + [["Deepika", "She", "him", "The movie star"]], + {}, + ), + ( + "Deepika has a dog. She loves him. The movie star has always been fond of animals", + [["Deepika", "She", "The movie star"], ["a dog", "him"]], + {"Deepika": ["woman", "actress"]}, + ), + ) + @unpack + def test_conv_dict(self, input_data, check_list, conv_dict): + entry_type = "ft.onto.base_ontology.Document" + + self.pl = Pipeline[DataPack](enforce_consistency=True) + self.pl.set_reader(StringReader()) + self.pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"}) + self.pl.add( + CoreferenceProcessor(), + { + "entry_type": entry_type, + "mention_type": "ftx.medical.clinical_ontology.MedicalEntityMention", + "lang": "en_core_web_sm", + "cfg_inference": { + "greedyness": 0.5, + "max_dist": 50, + "max_dist_match": 500, + "blacklist": True, + "conv_dict": conv_dict, + }, + }, + ) + + self.pl.initialize() + + entry_type = get_class(entry_type) + + for pack in self.pl.process_dataset(input_data): + output_list = [] + + for entry in pack.get(entry_type): + for group in entry.get(CoreferenceGroup): + members = [member for member in group.get_members()] + members = sorted(members, key=lambda x: x.begin) + + mention_texts = [member.text for member in members] + output_list.append(mention_texts) + + self.assertEqual(output_list, check_list, f"input: {entry.text}")
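(Usage sketch distilled from the tests above; a guess at typical wiring rather than part of the patch series. It assumes forte, forte.spacy, and the "coreference" extra with NeuralCoref are installed as arranged in setup.py, and it reuses only classes and calls that appear in the test file.)

# Minimal pipeline mirroring the tests: read a string, run spaCy preprocessing,
# then CoreferenceProcessor with its default (document-level) configs.
from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from ft.onto.base_ontology import CoreferenceGroup
from fortex.spacy import SpacyProcessor
from fortex.health.processors.coreference_processor import CoreferenceProcessor

pl = Pipeline[DataPack]()
pl.set_reader(StringReader())
pl.add(SpacyProcessor(), config={"lang": "en_core_web_sm"})
pl.add(CoreferenceProcessor())  # defaults: Document context, MedicalEntityMention members
pl.initialize()

for pack in pl.process_dataset("My sister has a dog. She loves him."):
    # Each CoreferenceGroup holds the mentions that corefer, e.g.
    # ["My sister", "She"] and ["a dog", "him"] for the sentence above.
    for group in pack.get(CoreferenceGroup):
        print([member.text for member in group.get_members()])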