Skip to content

Commit

Permalink
Merge pull request #21 from Pandora-Intelligence/#17-duplicate-loggign
Browse files Browse the repository at this point in the history
#17 duplicate logging - #19 handling of error within missing tokens in model
  • Loading branch information
davidberenstein1957 authored Oct 9, 2022
2 parents 1659eac + c23ad19 commit 3da123f
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 65 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,5 @@ dmypy.json
*.model.*
/word2vec.wordvectors
/word2vec.wordvectors.vectors.npy
matching_patterns.json
matching_patterns.json
test.py
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ text = """

nlp = spacy.load("en_core_web_lg", disable=["ner"])
# ent_score for entity confidence scoring
nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True})
nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "verbose": True})
doc = nlp(text)

options = {"colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"},
Expand All @@ -70,14 +70,14 @@ displacy.render(doc, style="ent", options=options)
## Matching Pattern Rules

### Customizing Matching Pattern Rules
Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized
via the config passed to the spaCy pipeline.
Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline.

- `exclude_pos`: A list of POS tags to be excluded from the rule-based match.
- `exclude_dep`: A list of dependencies to be excluded from the rule-based match.
- `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity.
- `case_sensitive`: Whether to match the case of the words in the text.


### Analyze Matching Pattern Rules
To motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./main_patterns.json`. This behaviour can be changed by using the `json_path` variable via the config passed to the spaCy pipeline.

Expand Down
3 changes: 3 additions & 0 deletions concise_concepts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"include_compound_words": False,
"case_sensitive": False,
"json_path": "./matching_patterns.json",
"verbose": True,
},
)
def make_concise_concepts(
Expand All @@ -45,6 +46,7 @@ def make_concise_concepts(
include_compound_words: bool,
case_sensitive: bool,
json_path: str,
verbose: bool,
):
return Conceptualizer(
nlp=nlp,
Expand All @@ -59,4 +61,5 @@ def make_concise_concepts(
include_compound_words=include_compound_words,
case_sensitive=case_sensitive,
json_path=json_path,
verbose=verbose,
)
57 changes: 38 additions & 19 deletions concise_concepts/conceptualizer/Conceptualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
include_compound_words: bool = False,
case_sensitive: bool = False,
json_path: str = "./matching_patterns.json",
verbose: bool = True,
):
"""
The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and
Expand All @@ -65,6 +66,8 @@ def __init__(
if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional)
:param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional)
"""
self.verbose = verbose
self.log_cache = {"key": list(), "word": list(), "word_key": list()}
if Span.has_extension("ent_score"):
Span.remove_extension("ent_score")
if ent_score:
Expand Down Expand Up @@ -102,7 +105,7 @@ def run(self):
self.check_validity_path()
self.determine_topn()
self.set_gensim_model()
self.verify_data()
self.verify_data(self.verbose)
self.expand_concepts()
self.verify_data(verbose=False)
# settle words around overlapping concepts
Expand Down Expand Up @@ -196,20 +199,31 @@ def verify_data(self, verbose: bool = True):
verified_values = []
if not self.check_presence_vocab(key):
if verbose:
logger.warning(f"key ´{key}´ not present in vector model")
if key not in self.log_cache["key"]:
logger.warning(f"key ´{key}´ not present in vector model")
self.log_cache["key"].append(key)
for word in value:
if self.check_presence_vocab(word):
verified_values.append(self.check_presence_vocab(word))
else:
if verbose:
logger.warning(
f"word ´{word}´ from key ´{key}´ not present in vector"
" model"
)
if word not in self.log_cache["word"]:
logger.warning(
f"word ´{word}´ from key ´{key}´ not present in vector"
" model"
)
self.log_cache["word"].append(word)
verified_data[key] = verified_values
assert len(
verified_values
), f"None of the entries for key {key} are present in the vector model"
if not len(verified_values):
msg = (
f"None of the entries for key {key} are present in the vector"
" model. "
)
if self.check_presence_vocab(key):
logger.warning(msg + f"Using {key} as word to expand over instead.")
verified_data[key] = self.check_presence_vocab(key)
else:
raise Exception(msg)
self.data = deepcopy(verified_data)
self.original_data = deepcopy(self.data)

Expand Down Expand Up @@ -453,18 +467,23 @@ def assign_score_to_entities(self, doc: Doc):
ent._.ent_score = self.kv.n_similarity(entity, concept)
else:
ent._.ent_score = 0
logger.warning(
f"Entity ´{ent.text}´ and/or label ´{concept}´ not found in"
" vector model. Nothing to compare to, so setting"
" ent._.ent_score to 0."
)
if self.verbose:
if f"{ent.text}_{concept}" not in self.log_cache["key_word"]:
logger.warning(
f"Entity ´{ent.text}´ and/or label ´{concept}´ not"
" found in vector model. Nothing to compare to, so"
" setting ent._.ent_score to 0."
)
self.log_cache["key_word"].append(f"{ent.text}_{concept}")
else:
ent._.ent_score = 0

logger.warning(
f"Entity ´{ent.text}´ not found in vector model. Nothing to compare"
" to, so setting ent._.ent_score to 0."
)
if self.verbose:
if ent.text not in self.log_cache["word"]:
logger.warning(
f"Entity ´{ent.text}´ not found in vector model. Nothing to"
" compare to, so setting ent._.ent_score to 0."
)
self.log_cache["word"].append(ent.text)
doc.ents = ents
return doc

Expand Down
25 changes: 25 additions & 0 deletions concise_concepts/examples/example_gensim_custom_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""Example: wiring concise-concepts up to a custom, locally trained gensim model."""
import spacy
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import concise_concepts  # noqa: F401

# Few-shot seeds: each key is a concept label, each value a list of example words.
data = {"human": ["trees"], "interface": ["computer"]}

text = (
    "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is"
    " an extremely slow mobile phone and takes ages to open and navigate. Forget about"
    " heavy use, it can't handle normal regular use. I made a huge mistake but pls"
    " don't buy this mobile. It's only a few months and I am thinking to change it. Its"
    " dam SLOW SLOW SLOW."
)

# Train a tiny Word2Vec model on gensim's bundled toy corpus and persist it,
# so the pipeline below can load it back from disk by path.
model_path = "word2vec.model"
model = Word2Vec(
    sentences=common_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save(model_path)

# A blank English pipeline is enough; concise_concepts reads vectors from model_path.
nlp = spacy.blank("en")
nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "concise-concepts"
version = "0.6.2.1"
version = "0.6.3"
description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!"
authors = ["David Berenstein <[email protected]>"]
license = "MIT"
Expand Down
39 changes: 0 additions & 39 deletions test.py

This file was deleted.

8 changes: 6 additions & 2 deletions tests/test_model_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@ def test_gensim_default():
from concise_concepts.examples import example_gensim_default # noqa: F401


def test_gensim_custom():
from concise_concepts.examples import example_gensim_custom # noqa: F401
def test_gensim_custom_path():
from concise_concepts.examples import example_gensim_custom_path # noqa: F401


def test_gensim_custom_model():
from concise_concepts.examples import example_gensim_custom_model # noqa: F401

0 comments on commit 3da123f

Please sign in to comment.