From 498677e39b24965b49149fc3f6ef7142b4037b8a Mon Sep 17 00:00:00 2001 From: david Date: Sun, 9 Oct 2022 10:54:54 +0200 Subject: [PATCH 1/2] resolved #17 - removed duplicate logging of same missing keys - added verbose options to completely disable logging --- .gitignore | 3 +- README.md | 6 +-- concise_concepts/__init__.py | 3 ++ .../conceptualizer/Conceptualizer.py | 44 ++++++++++++------- .../examples/example_gensim_custom_model.py | 25 +++++++++++ ...ustom.py => example_gensim_custom_path.py} | 0 pyproject.toml | 2 +- test.py | 39 ---------------- tests/test_model_import.py | 8 +++- 9 files changed, 68 insertions(+), 62 deletions(-) create mode 100644 concise_concepts/examples/example_gensim_custom_model.py rename concise_concepts/examples/{example_gensim_custom.py => example_gensim_custom_path.py} (100%) delete mode 100644 test.py diff --git a/.gitignore b/.gitignore index f60b7e8..2c57ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,5 @@ dmypy.json *.model.* /word2vec.wordvectors /word2vec.wordvectors.vectors.npy -matching_patterns.json \ No newline at end of file +matching_patterns.json +test.py \ No newline at end of file diff --git a/README.md b/README.md index b03804b..507749f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ text = """ nlp = spacy.load("en_core_web_lg", disable=["ner"]) # ent_score for entity condifence scoring -nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True}) +nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "verbose": True}) doc = nlp(text) options = {"colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"}, displacy.render(doc, style="ent", options=options) ## Matching Pattern Rules ### Customizing Matching Pattern Rules -Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized -via the config passed to the spaCy pipeline. 
+Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline. - `exclude_pos`: A list of POS tags to be excluded from the rule-based match. - `exclude_dep`: A list of dependencies to be excluded from the rule-based match. - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity. - `case_sensitive`: Whether to match the case of the words in the text. + ### Analyze Matching Pattern Rules To motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./main_patterns.json`. This behaviour can be changed by using the `json_path` variable via the config passed to the spaCy pipeline. diff --git a/concise_concepts/__init__.py b/concise_concepts/__init__.py index d903b98..e1a7591 100644 --- a/concise_concepts/__init__.py +++ b/concise_concepts/__init__.py @@ -30,6 +30,7 @@ "include_compound_words": False, "case_sensitive": False, "json_path": "./matching_patterns.json", + "verbose": True, }, ) def make_concise_concepts( @@ -45,6 +46,7 @@ def make_concise_concepts( include_compound_words: bool, case_sensitive: bool, json_path: str, + verbose: bool, ): return Conceptualizer( nlp=nlp, @@ -59,4 +61,5 @@ def make_concise_concepts( include_compound_words=include_compound_words, case_sensitive=case_sensitive, json_path=json_path, + verbose=verbose, ) diff --git a/concise_concepts/conceptualizer/Conceptualizer.py b/concise_concepts/conceptualizer/Conceptualizer.py index 2863690..28949b2 100644 --- a/concise_concepts/conceptualizer/Conceptualizer.py +++ b/concise_concepts/conceptualizer/Conceptualizer.py @@ -41,6 +41,7 @@ def __init__( include_compound_words: bool = False, case_sensitive: bool = False, json_path: str = "./matching_patterns.json", + verbose: bool = True, ): """ The function takes 
in a dictionary of words and their synonyms, and then creates a new dictionary of words and @@ -65,6 +66,8 @@ def __init__( if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional) :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional) """ + self.verbose = verbose + self.log_cache = {"key": list(), "word": list(), "word_key": list()} if Span.has_extension("ent_score"): Span.remove_extension("ent_score") if ent_score: @@ -102,7 +105,7 @@ def run(self): self.check_validity_path() self.determine_topn() self.set_gensim_model() - self.verify_data() + self.verify_data(self.verbose) self.expand_concepts() self.verify_data(verbose=False) # settle words around overlapping concepts @@ -196,16 +199,20 @@ def verify_data(self, verbose: bool = True): verified_values = [] if not self.check_presence_vocab(key): if verbose: - logger.warning(f"key ´{key}´ not present in vector model") + if key not in self.log_cache["key"]: + logger.warning(f"key ´{key}´ not present in vector model") + self.log_cache["key"].append(key) for word in value: if self.check_presence_vocab(word): verified_values.append(self.check_presence_vocab(word)) else: if verbose: - logger.warning( - f"word ´{word}´ from key ´{key}´ not present in vector" - " model" - ) + if word not in self.log_cache["word"]: + logger.warning( + f"word ´{word}´ from key ´{key}´ not present in vector" + " model" + ) + self.log_cache["word"].append(word) verified_data[key] = verified_values assert len( verified_values @@ -453,18 +460,23 @@ def assign_score_to_entities(self, doc: Doc): ent._.ent_score = self.kv.n_similarity(entity, concept) else: ent._.ent_score = 0 - logger.warning( - f"Entity ´{ent.text}´ and/or label ´{concept}´ not found in" - " vector model. Nothing to compare to, so setting" - " ent._.ent_score to 0." 
- ) + if self.verbose: + if f"{ent.text}_{concept}" not in self.log_cache["word_key"]: + logger.warning( + f"Entity ´{ent.text}´ and/or label ´{concept}´ not" + " found in vector model. Nothing to compare to, so" + " setting ent._.ent_score to 0." + ) + self.log_cache["word_key"].append(f"{ent.text}_{concept}") else: ent._.ent_score = 0 - - logger.warning( - f"Entity ´{ent.text}´ not found in vector model. Nothing to compare" - " to, so setting ent._.ent_score to 0." - ) + if self.verbose: + if ent.text not in self.log_cache["word"]: + logger.warning( + f"Entity ´{ent.text}´ not found in vector model. Nothing to" + " compare to, so setting ent._.ent_score to 0." + ) + self.log_cache["word"].append(ent.text) doc.ents = ents return doc diff --git a/concise_concepts/examples/example_gensim_custom_model.py b/concise_concepts/examples/example_gensim_custom_model.py new file mode 100644 index 0000000..34a9474 --- /dev/null +++ b/concise_concepts/examples/example_gensim_custom_model.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +import spacy +from gensim.models import Word2Vec +from gensim.test.utils import common_texts + +import concise_concepts + +data = {"human": ["trees"], "interface": ["computer"]} + +text = ( + "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is" + " an extremely slow mobile phone and takes ages to open and navigate. Forget about" + " heavy use, it can't handle normal regular use. I made a huge mistake but pls" + " don't buy this mobile. It's only a few months and I am thinking to change it. Its" + " dam SLOW SLOW SLOW." 
+) + +model = Word2Vec( + sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4 +) +model.save("word2vec.model") +model_path = "word2vec.model" + +nlp = spacy.blank("en") +nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) diff --git a/concise_concepts/examples/example_gensim_custom.py b/concise_concepts/examples/example_gensim_custom_path.py similarity index 100% rename from concise_concepts/examples/example_gensim_custom.py rename to concise_concepts/examples/example_gensim_custom_path.py diff --git a/pyproject.toml b/pyproject.toml index a8b95f9..ad5640e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "concise-concepts" -version = "0.6.2.1" +version = "0.6.3" description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!" authors = ["David Berenstein "] license = "MIT" diff --git a/test.py b/test.py deleted file mode 100644 index 28b44e4..0000000 --- a/test.py +++ /dev/null @@ -1,39 +0,0 @@ -import spacy - -import concise_concepts - -data = { - "performance": ["mileage", "speed", "fuel", "capacity", "transmisson"], - "engine": ["turbo", "cylinder", "gear", "engine"], - "brakes": ["disc", "suspension", "brakes"], - "dimensions": ["length", "width", "height", "seating", "doors"], - "comfort": [ - "steering", - "heater", - "air", - "accessory", - "headrest", - "charger", - "luxury", - ], - "entertainment": ["radio", "speaker", "phone", "touch"], - "safety": ["alarm", "theft", "warning", "safety"], -} - -text = """ -XUV 700 is first in class luxury SUV brought to you by Mahindra. -The 13-inch instrument cluster and attached infotainment panel -are just the best in class and add a classy look to the SUV. -The XUV 700 is a beast of a car with that panoramic sunroof, -and leather upholstery on the seats and doesn't compromise on the luxury of the passengers. 
-The adaptive cruise control is a top-end feature clearly not offered in any -other cars of this segment at this price point. Have always been a fan of the XUV 500 -but the successor is a beast with the luxury of its own kind the road experience is pretty smooth -and the engine performs really well. -""" -model_path = "glove-wiki-gigaword-300" -nlp = spacy.load("en_core_web_lg", disable=["ner"]) -nlp.add_pipe( - "concise_concepts", - config={"data": data, "model_path": model_path, "ent_score": True}, -) diff --git a/tests/test_model_import.py b/tests/test_model_import.py index 6c10109..2366679 100644 --- a/tests/test_model_import.py +++ b/tests/test_model_import.py @@ -7,5 +7,9 @@ def test_gensim_default(): from concise_concepts.examples import example_gensim_default # noqa: F401 -def test_gensim_custom(): - from concise_concepts.examples import example_gensim_custom # noqa: F401 +def test_gensim_custom_path(): + from concise_concepts.examples import example_gensim_custom_path # noqa: F401 + + +def test_gensim_custom_model(): + from concise_concepts.examples import example_gensim_custom_model # noqa: F401 From c23ad19c106995ceb79946e2033e272237d42a6e Mon Sep 17 00:00:00 2001 From: david Date: Sun, 9 Oct 2022 12:14:16 +0200 Subject: [PATCH 2/2] partially resolved #19 - updated documentation - fallback to keyname if examples are not present in model --- concise_concepts/conceptualizer/Conceptualizer.py | 13 ++++++++++--- .../examples/example_gensim_custom_model.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/concise_concepts/conceptualizer/Conceptualizer.py b/concise_concepts/conceptualizer/Conceptualizer.py index 28949b2..92d38f0 100644 --- a/concise_concepts/conceptualizer/Conceptualizer.py +++ b/concise_concepts/conceptualizer/Conceptualizer.py @@ -214,9 +214,16 @@ def verify_data(self, verbose: bool = True): ) self.log_cache["word"].append(word) verified_data[key] = verified_values - assert len( - verified_values - ), f"None of the 
entries for key {key} are present in the vector model" + if not len(verified_values): + msg = ( + f"None of the entries for key {key} are present in the vector" + " model. " + ) + if self.check_presence_vocab(key): + logger.warning(msg + f"Using {key} as word to expand over instead.") + verified_data[key] = [self.check_presence_vocab(key)] + else: + raise Exception(msg) self.data = deepcopy(verified_data) self.original_data = deepcopy(self.data) diff --git a/concise_concepts/examples/example_gensim_custom_model.py b/concise_concepts/examples/example_gensim_custom_model.py index 34a9474..8d00c8d 100644 --- a/concise_concepts/examples/example_gensim_custom_model.py +++ b/concise_concepts/examples/example_gensim_custom_model.py @@ -3,7 +3,7 @@ from gensim.models import Word2Vec from gensim.test.utils import common_texts -import concise_concepts +import concise_concepts # noqa: F401 data = {"human": ["trees"], "interface": ["computer"]}