From 498677e39b24965b49149fc3f6ef7142b4037b8a Mon Sep 17 00:00:00 2001 From: david Date: Sun, 9 Oct 2022 10:54:54 +0200 Subject: [PATCH 1/2] resolved #17 - removed duplicate logging of same missing keys - added verbose options to completely disable logging --- .gitignore | 3 +- README.md | 6 +-- concise_concepts/__init__.py | 3 ++ .../conceptualizer/Conceptualizer.py | 44 ++++++++++++------- .../examples/example_gensim_custom_model.py | 25 +++++++++++ ...ustom.py => example_gensim_custom_path.py} | 0 pyproject.toml | 2 +- test.py | 39 ---------------- tests/test_model_import.py | 8 +++- 9 files changed, 68 insertions(+), 62 deletions(-) create mode 100644 concise_concepts/examples/example_gensim_custom_model.py rename concise_concepts/examples/{example_gensim_custom.py => example_gensim_custom_path.py} (100%) delete mode 100644 test.py diff --git a/.gitignore b/.gitignore index f60b7e8..2c57ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,5 @@ dmypy.json *.model.* /word2vec.wordvectors /word2vec.wordvectors.vectors.npy -matching_patterns.json \ No newline at end of file +matching_patterns.json +test.py \ No newline at end of file diff --git a/README.md b/README.md index b03804b..507749f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ text = """ nlp = spacy.load("en_core_web_lg", disable=["ner"]) # ent_score for entity condifence scoring -nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True}) +nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "verbose": True}) doc = nlp(text) options = {"colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"}, displacy.render(doc, style="ent", options=options) ## Matching Pattern Rules ### Customizing Matching Pattern Rules -Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized -via the config passed to the spaCy pipeline. 
+Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline. - `exclude_pos`: A list of POS tags to be excluded from the rule-based match. - `exclude_dep`: A list of dependencies to be excluded from the rule-based match. - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity. - `case_sensitive`: Whether to match the case of the words in the text. + ### Analyze Matching Pattern Rules To motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./main_patterns.json`. This behaviour can be changed by using the `json_path` variable via the config passed to the spaCy pipeline. diff --git a/concise_concepts/__init__.py b/concise_concepts/__init__.py index d903b98..e1a7591 100644 --- a/concise_concepts/__init__.py +++ b/concise_concepts/__init__.py @@ -30,6 +30,7 @@ "include_compound_words": False, "case_sensitive": False, "json_path": "./matching_patterns.json", + "verbose": True, }, ) def make_concise_concepts( @@ -45,6 +46,7 @@ def make_concise_concepts( include_compound_words: bool, case_sensitive: bool, json_path: str, + verbose: bool, ): return Conceptualizer( nlp=nlp, @@ -59,4 +61,5 @@ def make_concise_concepts( include_compound_words=include_compound_words, case_sensitive=case_sensitive, json_path=json_path, + verbose=verbose, ) diff --git a/concise_concepts/conceptualizer/Conceptualizer.py b/concise_concepts/conceptualizer/Conceptualizer.py index 2863690..28949b2 100644 --- a/concise_concepts/conceptualizer/Conceptualizer.py +++ b/concise_concepts/conceptualizer/Conceptualizer.py @@ -41,6 +41,7 @@ def __init__( include_compound_words: bool = False, case_sensitive: bool = False, json_path: str = "./matching_patterns.json", + verbose: bool = True, ): """ The function takes 
in a dictionary of words and their synonyms, and then creates a new dictionary of words and @@ -65,6 +66,8 @@ def __init__( if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional) :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional) """ + self.verbose = verbose + self.log_cache = {"key": list(), "word": list(), "word_key": list()} if Span.has_extension("ent_score"): Span.remove_extension("ent_score") if ent_score: @@ -102,7 +105,7 @@ def run(self): self.check_validity_path() self.determine_topn() self.set_gensim_model() - self.verify_data() + self.verify_data(self.verbose) self.expand_concepts() self.verify_data(verbose=False) # settle words around overlapping concepts @@ -196,16 +199,20 @@ def verify_data(self, verbose: bool = True): verified_values = [] if not self.check_presence_vocab(key): if verbose: - logger.warning(f"key ´{key}´ not present in vector model") + if key not in self.log_cache["key"]: + logger.warning(f"key ´{key}´ not present in vector model") + self.log_cache["key"].append(key) for word in value: if self.check_presence_vocab(word): verified_values.append(self.check_presence_vocab(word)) else: if verbose: - logger.warning( - f"word ´{word}´ from key ´{key}´ not present in vector" - " model" - ) + if word not in self.log_cache["word"]: + logger.warning( + f"word ´{word}´ from key ´{key}´ not present in vector" + " model" + ) + self.log_cache["word"].append(word) verified_data[key] = verified_values assert len( verified_values @@ -453,18 +460,23 @@ def assign_score_to_entities(self, doc: Doc): ent._.ent_score = self.kv.n_similarity(entity, concept) else: ent._.ent_score = 0 - logger.warning( - f"Entity ´{ent.text}´ and/or label ´{concept}´ not found in" - " vector model. Nothing to compare to, so setting" - " ent._.ent_score to 0." 
- ) + if self.verbose: + if f"{ent.text}_{concept}" not in self.log_cache["word_key"]: + logger.warning( + f"Entity ´{ent.text}´ and/or label ´{concept}´ not" + " found in vector model. Nothing to compare to, so" + " setting ent._.ent_score to 0." + ) + self.log_cache["word_key"].append(f"{ent.text}_{concept}") else: ent._.ent_score = 0 - - logger.warning( - f"Entity ´{ent.text}´ not found in vector model. Nothing to compare" - " to, so setting ent._.ent_score to 0." - ) + if self.verbose: + if ent.text not in self.log_cache["word"]: + logger.warning( + f"Entity ´{ent.text}´ not found in vector model. Nothing to" + " compare to, so setting ent._.ent_score to 0." + ) + self.log_cache["word"].append(ent.text) doc.ents = ents return doc diff --git a/concise_concepts/examples/example_gensim_custom_model.py b/concise_concepts/examples/example_gensim_custom_model.py new file mode 100644 index 0000000..34a9474 --- /dev/null +++ b/concise_concepts/examples/example_gensim_custom_model.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +import spacy +from gensim.models import Word2Vec +from gensim.test.utils import common_texts + +import concise_concepts + +data = {"human": ["trees"], "interface": ["computer"]} + +text = ( + "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is" + " an extremely slow mobile phone and takes ages to open and navigate. Forget about" + " heavy use, it can't handle normal regular use. I made a huge mistake but pls" + " don't buy this mobile. It's only a few months and I am thinking to change it. Its" + " dam SLOW SLOW SLOW." 
+) + +model = Word2Vec( + sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4 +) +model.save("word2vec.model") +model_path = "word2vec.model" + +nlp = spacy.blank("en") +nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) diff --git a/concise_concepts/examples/example_gensim_custom.py b/concise_concepts/examples/example_gensim_custom_path.py similarity index 100% rename from concise_concepts/examples/example_gensim_custom.py rename to concise_concepts/examples/example_gensim_custom_path.py diff --git a/pyproject.toml b/pyproject.toml index a8b95f9..ad5640e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "concise-concepts" -version = "0.6.2.1" +version = "0.6.3" description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!" authors = ["David Berenstein "] license = "MIT" diff --git a/test.py b/test.py deleted file mode 100644 index 28b44e4..0000000 --- a/test.py +++ /dev/null @@ -1,39 +0,0 @@ -import spacy - -import concise_concepts - -data = { - "performance": ["mileage", "speed", "fuel", "capacity", "transmisson"], - "engine": ["turbo", "cylinder", "gear", "engine"], - "brakes": ["disc", "suspension", "brakes"], - "dimensions": ["length", "width", "height", "seating", "doors"], - "comfort": [ - "steering", - "heater", - "air", - "accessory", - "headrest", - "charger", - "luxury", - ], - "entertainment": ["radio", "speaker", "phone", "touch"], - "safety": ["alarm", "theft", "warning", "safety"], -} - -text = """ -XUV 700 is first in class luxury SUV brought to you by Mahindra. -The 13-inch instrument cluster and attached infotainment panel -are just the best in class and add a classy look to the SUV. -The XUV 700 is a beast of a car with that panoramic sunroof, -and leather upholstery on the seats and doesn't compromise on the luxury of the passengers. 
-The adaptive cruise control is a top-end feature clearly not offered in any -other cars of this segment at this price point. Have always been a fan of the XUV 500 -but the successor is a beast with the luxury of its own kind the road experience is pretty smooth -and the engine performs really well. -""" -model_path = "glove-wiki-gigaword-300" -nlp = spacy.load("en_core_web_lg", disable=["ner"]) -nlp.add_pipe( - "concise_concepts", - config={"data": data, "model_path": model_path, "ent_score": True}, -) diff --git a/tests/test_model_import.py b/tests/test_model_import.py index 6c10109..2366679 100644 --- a/tests/test_model_import.py +++ b/tests/test_model_import.py @@ -7,5 +7,9 @@ def test_gensim_default(): from concise_concepts.examples import example_gensim_default # noqa: F401 -def test_gensim_custom(): - from concise_concepts.examples import example_gensim_custom # noqa: F401 +def test_gensim_custom_path(): + from concise_concepts.examples import example_gensim_custom_path # noqa: F401 + + +def test_gensim_custom_model(): + from concise_concepts.examples import example_gensim_custom_model # noqa: F401 From c23ad19c106995ceb79946e2033e272237d42a6e Mon Sep 17 00:00:00 2001 From: david Date: Sun, 9 Oct 2022 12:14:16 +0200 Subject: [PATCH 2/2] partially resolved #19 - updated documentation - fallback to keyname if examples are not present in model --- concise_concepts/conceptualizer/Conceptualizer.py | 13 ++++++++++--- .../examples/example_gensim_custom_model.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/concise_concepts/conceptualizer/Conceptualizer.py b/concise_concepts/conceptualizer/Conceptualizer.py index 28949b2..92d38f0 100644 --- a/concise_concepts/conceptualizer/Conceptualizer.py +++ b/concise_concepts/conceptualizer/Conceptualizer.py @@ -214,9 +214,16 @@ def verify_data(self, verbose: bool = True): ) self.log_cache["word"].append(word) verified_data[key] = verified_values - assert len( - verified_values - ), f"None of the 
entries for key {key} are present in the vector model" + if not len(verified_values): + msg = ( + f"None of the entries for key {key} are present in the vector" + " model. " + ) + if self.check_presence_vocab(key): + logger.warning(msg + f"Using {key} as word to expand over instead.") + verified_data[key] = [self.check_presence_vocab(key)] + else: + raise Exception(msg) self.data = deepcopy(verified_data) self.original_data = deepcopy(self.data) diff --git a/concise_concepts/examples/example_gensim_custom_model.py b/concise_concepts/examples/example_gensim_custom_model.py index 34a9474..8d00c8d 100644 --- a/concise_concepts/examples/example_gensim_custom_model.py +++ b/concise_concepts/examples/example_gensim_custom_model.py @@ -3,7 +3,7 @@ from gensim.models import Word2Vec from gensim.test.utils import common_texts -import concise_concepts +import concise_concepts # noqa: F401 data = {"human": ["trees"], "interface": ["computer"]}