Skip to content

Commit

Permalink
Merge pull request #21 from Pandora-Intelligence/#17-duplicate-loggign
Browse files Browse the repository at this point in the history
#17 duplicate logging - #19 handling of error within missing tokens in model
  • Loading branch information
davidberenstein1957 authored Oct 9, 2022
2 parents 1659eac + c23ad19 commit 3da123f
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 65 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,5 @@ dmypy.json
*.model.*
/word2vec.wordvectors
/word2vec.wordvectors.vectors.npy
matching_patterns.json
matching_patterns.json
test.py
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ text = """

nlp = spacy.load("en_core_web_lg", disable=["ner"])
# ent_score for entity confidence scoring
nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True})
nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "verbose": True})
doc = nlp(text)

options = {"colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"},
Expand All @@ -70,14 +70,14 @@ displacy.render(doc, style="ent", options=options)
## Matching Pattern Rules

### Customizing Matching Pattern Rules
Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized
via the config passed to the spaCy pipeline.
Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline.

- `exclude_pos`: A list of POS tags to be excluded from the rule-based match.
- `exclude_dep`: A list of dependencies to be excluded from the rule-based match.
- `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity.
- `case_sensitive`: Whether to match the case of the words in the text.


### Analyze Matching Pattern Rules
To motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./main_patterns.json`. This behaviour can be changed by using the `json_path` variable via the config passed to the spaCy pipeline.

Expand Down
3 changes: 3 additions & 0 deletions concise_concepts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"include_compound_words": False,
"case_sensitive": False,
"json_path": "./matching_patterns.json",
"verbose": True,
},
)
def make_concise_concepts(
Expand All @@ -45,6 +46,7 @@ def make_concise_concepts(
include_compound_words: bool,
case_sensitive: bool,
json_path: str,
verbose: bool,
):
return Conceptualizer(
nlp=nlp,
Expand All @@ -59,4 +61,5 @@ def make_concise_concepts(
include_compound_words=include_compound_words,
case_sensitive=case_sensitive,
json_path=json_path,
verbose=verbose,
)
57 changes: 38 additions & 19 deletions concise_concepts/conceptualizer/Conceptualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
include_compound_words: bool = False,
case_sensitive: bool = False,
json_path: str = "./matching_patterns.json",
verbose: bool = True,
):
"""
The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and
Expand All @@ -65,6 +66,8 @@ def __init__(
if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional)
:param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional)
"""
self.verbose = verbose
self.log_cache = {"key": list(), "word": list(), "word_key": list()}
if Span.has_extension("ent_score"):
Span.remove_extension("ent_score")
if ent_score:
Expand Down Expand Up @@ -102,7 +105,7 @@ def run(self):
self.check_validity_path()
self.determine_topn()
self.set_gensim_model()
self.verify_data()
self.verify_data(self.verbose)
self.expand_concepts()
self.verify_data(verbose=False)
# settle words around overlapping concepts
Expand Down Expand Up @@ -196,20 +199,31 @@ def verify_data(self, verbose: bool = True):
verified_values = []
if not self.check_presence_vocab(key):
if verbose:
logger.warning(f"key ´{key}´ not present in vector model")
if key not in self.log_cache["key"]:
logger.warning(f"key ´{key}´ not present in vector model")
self.log_cache["key"].append(key)
for word in value:
if self.check_presence_vocab(word):
verified_values.append(self.check_presence_vocab(word))
else:
if verbose:
logger.warning(
f"word ´{word}´ from key ´{key}´ not present in vector"
" model"
)
if word not in self.log_cache["word"]:
logger.warning(
f"word ´{word}´ from key ´{key}´ not present in vector"
" model"
)
self.log_cache["word"].append(word)
verified_data[key] = verified_values
assert len(
verified_values
), f"None of the entries for key {key} are present in the vector model"
if not len(verified_values):
msg = (
f"None of the entries for key {key} are present in the vector"
" model. "
)
if self.check_presence_vocab(key):
logger.warning(msg + f"Using {key} as word to expand over instead.")
verified_data[key] = self.check_presence_vocab(key)
else:
raise Exception(msg)
self.data = deepcopy(verified_data)
self.original_data = deepcopy(self.data)

Expand Down Expand Up @@ -453,18 +467,23 @@ def assign_score_to_entities(self, doc: Doc):
ent._.ent_score = self.kv.n_similarity(entity, concept)
else:
ent._.ent_score = 0
logger.warning(
f"Entity ´{ent.text}´ and/or label ´{concept}´ not found in"
" vector model. Nothing to compare to, so setting"
" ent._.ent_score to 0."
)
if self.verbose:
if f"{ent.text}_{concept}" not in self.log_cache["key_word"]:
logger.warning(
f"Entity ´{ent.text}´ and/or label ´{concept}´ not"
" found in vector model. Nothing to compare to, so"
" setting ent._.ent_score to 0."
)
self.log_cache["key_word"].append(f"{ent.text}_{concept}")
else:
ent._.ent_score = 0

logger.warning(
f"Entity ´{ent.text}´ not found in vector model. Nothing to compare"
" to, so setting ent._.ent_score to 0."
)
if self.verbose:
if ent.text not in self.log_cache["word"]:
logger.warning(
f"Entity ´{ent.text}´ not found in vector model. Nothing to"
" compare to, so setting ent._.ent_score to 0."
)
self.log_cache["word"].append(ent.text)
doc.ents = ents
return doc

Expand Down
25 changes: 25 additions & 0 deletions concise_concepts/examples/example_gensim_custom_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""Example: wiring concise-concepts up to a custom, locally trained gensim model."""
import spacy
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import concise_concepts  # noqa: F401

# Few-shot seeds: each key is a concept label, each value a list of example words.
data = {"human": ["trees"], "interface": ["computer"]}

text = (
    "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is"
    " an extremely slow mobile phone and takes ages to open and navigate. Forget about"
    " heavy use, it can't handle normal regular use. I made a huge mistake but pls"
    " don't buy this mobile. It's only a few months and I am thinking to change it. Its"
    " dam SLOW SLOW SLOW."
)

# Train a tiny Word2Vec model on gensim's bundled toy corpus and persist it,
# so the pipeline below can load it back from disk by path.
model_path = "word2vec.model"
model = Word2Vec(
    sentences=common_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save(model_path)

# A blank English pipeline is enough; concise_concepts reads vectors from model_path.
nlp = spacy.blank("en")
nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "concise-concepts"
version = "0.6.2.1"
version = "0.6.3"
description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!"
authors = ["David Berenstein <[email protected]>"]
license = "MIT"
Expand Down
39 changes: 0 additions & 39 deletions test.py

This file was deleted.

8 changes: 6 additions & 2 deletions tests/test_model_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,9 @@ def test_gensim_default():
from concise_concepts.examples import example_gensim_default # noqa: F401


def test_gensim_custom():
from concise_concepts.examples import example_gensim_custom # noqa: F401
def test_gensim_custom_path():
from concise_concepts.examples import example_gensim_custom_path # noqa: F401


def test_gensim_custom_model():
from concise_concepts.examples import example_gensim_custom_model # noqa: F401

0 comments on commit 3da123f

Please sign in to comment.