Skip to content

Commit

Permalink
Fix errors resulting from multiple redirects e.g. "restasis" -> "cicl…
Browse files Browse the repository at this point in the history
…osporin" -> "ciclosporine"
  • Loading branch information
woodthom2 committed Oct 4, 2024
1 parent 0b76ff4 commit f117455
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors:
repository-code: 'https://github.com/fastdatascience/drug_named_entity_recognition'
url: 'https://fastdatascience.com/drug-named-entity-recognition-python-library/'
license: MIT
version: 2.0.0
date-released: '2024-04-14'
version: 2.0.1
date-released: '2024-10-04'
url: 'https://zenodo.org/doi/10.5281/zenodo.10970631'
doi: 10.5281/zenodo.10970631
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,12 @@ MIT License. Copyright (c) 2023 [Fast Data Science](https://fastdatascience.com)

## ✍️ Citing the Drug Named Entity Recognition library

Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.0, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)
Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.1, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)

```
@unpublished{drugnamedentityrecognition,
AUTHOR = {Wood, T.A.},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.0},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.1},
YEAR = {2024},
Note = {To appear},
url = {https://zenodo.org/doi/10.5281/zenodo.10970631},
Expand Down
18 changes: 17 additions & 1 deletion harvesting_data_from_source/05_combine_data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

from nltk.corpus import words

from inclusions import common_english_words_to_include_in_drugs_dictionary, extra_terms_to_exclude_from_drugs_dictionary, extra_mappings
from inclusions import common_english_words_to_include_in_drugs_dictionary, \
extra_terms_to_exclude_from_drugs_dictionary, extra_mappings

# Matches a string made up solely of digits (one or more).
re_num = re.compile(r'^\d+$')
# Matches any run of three consecutive digits.
re_three_digits = re.compile(r'\d\d\d')
Expand Down Expand Up @@ -242,6 +243,21 @@ def get_brand_names_nhs(description: str):
# Dump the collected entries of words_to_check_with_ai to a text file,
# one entry per line.
with open("words_to_check_with_ai.txt", mode="w", encoding="utf-8") as out_file:
    out_file.write("\n".join(words_to_check_with_ai))

# Collapse chains of redirects so that a variant points straight at the final
# canonical name, e.g. "restasis" -> "ciclosporin" -> "ciclosporine" becomes
# "restasis" -> "ciclosporine".
#
# A single pass only follows one hop, so the scan is repeated. The number of
# passes stays capped at 3 so that a cycle (a -> b -> a) cannot loop forever,
# and the loop stops early once a pass finds nothing left to rewrite.
for i in range(3):
    print(f"Normalising redirects step {i}")
    redirects_needed = {}
    for variant, canonicals in drug_variant_to_canonical.items():
        for canonical in canonicals:
            targets = drug_variant_to_canonical.get(canonical)
            if targets is None:
                continue
            # A canonical that maps to anything other than itself is really
            # just another variant, so `variant` must adopt its targets.
            # NOTE(review): when several canonicals qualify, the last one wins
            # and replaces the whole entry - confirm that is intended.
            if any(target != canonical for target in targets):
                redirects_needed[variant] = targets
    print(f"There are {len(redirects_needed)} drug names which are redirected twice. These need to be normalised")
    if not redirects_needed:
        # Fixed point reached: further passes would not change anything.
        break
    # Applied only after the scan, so the dictionary is never modified while
    # it is being iterated.
    for source, targets in redirects_needed.items():
        drug_variant_to_canonical[source] = targets

with bz2.open("../src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2", "wb") as f:
pkl.dump(
{"drug_variant_to_canonical": drug_variant_to_canonical,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "drug-named-entity-recognition"
version = "2.0.0"
version = "2.0.1"
description = "Drug Named Entity Recognition library to find and resolve drug names in a string (drug named entity linking)"
readme = "README.md"
keywords = ['drug', 'bio', 'biomedical', 'medical', 'pharma', 'pharmaceutical', 'ner', 'nlp', 'named entity recognition', 'natural language processing', 'named entity linking']
Expand Down
2 changes: 1 addition & 1 deletion src/drug_named_entity_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
'''

__version__ = "2.0.0"
__version__ = "2.0.1"

from drug_named_entity_recognition.drugs_finder import find_drugs, add_custom_drug_synonym, add_custom_new_drug, \
reset_drugs_data, remove_drug_synonym
Binary file modified src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2
Binary file not shown.
5 changes: 5 additions & 0 deletions tests/test_drugs_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,8 @@ def test_mounjaro_misspelt(self):
print(json.dumps(drugs, indent=4))

self.assertEqual(1, len(drugs))

def test_restasis(self):
    """The sentence "i bought some restasis" must yield exactly one drug match."""
    tokens = "i bought some restasis".split(" ")
    matches = find_drugs(tokens, is_include_structure=True)

    self.assertEqual(1, len(matches))

0 comments on commit f117455

Please sign in to comment.