Skip to content

Commit

Permalink
Refine exclusions. TODO: check against common English words
Browse files: browse the repository at this point in the history
  • Loading branch information
woodthom2 committed Oct 18, 2024
1 parent 33074cf commit ffce7f0
Show file tree
Hide file tree
Showing 5 changed files with 5,868 additions and 4,753 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -96,7 +96,7 @@ def endElement(self, tagName):
# if True or self.title.upper() in drugs_finder.drug_variant_to_canonical:
is_include = False
for t in self.tree_numbers:
if t.startswith("C"):
if t.startswith("C") or t.startswith("F"):
is_include = True
# else:
# is_include = False
Expand Down
14 changes: 10 additions & 4 deletions harvesting_data_from_source/05_combine_data_sources.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -189,10 +189,16 @@ def add_synonym(synonym: str, canonical: str, synonym_data: dict = None):
inverted_index_lookup_canonical_to_variants[canonical].add(variant)

for term_to_delete in diseases_to_exclude_under_all_variants:
variants = inverted_index_lookup_canonical_to_variants[term_to_delete]
for variant in variants:
del disease_variant_to_canonical[variant]
del disease_canonical_to_data[term_to_delete]
if term_to_delete not in disease_variant_to_canonical:
print(f"Warning! tried to delete {term_to_delete} but couldn't find it")
continue
canonicals_to_delete = disease_variant_to_canonical[term_to_delete]
for canonical_to_delete in canonicals_to_delete:

variants = inverted_index_lookup_canonical_to_variants[canonical_to_delete]
for variant in variants:
del disease_variant_to_canonical[variant]
del disease_canonical_to_data[canonical_to_delete]

with bz2.open("../src/medical_named_entity_recognition/disease_ner_dictionary.pkl.bz2", "wb") as f:
pkl.dump(
Expand Down
Loading

0 comments on commit ffce7f0

Please sign in to comment.