Skip to content

Commit

Permalink
Refine exclusions after checking against common vocab
Browse files Browse the repository at this point in the history
  • Loading branch information
woodthom2 committed Oct 20, 2024
1 parent ffce7f0 commit 7df3490
Show file tree
Hide file tree
Showing 6 changed files with 1,585 additions and 35 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ You pass a list of strings to the `find_diseases` function.
Example 1

```
import re
re_tokenise = re.compile(r"((?:\w|'|’)+)")
from medical_named_entity_recognition import find_diseases
tokens = re_tokenise.findall("cystic fibrosis")
find_diseases(tokens, is_ignore_case=True)
find_diseases(tokens)
```

outputs a list of tuples.
Expand Down
17 changes: 17 additions & 0 deletions harvesting_data_from_source/05_combine_data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,29 @@ def add_synonym(synonym: str, canonical: str, synonym_data: dict = None):
words_to_check_with_ai = set()
for word in list(disease_variant_to_canonical):
reason = None

canonical = disease_variant_to_canonical[word][0]

tree_ids = disease_canonical_to_data[canonical]["mesh_tree"]
is_psychological = False
for tree_id in tree_ids:
if tree_id.startswith("F"):
is_psychological= True

is_two_english_words = False
if len(word.split()) == 2:
words = word.split()
if words[0] in all_english_vocab and words[1] in all_english_vocab:
is_two_english_words =True

if word in stops:
reason = "it is an English word in stopword list"
elif word in extra_terms_to_exclude_from_disease_dictionary:
reason = "it is in the manual ignore list"
elif len(word) < 3:
reason = "it is short"
elif is_two_english_words and is_psychological:
words_to_check_with_ai.add(word)
# elif len(re_num.findall(word)) > 0:
# reason = "it is numeric"
# elif len(word) > 50:
Expand Down
85 changes: 85 additions & 0 deletions harvesting_data_from_source/06_optional_check_words_with_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
'''
MIT License
Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com)
Maintainer: Thomas Wood
Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

# This is a script where we clean up the disease dictionary by asking an AI model whether each ambiguous term listed in words_to_check_with_ai.txt is really a health condition.

import os
import re
import sys
import time
import traceback
from tqdm import tqdm
import requests

MODEL = 'gpt-4o-mini'
API_URL = 'https://api.openai.com/v1/chat/completions'
INPUT_PATH = "words_to_check_with_ai.txt"
OUTPUT_PATH = "ai_responses.txt"
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 10
# Without a timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 60


def parse_word_list(text):
    """Return the non-blank lines of *text*, stripped of surrounding whitespace.

    Blank lines (including the one produced by a trailing newline) are dropped
    so that no empty question is ever sent to the API.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


def build_payload(word):
    """Return the chat-completions request body asking whether *word* is a health condition."""
    return {
        'model': MODEL,
        'messages': [
            {"role": "user", "content": f"What is {word}? Is it a health condition? Answer in one word.\n"},
        ],
        'max_completion_tokens': 5,
    }


def extract_answer(payload):
    """Pull the reply text out of a decoded API response and collapse its whitespace.

    Collapsing whitespace keeps tabs/newlines out of the tab-separated output.

    Raises:
        KeyError, IndexError, TypeError: if *payload* lacks the expected
            ``choices[0].message.content`` string.
    """
    content = payload["choices"][0]["message"]["content"]
    return re.sub(r'\s+', ' ', content).strip()


def format_result_line(word, answer):
    """Return one tab-separated output line for *word* and its *answer*."""
    return word + "\t" + answer + "\n"


def ask_model(word, headers):
    """Ask the model about *word*, retrying up to MAX_ATTEMPTS times.

    Returns:
        The cleaned answer string, or None if every attempt failed.
    """
    for attempt in range(MAX_ATTEMPTS):
        print("attempt calling GPT API:", attempt)
        try:
            response = requests.post(
                API_URL,
                headers=headers,
                json=build_payload(word),
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # Surface HTTP errors (rate limit, bad key) as a clear exception
            # rather than a KeyError on the missing "choices" field.
            response.raise_for_status()
            return extract_answer(response.json())
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Network failure or malformed/unexpected response body: retry.
            # KeyboardInterrupt is deliberately NOT caught so Ctrl-C still works.
            print("Try again")
            traceback.print_exc()
            if attempt + 1 < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
    return None


def main():
    """Query the model for every word in INPUT_PATH and write the answers to OUTPUT_PATH."""
    with open(INPUT_PATH, "r", encoding="utf-8") as f:
        words_to_check_with_ai = parse_word_list(f.read())

    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + os.environ["OPENAI_API_KEY"],
    }

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for idx, word in enumerate(tqdm(words_to_check_with_ai)):
            print(f"Asking question: {idx+1}: {word}")

            answer = ask_model(word, headers)
            if answer is None:
                # Record an empty answer rather than reusing the previous word's reply.
                print("\tNo response received; recording an empty answer")
                answer = ""
            else:
                print("\tReceived response: ", answer)

            # Write and flush after every word so partial results survive an
            # interrupted run, and so the final word is always included.
            out.write(format_result_line(word, answer))
            out.flush()


if __name__ == "__main__":
    main()

Loading

0 comments on commit 7df3490

Please sign in to comment.