Refine exclusions after checking against common vocab

fastdatascience · Oct 20, 2024 · 7df3490 · 7df3490
1 parent ffce7f0
commit 7df3490
Show file tree

Hide file tree

Showing 6 changed files with 1,585 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -56,9 +56,11 @@ You pass a list of strings to the `find_diseases` function.
 Example 1
 
 ```
+import re
+re_tokenise = re.compile(r"((?:\w|'|’)+)")
 from medical_named_entity_recognition import find_diseases
 tokens = re_tokenise.findall("cystic fibrosis")
-find_diseases(tokens, is_ignore_case=True)
+find_diseases(tokens)
 ```
 
 outputs a list of tuples.

diff --git a/harvesting_data_from_source/05_combine_data_sources.py b/harvesting_data_from_source/05_combine_data_sources.py
@@ -121,12 +121,29 @@ def add_synonym(synonym: str, canonical: str, synonym_data: dict = None):
 words_to_check_with_ai = set()
 for word in list(disease_variant_to_canonical):
     reason = None
+
+    canonical = disease_variant_to_canonical[word][0]
+
+    tree_ids = disease_canonical_to_data[canonical]["mesh_tree"]
+    is_psychological = False
+    for tree_id in tree_ids:
+        if tree_id.startswith("F"):
+            is_psychological= True
+
+    is_two_english_words = False
+    if len(word.split()) == 2:
+        words = word.split()
+        if words[0] in all_english_vocab and words[1] in all_english_vocab:
+            is_two_english_words =True
+
     if word in stops:
         reason = "it is an English word in stopword list"
     elif word in extra_terms_to_exclude_from_disease_dictionary:
         reason = "it is in the manual ignore list"
     elif len(word) < 3:
         reason = "it is short"
+    elif is_two_english_words and is_psychological:
+        words_to_check_with_ai.add(word)
     # elif len(re_num.findall(word)) > 0:
     #     reason = "it is numeric"
     # elif len(word) > 50:

diff --git a/harvesting_data_from_source/06_optional_check_words_with_ai.py b/harvesting_data_from_source/06_optional_check_words_with_ai.py
@@ -0,0 +1,85 @@
+'''
+MIT License
+
+Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com)
+
+Maintainer: Thomas Wood
+
+Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+'''
+
+# This script cleans up the disease dictionary: it asks an LLM whether each term listed in words_to_check_with_ai.txt is really a health condition.
+
+import os
+import re
+import sys
+import time
+import traceback
+from tqdm import tqdm
+import requests
+
+MODEL = 'gpt-4o-mini'
+
+with open("words_to_check_with_ai.txt", "r", encoding="utf-8") as f:
+    words_to_check_with_ai = f.read().split("\n")
+
+
+headers = {
+    'Content-Type': 'application/json',
+    'Authorization': 'Bearer ' + os.environ["OPENAI_API_KEY"],
+}
+
+bot_responses = [""] * len(words_to_check_with_ai)
+
+for idx in tqdm(range(len(words_to_check_with_ai))):
+    q = words_to_check_with_ai[idx]
+    print(f"Asking question: {idx+1}: {q}")
+
+    starttime = time.time()
+
+    json_data = {
+        'model': MODEL,
+        'messages': [
+            {"role": "user", "content": f"What is {q}? Is it a health condition? Answer in one word.\n"},
+        ],
+        'max_completion_tokens':5
+    }
+    for attempt in range(3):
+        print("attempt calling GPT API:", attempt)
+        try:
+            response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=json_data)
+            r = response.json()["choices"][0]["message"]["content"]
+            break
+        except:
+            print("Try again")
+            traceback.print_exc()
+            time.sleep(10)
+    bot_responses[idx] = re.sub(r'\s+', ' ', r)
+
+    with open("ai_responses.txt", "w", encoding="utf-8") as f:
+        for j in range(idx):
+            f.write(words_to_check_with_ai[j] + "\t" + bot_responses[j] + "\n")
+
+    endtime = time.time()
+
+    print("\tReceived response: ", r)
+