initial commit

fastdatascience · Oct 17, 2024 · 0f29945 · 0f29945
1 parent 6e55179
commit 0f29945
Show file tree

Hide file tree

Showing 19 changed files with 10,744 additions and 0 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,29 @@
+# Builds the source/wheel distributions and uploads them to PyPI.
+name: Release Pypi package
+
+# Runs when a GitHub release is created, or when started manually
+# (workflow_dispatch).
+on:
+  release:
+    types: [created]
+  workflow_dispatch:
+
+jobs:
+  deploy:
+    name: "Build Distribution"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.x"
+      # The folded scalar (>-) joins the lines below into a single command:
+      # `python3 -m pip install build twine`.
+      - name: Install pypa/build
+        run: >-
+          python3 -m
+          pip install
+          build twine
+      # Upload credentials come from repository secrets and are exposed to
+      # this step as environment variables.
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+        run: |
+          python3 -m build
+          twine upload --repository pypi dist/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,32 @@
+# Runs the tox test suite on pushes and pull requests that target main.
+name: Test Pypi package
+
+# Changes that only touch README.md do not trigger a run.
+on:
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - README.md
+  pull_request:
+    branches:
+      - main
+    paths-ignore:
+      - README.md
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so the version is always treated as a string, never a number.
+        python: ["3.10.11"]
+
+    steps:
+      # Same major action versions as the release workflow.
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Install Tox and any other packages
+        run: pip install tox
+      - name: Run Tox
+        # Run tox using the version of Python in `PATH`
+        run: tox -e py
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,16 @@
+# This CITATION.cff file was generated with cffinit.
+# Visit https://bit.ly/cffinit to generate yours today!
+
+cff-version: 1.2.0
+title: Medical named entity recognition
+message: 'If you use this software, please cite it as below.'
+type: software
+authors:
+  - family-names: Wood
+    given-names: Thomas Andrew
+    orcid: 'https://orcid.org/0000-0001-8962-8571'
+repository-code: 'https://github.com/fastdatascience/medical_named_entity_recognition'
+url: 'https://fastdatascience.com/drug-named-entity-recognition-python-library/'
+license: MIT
+version: 0.1beta
+date-released: '2024-10-17'
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com). Maintainer: Thomas Wood. Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,13 @@
+include pyproject.toml
+include *.md
+include LICENSE
+recursive-include tests test*.py
+include *.cff
+include *.ipynb
+include *.png
+include cross_check_against_common_english_vocab.py
+include find_short_drug_names.py
+recursive-include harvesting_data_from_source *.py
+recursive-include harvesting_data_from_source *.csv
+recursive-include src *.bz2
+recursive-include harvesting_data_from_source *.json
diff --git a/...ing_data_from_source/02_mesh_download_mesh_dump_and_extract_disease_names_and_synonyms.py b/...ing_data_from_source/02_mesh_download_mesh_dump_and_extract_disease_names_and_synonyms.py
@@ -0,0 +1,146 @@
+
+'''
+MIT License
+
+Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com)
+
+Maintainer: Thomas Wood
+
+Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+'''
+
+import csv
+import datetime
+import os
+import subprocess
+import xml.sax
+from sys import platform
+
+# Example URL of MeSH dump: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2023.xml
+
+# The dump file name, and therefore the URL, is derived from the current
+# calendar year.
+mesh_xml_file_name = f"desc{datetime.datetime.now().year}.xml"
+url = f"https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/{mesh_xml_file_name}"
+
+# The download step below is wrapped in a bare string literal, so it is never
+# executed. The XML file named above is opened by relative name at the bottom
+# of this script, so it must already exist in the working directory.
+'''
+if os.path.exists(mesh_xml_file_name):
+    print(f"Removing old XML file {mesh_xml_file_name}.")
+    os.remove(mesh_xml_file_name)
+
+print(
+    f"Downloading MeSH XML dump from {url}. If this URL doesn't work, please navigate to https://www.nlm.nih.gov/ and search the site for a MeSH data dump in XML format.")
+
+print(f"Platform is {platform}.")
+if "win" in platform:  # if we are on Windows, use curl.exe (supported in Windows 10 and up)
+    wget = subprocess.Popen(["curl.exe", "--output", mesh_xml_file_name, "--url", url])
+else:
+    wget = subprocess.Popen(["wget", url])
+
+os.waitpid(wget.pid, 0)
+'''
+
+# NOTE(review): this message is printed unconditionally even though the
+# download code above is disabled - confirm the wording is still intended.
+print(f"Downloaded MeSH XML dump from {url}.")
+
+# Element names for which the SAX handler below keeps an "inside this element"
+# flag.
+IMPORTANT_TAGS = {'DescriptorName', 'String', 'DescriptorUI', 'DescriptorRecord', 'TreeNumber', 'Term'}
+
+
+# Define a custom content handler class that extends xml.sax.ContentHandler
+class CustomContentHandler(xml.sax.ContentHandler):
+    def __init__(self, writer):
+        self.writer = writer
+        self.writer.writerow(["Mesh ID", "Generic name", "Common name", "Synonyms", "Tree"])
+        self.postCount = 0
+        self.entryCount = 0
+        self.is_in = dict([n, False] for n in IMPORTANT_TAGS)
+        self.title = ""
+        self.id = ""
+        self.tree_numbers = set()
+        self.terms = set()
+        self.generic_names = set()
+        self.path = []
+        self.RecordPreferredTermYN = ""
+
+    # Handle startElement
+    def startElement(self, tagName, attrs):
+        self.path.append(tagName)
+        if tagName == 'Term':
+            if "RecordPreferredTermYN" in attrs.getNames():
+                self.RecordPreferredTermYN = attrs.getValue("RecordPreferredTermYN")
+        if tagName in IMPORTANT_TAGS:
+            self.is_in[tagName] = True
+
+    # Handle endElement
+    def endElement(self, tagName):
+        self.path = self.path[:-1]
+        if tagName == "Term":
+            self.RecordPreferredTermYN = ""
+        if tagName == "DescriptorRecord":
+            # if True or self.title.upper() in drugs_finder.drug_variant_to_canonical:
+            is_include = False
+            for t in self.tree_numbers:
+                if t.startswith("C"):
+                    is_include = True
+                else:
+                    is_include = False
+                    break
+                # if len(t.split('.')) < 4:
+                #     is_include = False
+                #     break
+            if is_include:
+                self.writer.writerow([self.id, "|".join(self.generic_names), self.title, "|".join(self.terms), "|".join(self.tree_numbers)])
+                print(self.id, self.title, self.tree_numbers, self.terms)
+            self.title = ""
+            self.id = ""
+            self.tree_numbers = set()
+            self.terms = set()
+            self.generic_names = set()
+
+        if tagName in IMPORTANT_TAGS:
+            self.is_in[tagName] = False
+
+    # Handle text data
+    def characters(self, chars):
+        if self.is_in["DescriptorName"] and self.is_in["String"]:
+            if "/".join(self.path) == "DescriptorRecordSet/DescriptorRecord/DescriptorName/String":
+                self.title += chars
+        if self.is_in["Term"] and self.is_in["String"]:
+            self.terms.add(chars)
+            if self.RecordPreferredTermYN == "Y":
+                self.generic_names.add(chars)
+        if self.is_in["DescriptorUI"]:
+            self.id = chars
+        if self.is_in["TreeNumber"]:
+            self.tree_numbers.add(chars)
+
+    # Handle startDocument
+    def startDocument(self):
+        print('About to start!')
+
+    # Handle endDocument
+    def endDocument(self):
+        print('Finishing up!')
+
+
+with open("diseases_dictionary_mesh.csv", "w", encoding="utf-8") as fo:
+    writer = csv.writer(fo)
+
+    handler = CustomContentHandler(writer)
+    xml.sax.parse(mesh_xml_file_name, handler)