# finddrugs.py

#!/usr/bin/python

#--------------------------------
# Written by Marzyeh Ghassemi, CSAIL, MIT 
# Sept 21, 2012
# Updated for Python 3, added Notebook, db connection
# by Tom J. Pollard 13 Nov, 2017
# Please contact the author with errors found. 
# mghassem {AT} mit {DOT} edu
#--------------------------------

from __future__ import with_statement
import nltk
import os
import os.path
import re
import string
import sys
import time

def addToDrugs(line, drugs, listing, genList):
    """
    ###### function addToDrugs
    #   line:    one line of text to scan
    #   drugs:   indexable flag array, modified in place; slot i corresponds
    #            to genList[i]
    #   listing: dict of generic name -> search pattern. The pattern is
    #            handed to re.search as-is, so "generic|brand1|brand2"
    #            acts as a regex alternation
    #   genList: sequence of every generic name being tracked; a name's
    #            position decides which slot of drugs is set
    #
    #   For every entry of listing whose pattern is found in line
    #   (case-insensitive), writes a 1 into the matching slot of drugs.
    #   Returns the same drugs object that was passed in.
    #   Raises KeyError if a matching generic is missing from genList.
    """
    # Reverse lookup: generic name -> position in genList
    # (for repeated names the last position wins).
    slotOf = {name: position for position, name in enumerate(genList)}

    for generic in listing:
        pattern = listing[generic]
        if re.search(pattern, line, re.I) is None:
            continue
        drugs[slotOf[generic]] = 1
    return drugs

def readDrugs(f, genList):
    """
    ###### function readDrugs
    #   f:       open, readable text file with one drug per line in the
    #            form "generic|brand1|brand2"
    #   genList: list that is extended in place with ONE new element: the
    #            list of generic names found in f, in file order
    #
    #   Converts lines of the form "generic|brand1|brand2" to a
    #   dictionary keyed by "generic" with value "generic|brand1|brand2".
    #   Everything is lower-cased.
    #
    #   Each generic is taken from the same line as its search pattern, so
    #   blank lines and lines without any '|' (a generic with no brand
    #   names) cannot shift generics onto the wrong pattern.
    """
    generics = []
    patterns = []
    for rawLine in f.read().split("\n"):
        # Drop empty alternatives (trailing/doubled/leading '|'): an empty
        # regex alternative matches every line and would flag the drug
        # for every note.
        terms = [term for term in rawLine.strip().lower().split("|")
                 if term.strip()]
        if not terms:
            # Blank line (typically the one after the final newline).
            continue
        generics.append(terms[0])
        patterns.append("|".join(terms))

    genList.append(generics)
    return dict(zip(generics, patterns))

def search(NOTES,
           SSRI_FILE = os.path.join(os.getcwd(), "SSRI_list.txt"),
           MISC_FILE = os.path.join(os.getcwd(), "MISC_list.txt"),
           SUMMARY_FILE = "output.csv",
           VERBOSE = False):
    """
    ###### Search the notes
    # NOTES: dataframe loaded from the noteevents table. Rows are read with
    #        itertuples() and must expose row_id, subject_id, hadm_id and
    #        text; the index is used for the progress message.
    # SSRI_FILE: list of SSRI drugs to search for (required).
    # MISC_FILE: list of additional drugs to search for (optional: it is
    #            skipped if the file cannot be opened).
    # SUMMARY_FILE: name of the output csv file. If it already exists a
    #               message is printed and nothing else is done.
    # VERBOSE: if True, print uncertain lines and the group of every note.
    #
    # NB: files should have a line for each distinct drug type,
    #      and drugs should be separated by a vertical bar '|'
    #
    # NB: the default SSRI_FILE / MISC_FILE paths are built from the
    #      working directory at the time this function is defined.
    #
    # Writes one csv row per note: ids, section flags, group, one 0/1
    # column per drug list and one 0/1 column per generic. The drug
    # columns reflect the medications-on-admission section.
    # Returns None.
    """

    if os.path.isfile(SUMMARY_FILE):
        print('The output file already exists.\n\nRemove the following file or save with a different filename:')
        print(os.path.join(os.getcwd(), SUMMARY_FILE))
        return

    starttime = time.time()

    # Keep a list of all generics we are looking for (one sub-list per file)
    genList = []
    # Column name for each sub-list of genList, in the same order
    typeNames = ["SSRI"]

    # Get the drugs into a structure we can use
    with open(SSRI_FILE) as f:
        SSRI = readDrugs(f, genList)
    print("Using drugs from {}".format(SSRI_FILE))

    # The additional list is optional: only a file that cannot be opened
    # or read is tolerated, any other error is a real problem.
    try:
        with open(MISC_FILE) as f:
            MISC = readDrugs(f, genList)
        typeNames.append("MISC")
        print("Using additional drugs from {}".format(MISC_FILE))
    except (IOError, OSError):
        MISC = None
    flatList = [item for sublist in genList for item in sublist]

    # Half-open [start, stop) slice of flatList covered by each drug list.
    # This allows us to understand which "types" are being used
    typeSpans = []
    offset = 0
    for generics in genList:
        typeSpans.append((offset, offset + len(generics)))
        offset += len(generics)

    # Patterns are compiled once instead of once per line. Raw strings keep
    # the regex escapes (\d, \s, ...) away from Python's own string escapes.
    headerRe = re.compile(r"^((\d|[A-Z])(\.|\)))?\s*([a-zA-Z',\.\-\*\d\[\]\(\) ]+)(:| WERE | IS | ARE |INCLUDED|INCLUDING)", re.I)
    histRe = re.compile(r'med(ical)?\s+hist(ory)?', re.I)
    medsRe = re.compile(r'medication|meds', re.I)
    dischRe = re.compile(r'disch(arge)?', re.I)
    admitRe = re.compile(r'admission|admitting|home|nh|nmeds|pre(\-|\s)?(hosp|op)|current|previous|outpatient|outpt|outside|^[^a-zA-Z]*med(ication)?(s)?', re.I)
    depressionRe = re.compile(r'depression', re.I)
    depressionMedsRe = re.compile(r'depression\s+med(ication)?(s)?', re.I)
    uncertainRe = re.compile(r'admission|discharge|transfer', re.I)

    # Header columns are derived from what was actually loaded so that the
    # header always has as many columns as the rows written below.
    columns = ["ROW_ID", "SUBJECT_ID", "HADM_ID", "HIST_FOUND", "DEPRESSION",
               "ADMIT_FOUND", "DIS_FOUND", "GEN_DEPRESS_MEDS_FOUND", "GROUP"]
    columns = columns + typeNames + flatList

    # No filtering by note category is done here; it is expected to be
    # applied upstream (in the SQL query that builds NOTES).

    # Write heads and notes to new doc
    with open(SUMMARY_FILE, 'a') as f_out:
        f_out.write(",".join('"{}"'.format(col) for col in columns) + "\n")

        # Parse each patient record
        print("Reading documents...")

        for note in NOTES.itertuples():
            if note.Index % 100 == 0:
                print("...index: {}. row_id: {}. subject_id: {}. hadm_id: {}. \n".format(note.Index, note.row_id, note.subject_id, note.hadm_id))
                sys.stdout.flush()

            # Reset some per-patient variables
            section = ""
            admitFound = 0 # admission note found
            dischargeFound = 0 # discharge summary found
            histFound = 0 # medical history found
            depressionHist = 0
            drugsAdmit = [0]*len(flatList)
            drugsDis = [0]*len(flatList)
            general_depression_drugs = 0

            # Read through lines sequentially
            # If this looks like a section header, start looking for drugs
            for line in note.text.split("\n"):

                # Searches for a section header based on heuristics
                if headerRe.search(line):
                    # Any header ends the current section; it stays empty
                    # unless the header is one of the three we care about.
                    newSection = ""

                    # Past Medical History Section
                    if histRe.search(line):
                        newSection = "hist"
                        histFound = 1

                    # Discharge Medication Section
                    elif medsRe.search(line) and dischRe.search(line):
                        newSection = "discharge"
                        dischargeFound = 1

                    # Admitting Medication Section
                    elif admitRe.search(line) \
                    and (section == "admit" or medsRe.search(line)):
                        newSection = "admit"
                        admitFound = 1

                    section = newSection

                # If in history section, search for depression
                if section == "hist":
                    if depressionRe.search(line):
                        depressionHist = 1

                # If in meds section, look at each line for specific drugs
                elif section == "admit":
                    drugsAdmit = addToDrugs(line, drugsAdmit, SSRI, flatList)
                    if MISC:
                        drugsAdmit = addToDrugs(line, drugsAdmit, MISC, flatList)

                    ## Section just has something like 'Depression meds'
                    if depressionMedsRe.search(line):
                        general_depression_drugs = 1

                ## Already in meds section, look at each line for specific drugs
                elif section == "discharge":
                    drugsDis = addToDrugs(line, drugsDis, SSRI, flatList)
                    if MISC:
                        drugsDis = addToDrugs(line, drugsDis, MISC, flatList)

                # A line with information which we are uncertain about...
                elif VERBOSE and medsRe.search(line) and uncertainRe.search(line):
                    print('?? {}'.format(line))

            admitHit = 1 in drugsAdmit
            dischargeHit = 1 in drugsDis

            # NOTE: notes matching none of the rules below also keep group 0.
            group = 0
            # Group 0: Patient has no medications on admission section (or no targeted meds)
            #          and medications on discharge from the list
            if dischargeFound == 1 and dischargeHit and (admitFound == 0 or not admitHit):
                group = 0

            # Group 1: Patient has a medications on admission section with no targeted meds
            #          and no medications on discharge
            elif admitFound == 1 and not admitHit and dischargeFound == 0 and general_depression_drugs == 0:
                group = 1

            # Group 2: Patient has medications on admission section, but none from the list
            #          and no medications on discharge from the list
            elif admitFound == 1 and not admitHit and dischargeFound == 1 and not dischargeHit and general_depression_drugs == 0:
                group = 2

            # Group 3: Patient has medications on admission (at least one from the list)
            elif admitHit:
                group = 3

            elif VERBOSE:
                print('Uncertain about group type for row_id = {}'.format(note.row_id))

            if VERBOSE:
                print('group is {}'.format(group))

            # Count the types of each drug (1 if any generic of that list
            # was found in the admission medications)
            member = [int(1 in drugsAdmit[start:stop]) for start, stop in typeSpans]

            # save items to csv
            fields = [note.row_id, note.subject_id, note.hadm_id, histFound,
                      depressionHist, admitFound, dischargeFound,
                      general_depression_drugs, group]
            fields = fields + member + drugsAdmit
            f_out.write(",".join(map(str, fields)) + "\n")

    # Print summary of analysis
    stoptime = time.time()
    elapsed = stoptime - starttime
    # A very fast run can measure 0 seconds on coarse clocks; avoid dividing by it.
    if elapsed > 0:
        rate = round(len(NOTES) / elapsed, 2)
    else:
        rate = float("inf")
    print("Done analyzing {} documents in {} seconds ({} docs/sec)".format(len(NOTES),
        round(elapsed, 2), rate))
    print("Summary file is in {}".format(os.getcwd()))