-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTestDocFile.py
790 lines (707 loc) · 26.3 KB
/
TestDocFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 23 15:56:17 2018
"""
import pandas as pd
import gensim.models as g
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential,Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import RMSprop
from keras.metrics import categorical_accuracy, mean_squared_error, mean_absolute_error, logcosh
from keras_contrib.utils import save_load_utils
#import jellyfish as jf
import numpy as np
import re
import json
import operator
import sys
# --- Global configuration / mutable module state ---------------------------
numDoc = 0            # index of the next embedding_matrix row to fill (row 0 stays all-zero for mask_zero padding)
num_docs = 6000000    # capacity (row count) of the embedding matrix
max_len = 30          # maximum number of sections per note fed to the sequence model
vector_dim = 100      # doc2vec vector dimensionality
num_LSTM_Units = 500  # hidden units per LSTM direction
dropout = 0.2         # recurrent dropout rate
embedding_matrix = np.zeros((num_docs, vector_dim))  # NOTE(review): 6e6 x 100 float64 is ~4.8 GB resident
dco2VecPath = ""      # directory of the saved doc2vec model (name looks like a typo for "doc2VecPath"; kept -- referenced below)
path = ""             # base directory for input notes, entity map, and saved Keras weights
modelName = 'KerasModel_lstm500_lr0.006_dropOut0.2_bSize100_epochs50_AllEmbeddings_Infer_1exp.h5'
fileName = sys.argv[1]      # clinical-note filename, passed on the command line
path_sectionMap = ""        # directory containing section_map.txt
myFunCount = 0              # progress counter incremented by myFunc
# ---------------------------------------------------------------------------
# Canonical section name -> list of surface forms (header strings) that can
# introduce that section in a clinical note.  Matching is case-insensitive
# (findWholeWord lowercases both sides), so mixed-case entries are cosmetic.
# Keyword order within a list matters: the first surface form that matches a
# paragraph wins (see getSpans), so more specific forms come first.
# FIX: added the comma that was missing after 'reason admission chief
# complaint' -- implicit string concatenation previously fused it with
# 'reason for hospitalization' into one unusable keyword, so neither header
# could ever match.
# ---------------------------------------------------------------------------
listOfSectionHeaders_Dict = {
    'activity': ['activity'],
    'admission_date': ['admission_date', 'admission date'],
    'allergies_and_adverse_reactions': [
        'Known Adverse and Allergic Drug Reactions ( if none, enter NKA )',
        'Allergic Disorder History',
        'ALLERGIES / ADVERSE REACTIONS',
        'allergy adverse reaction',
        'allergies and adverse reactions',
        'allergy and adverse reaction',
        'allergy adverse reaction',
        'adverse reactions',
        'adverse reaction',
        'history of allergies',
        'history of allergy',
        'history allergy',
        'allergies',
        'allergy'
    ],
    'assessment': ['clinical impression',
                   'initial impression',
                   'impression',
                   'interpretation'],
    'assessment_and_plan': [
        'impression / recommendations',
        'impression recommendation',
        'impression and recommendation',
        'impression and recommendations',
        'impression and plans',
        'impression and plan',
        'impression / plan',
        'impression plan',
        'plan and discussion',
        'plan discussion',
        'assessment_and_plan',
        'assessment and recommendations',
        'assessment and recommendation',
        'assessment recommendation',
        'assessment and plan',
        'assessment plan',
        'assessment / plan',
        'assessment plan',
        'Assessment',
        'clinical comments',
        'clinical comment',
        'a & p',
        'a / p',
        'a p'],
    'chief_complaint': ['identifying data/chief complaint',
                        'identify data chief complaint',
                        'identification / chief complaint',
                        'identification chief complaint',
                        'identification and chief complaint',
                        'reason for admission and consultation',
                        'reason admission consultation',
                        'reason for admission / chief complaint',
                        'reason for admission/chief complaint',
                        'reason for admission chief complaint',
                        'reason admission chief complaint',  # FIX: comma was missing here
                        'reason for hospitalization',
                        'reason hospitalization',
                        'reason for admission',
                        'reason admission',
                        'reason for visit',
                        'reason visit',
                        'chief concern',
                        'id / cc',
                        'id cc',
                        'here for a chief complaint of',
                        'here a chief complaint',
                        'chief complaint'],
    'diagnoses': [
        'PRINCIPAL DISCHARGE DIAGNOSIS',
        'discharge diagnoses',
        'ADMIT DIAGNOSIS',
        'OTHER DIAGNOSIS',
        'FINAL DIAGNOSES',
        'FINAL DIAGNOSIS',
        'Diagnosis',
        'diagnoses'],
    'discharge_condition': ['DISCHARGE CONDITION'],
    'discharge_instructions': ['discharge instructions',
                               'discharge instruction'],
    'family_history': ['family history'],
    'findings': [
        'findings at surgery',
        'finding at surgery',
        'finding surgery',
        'diagnostic findings',
        'diagnostic finding',
        'indications / findings',
        'indication finding',
        'diagnostic impression',
        'findings',
        'finding'],
    'follow_up': [
        'followup care plan',
        'return to clinic',
        'return clinic',
        'rtc',
        'followup instructions',
        'followup instruction',
        'follow-up appointments ; arrangements for care',
        'follow up appointment arrangement for care',
        'follow up appointment arrangement care',
        'FOLLOW-UP PLANS',
        'follow up appointment',
        'followup appointments',
        'followup appointment',
        'followup',
        'follow - up',
        'follow-up',
        'follow up'],
    'history_present_illness': [
        'History of Present Illness / Subjective Complaint',
        'hpi / interval history',
        'hpi interval history',
        'patient hpi',
        'summary of present illness',
        'summary present illness',
        'HPI / Subjective Complaint',
        'EVENTS/EXTENDED HISTORY OF PRESENT ILLNESS',
        'history / physical examination',
        'history physical examination',
        'in clinical history',
        'clinical history / indications',
        'clinical history indication',
        'clinical history',
        'issues briefly as following',
        'issue briefly as following',
        'current medical problems',
        'current medical problem',
        # 'indications',
        # 'indication',
        'patient history',
        'history of chronic illness',
        'history chronic illness',
        'clinical presentation',
        'history of present illness',
        'history present illness',
        'issues briefly as follows',
        'issue briefly as follow',
        'clinical indication',
        'history present illness',
        'history of the present illness',
        'interval history',
        'history present illness',
        'present illness',
        'hpi'],
    'history_source': [
        'historian',
        'history obtained from',
        'history obtain from',
        'history obtain',
        'HX obtained from',
        'hx obtain from',
        'hx obtain',
        'history source',
        'SOURCES OF INFORMATION',
        'source of information',
        'source information',
        'informant'],
    # 'source'],
    'hospital_course': ['course in the emergency department',
                        'course emergency department',
                        'BRIEF SUMMARY OF HOSPITAL COURSE',
                        'BRIEF RESUME OF HOSPITAL COURSE',
                        'Hospital Course by system',
                        'SUMMARY OF HOSPITAL COURSE',
                        'history / hospital course',
                        'history hospital course',
                        'brief hospital course',
                        'hospital course'],
    'laboratory_and_radiology_data': [
        'AVAILABLE PERTINENT LABORATORY & X - RAY FINDINGS',
        'Available Pertinent Laboratory & X-Ray Findings',
        'available pertinent laboratory x ray findings',
        'available pertinent laboratory x ray finding',
        'studies performed',
        'study perform',
        'diagnostic procedure',
        'laboratory and study data',
        'laboratory study data',
        'laboratory data / diagnostic studies',
        'laboratory data diagnostic study',
        'laboratory data',
        'diagnostics',
        'study data',
        'laboratory and radiographic studies',
        'laboratory and radiographic study',
        'laboratory radiographic study',
        'clinical data',
        'imaging procedure',
        'ancillary studies',
        'ancillary study',
        'Lab and radiological results',
        'lab and radiological result',
        'lab radiological result',
        'available lab and x-ray results',
        'available lab and x ray results',
        'available lab and x ray result',
        'available lab x ray result',
        'lab and imaging',
        'lab imaging',
        'laboratory and radiology data',
        'laboratory data / radiology',
        'laboratory data radiology',
        'comparison imaging',
        'other results',
        'other result',
        'LABORATORY STUDIES',
        'Laboratory data',
        'Other labs',
        'laboratory and radiology findings',
        'laboratory and radiology finding',
        'laboratory radiology finding',
        'Pertinent Results',
        'pertinent studies',
        'pertinent study',
        'PERTINENT LABORATORY VALUES ON PRESENTATION',
        'PERTINENT RADIOLOGY / IMAGING',
        'lab and radiological results',
        'lab and radiological result',
        'Labs / Radiology',
        'lab radiological result',
        'laboratory and x - ray data',
        'laboratory and x-ray data',
        'laboratory and x ray data',
        'laboratory x ray data',
        'laboratory and radiology data',
        'laboratory radiology data',
        'diagnostic tests and procedures',
        'diagnostic test and procedure',
        'diagnostic test procedure',
        'special studies',
        'special study',
        'preprocedure studies',
        'preprocedure study',
        'diagnostic studies',
        'diagnostic study',
        'Laboratory or imaging',
        'laboratory imaging',
        'comparison_studies',
        'comparison studies',
        'comparison study'
    ],
    'medications': [
        'ADMISSION MEDICATIONS',
        'MEDICATIONS / HERBS / SUPPLEMENTS',
        'medication herb supplement',
        'Medications administered',
        'medication administer',
        'MEDICATIONS ON ADMISSION',
        'CURRENT MEDICATIONS',
        'Last dose of Antibiotics',
        'Infusions',
        'Other ICU Medications',
        'Other Medications',
        'premedications',
        'premedication',
        'premorbid medications',
        'premorbid medication',
        'medications at vanderbilt',
        'medication at vanderbilt',
        'medication vanderbilt',
        'Medications Known to be Prescribed for or Used by the Patient',
        'Medications Known to be Prescribed for or Used by the Patient ( with dose , route , and frequency )',
        'most recent medication',
        'medications',
        'medication'],
    'discharge medications': ['discharge medications',
                              'discharge medication'],
    'past_medical_history': ['past medical history and review of systems',
                             'past medical history and review of system',
                             'past medical history review system',
                             'past medical problems',
                             'past medical problem',
                             'history of past illness',
                             'history past illness',
                             'past medical history',
                             'Past Medical / Surgical History',
                             'Other Past Medical History',
                             'PAST MEDICAL HISTORY',
                             'previous medical history',
                             'hematology / oncology history',
                             'hematology oncology history',
                             'history of general health',
                             'history general health',
                             'past medical history / past surgical history',
                             'past medical history past surgical history',
                             'medical problems',
                             'medical problem',
                             'significant past medical history',
                             'history of major illnesses and injuries',
                             'history of major illness and injury',
                             'history major illness injury',
                             'past med history',
                             'past hospitalization history',
                             'past medical and surgical history',
                             'past medical surgical history',
                             'brief medical history',
                             'Past Medical History / Problem List',
                             'Past Medical History/Problem List',
                             'past medical history problem list',
                             'past medical issues',
                             'past medical issue',
                             'past medical history / surgical history',
                             'past medical history surgical history',
                             'past infectious history',
                             'past medical history/family history',
                             'past medical history family history',
                             'Known Significant Medical Diagnoses and Conditions',
                             'past medical history',
                             'medical history',
                             'past history',
                             'illnesses',
                             'illness',
                             'pmhx',
                             'pmh'],
    'physical_examination': ['physical examination as compared to admission',
                             'physical examination as compare to admission',
                             'physical examination as compare admission',
                             'external examination',
                             'physical exam compared admission',
                             'PHYSICAL EXAM AT TIME OF ADMISSION',
                             'physical exam as compared to admission',
                             'physical exam as compare to admission',
                             'physical exam as compare admission',
                             "My key findings of this patient ' s physical exam are",
                             'my key finding of this patient physical exam be',
                             'my key finding this patient physical exam be',
                             'admission physical exam',
                             "I examined the patient and confirmed the House Staff ' s Admission Physical Exam",
                             'i examine the patient and confirm the house staff admission physical exam',
                             'i examine patient confirm house staff admission physical exam',
                             'examination on discharge',
                             'physicial examination',
                             'examination on discharge compared to admission',
                             'examination on discharge compare to admission',
                             'examination discharge compare admission',
                             'examination discharge',
                             'physical examination by organ systems',
                             'physical examination by organ system',
                             'physical examination organ system',
                             'physical findings',
                             'physical finding',
                             'physical exam compare admission',
                             'physical examination',
                             'PE on admission',
                             'PE on discharge',
                             'admission exam',
                             'admit exam',
                             'exam on admission',
                             'admission examination',
                             'physical exam',
                             'admission PE',
                             'examination',
                             'exam',
                             'pe'],
    'review_of_systems': ['systems review',
                          'system review',
                          'history of symptoms & diseases',
                          'history of symptom disease',
                          'history symptom disease',
                          'review of symptoms and diseases',
                          'review of symptom and disease',
                          'review symptom disease',
                          'cardiovascular review of systems',
                          'cardiovascular review of system',
                          'cardiovascular review system',
                          'cardiac review of systems',
                          'cardiac review of system',
                          'cardiac review system',
                          'social history / family history/review of systems',
                          'social history family history review of system',
                          'social history family history review system',
                          'review of systems',
                          'review of system',
                          'review system',
                          'ros'],
    'social_history': [
        'Social / Occupational History',
        'SOCIAL HISTORY']
}
def findWholeWord(w):
    """Return a bound ``search`` callable that finds ``w`` as a whole word
    either at the very start of the text or at the start of a line,
    case-insensitively."""
    at_start = r'(^\s*\b({0})\b)'.format(w)
    after_newline = r'([\r\n]+\s*\b({0})\b)'.format(w)
    return re.compile(at_start + '|' + after_newline, flags=re.IGNORECASE).search
def getparaSpans(keywords, para, index, key):
    """Return (start, end, key) for the first keyword matching para, else None.

    ``index`` is accepted for call-site symmetry but unused; the ``global
    paras`` declaration is likewise vestigial and kept for fidelity.
    """
    global paras
    lowered_para = para.lower()
    for candidate in keywords:
        hit = findWholeWord(candidate.lower())(lowered_para)
        if hit:
            start, end = hit.span()
            return (start, end, key)
def cleanClinicalNote(text):
    """Normalize a raw clinical note for header matching.

    Three sequential substitutions: clock times (e.g. "10:30 am") become
    TIMETOKEN, de-identification placeholders "[** ... **]" are stripped,
    and remaining numbers become NUM.
    """
    # Times first, so their digits are not consumed by the number pass.
    without_times = re.sub("\\b(1[012]|0[1-9]):([0-5][0-9])(\\s)?([Aa]|[pP])[mM]",
                           "TIMETOKEN", text)
    without_brackets = re.sub(r"\[\*\*(.*?)\*\*\]", "", without_times)
    return re.sub(r"[-+]?[0-9]*\.?[0-9]+", "NUM", without_brackets)
def paragrapher(text):
    """Split text into paragraphs on blank lines.

    Tries both CRLF ("\\r\\n\\r\\n") and LF ("\\n\\n") blank-line styles and
    keeps whichever split produced strictly more pieces.
    """
    crlf_split = re.split(r"\r\n\r\n", text)
    lf_split = re.split(r"\n\n", text)
    return lf_split if len(lf_split) > len(crlf_split) else crlf_split
def getSpans(paras):
    """Scan every paragraph for section-header keywords.

    Returns a list of (match_start, match_end, section_key, paragraph_index)
    tuples; keywords from ``unknownValues`` are tagged with the pseudo-section
    'Unknown'.  Duplicate tuples can be appended (see note below) -- the
    caller (preprocessAndGetSectionText) deduplicates with set().
    """
    someResult = []
    index = 0
    for para in paras:
        if (len(paras)>1):
            # Multi-paragraph note: probe known headers; the 'Unknown'
            # keywords are only scanned when a keyword did NOT match.
            # NOTE(review): the 'Unknown' scan sits inside the keyword loop,
            # so it re-runs for every non-matching keyword and appends the
            # same 'Unknown' tuples repeatedly -- presumably once per
            # paragraph was intended; confirm before restructuring.
            for key in listOfSectionHeaders:
                for keyword in listOfSectionHeaders_Dict.get(key):
                    gotSomething = findWholeWord(keyword.lower())(para.lower())
                    # gotSomething = getparaSpans(listOfSectionHeaders_Dict.get(key),para,index,key)
                    if gotSomething:
                        someResult.append((gotSomething.span()[0],gotSomething.span()[1],key,index))
                    if not gotSomething:
                        for unknownkeyword in unknownValues:
                            match_unknownkeyword = findWholeWord(unknownkeyword.lower())(para.lower())
                            if match_unknownkeyword:
                                someResult.append((match_unknownkeyword.span()[0],match_unknownkeyword.span()[1],'Unknown',index))
                                # break
        else:
            # Single-paragraph note: known headers and 'Unknown' keywords
            # are scanned independently of each other.
            for key in listOfSectionHeaders:
                for keyword in listOfSectionHeaders_Dict.get(key):
                    gotSomething = findWholeWord(keyword.lower())(para.lower())
                    # gotSomething = getparaSpans(listOfSectionHeaders_Dict.get(key),para,index,key)
                    if gotSomething:
                        someResult.append((gotSomething.span()[0],gotSomething.span()[1],key,index))
            for unknownkeyword in unknownValues:
                match_unknownkeyword = findWholeWord(unknownkeyword.lower())(para.lower())
                if match_unknownkeyword:
                    someResult.append((match_unknownkeyword.span()[0],match_unknownkeyword.span()[1],'Unknown',index))
        index=index+1
    return someResult
def unique_by_key(elements, key=None):
    """Deduplicate ``elements`` by ``key`` (identity when key is None).

    Later duplicates overwrite earlier ones; returns a dict values view in
    first-seen key order.
    """
    if key is None:
        # No key supplied: deduplicate on the elements themselves.
        key = lambda e: e
    deduped = {}
    for element in elements:
        deduped[key(element)] = element
    return deduped.values()
def preprocessAndGetSectionText(data):
    """Clean a note, split it into paragraphs, locate section headers, and
    return a dict mapping each section header to its accumulated text.

    The returned dict also carries the key
    'Order of Section Header Appearence' (sic) with the header sequence.
    Consecutive spans of the same section are merged; 'Unknown' spans are
    joined with the literal separator '**Unknown**'.
    """
    paras = paragrapher(cleanClinicalNote(data))
    allSpans = getSpans(paras)
    spansSorted = sorted(list(set(allSpans)), key=operator.itemgetter(3, 0, 1))
    # Drop header matches preceded by the "\r\n.\r" artifact.
    # FIX: iterate over a snapshot -- the original removed from the list it
    # was iterating, which silently skipped the element after each removal.
    for tup in list(spansSorted):
        if paras[tup[3]][tup[0]-3:tup[0]+1] == '\r\n.\r':
            spansSorted.remove(tup)
    # Keep one span per (start_offset, paragraph_index).
    unique_by_second_element = unique_by_key(spansSorted, key=operator.itemgetter(0, 3))
    spansSortedUnique = sorted(unique_by_second_element, key=operator.itemgetter(3, 0, 1))
    # Working copy: spans merged into a predecessor are removed from it so
    # the final header sequence has one entry per emitted section.
    spansSortedUniqueCopy = sorted(unique_by_second_element, key=operator.itemgetter(3, 0, 1))
    sectionText = {key: '' for key in listOfSectionHeaders+['Unknown']}
    for i in range(len(spansSortedUnique)):
        currentSpan = spansSortedUnique[i]
        currentSection = currentSpan[2]
        paraListIndex = currentSpan[3]
        currentSpanSI = currentSpan[0]
        currentSpanEI = currentSpan[1]
        # Text for this span runs to the next header in the same paragraph,
        # or to the end of the paragraph.
        if i < len(spansSortedUnique)-1 and spansSortedUnique[i+1][3] == paraListIndex:
            nextSpan = spansSortedUnique[i+1]
            nextSection = nextSpan[2]
            nextSpanSI = nextSpan[0]
        else:
            nextSpanSI = len(paras[paraListIndex])
            nextSection = None
        txt = paras[paraListIndex][currentSpanEI+1:nextSpanSI]
        if len(txt) > 4:  # ignore spans with (near-)empty bodies
            if currentSection == 'Unknown':
                if len(sectionText[currentSection]) == 0 and nextSection and nextSection == 'Unknown':
                    sectionText[currentSection] = txt
                    spansSortedUniqueCopy.remove(currentSpan)
                    continue
                else:
                    if nextSection and nextSection == 'Unknown':
                        sectionText[currentSection] = sectionText[currentSection]+' '+txt
                        spansSortedUniqueCopy.remove(currentSpan)
                    else:
                        # Close this Unknown run with the sentinel separator.
                        sectionText[currentSection] = sectionText[currentSection]+txt+'**Unknown**'
            else:
                if len(sectionText[currentSection]) == 0 and nextSection and nextSection == currentSection:
                    sectionText[currentSection] = txt
                    spansSortedUniqueCopy.remove(currentSpan)
                elif nextSection and nextSection == currentSection:
                    sectionText[currentSection] = sectionText[currentSection]+' '+txt
                    spansSortedUniqueCopy.remove(currentSpan)
                else:
                    sectionText[currentSection] = sectionText[currentSection]+' '+txt
        else:
            spansSortedUniqueCopy.remove(currentSpan)
        # Headerless paragraphs between this span and the next one belong to
        # the current section.
        if i < len(spansSortedUnique)-1 and spansSortedUnique[i+1][3]-paraListIndex > 1:
            sectionText[currentSection] = sectionText[currentSection]+' '+" ".join(paras[paraListIndex+1:spansSortedUnique[i+1][3]])
    sectionSequence = [tup[2] for tup in spansSortedUniqueCopy]
    # Second pass: sweep up text that precedes the first header of a
    # paragraph (or precedes the very first header in the note).
    paraIdx = 0
    for i in range(len(spansSortedUniqueCopy)):
        currSpan = spansSortedUniqueCopy[i]
        currSpanParaIdx = currSpan[3]
        if paraIdx != currSpanParaIdx:
            if i == 0:
                # Leading paragraphs before any header become 'Unknown'.
                txt = "_".join(paras[i:currSpanParaIdx])
                sectionText['Unknown'] = txt+'**Unknown**'+sectionText['Unknown']
                if currSpan[2] != 'Unknown':
                    sectionSequence = ['Unknown']+sectionSequence
            elif currSpan[0] != 0:
                # Text before a mid-paragraph header joins the previous section.
                prevSectionHeader = spansSortedUniqueCopy[i-1][2]
                sectionText[prevSectionHeader] = sectionText[prevSectionHeader]+' '+paras[currSpanParaIdx][0:currSpan[0]]
        paraIdx = currSpanParaIdx
    sectionText['Order of Section Header Appearence'] = sectionSequence
    return sectionText
# ---------------------------------------------------------------------------
# Build the 'Unknown' keyword pool from the external CLAMP section map
# (section_map.txt: tab separated, canonical section name in column 1).
# ---------------------------------------------------------------------------
listOfSectionHeaders = list(listOfSectionHeaders_Dict.keys())
clamp_Map = pd.read_table(path_sectionMap+'section_map.txt', sep="\t", header=None)
# canonical name -> list of all its surface forms (remaining columns).
listOfSectionHeaders_Dict_CLAMP = clamp_Map.set_index(1).iloc[0:].stack().groupby(level=0).apply(list).to_dict()
# Sections we model explicitly must not also be tagged 'Unknown'.
for k in listOfSectionHeaders:
    listOfSectionHeaders_Dict_CLAMP.pop(k, None)
# CLAMP sections deliberately excluded from the 'Unknown' pool.
# FIX: a missing comma previously fused 'report' and 'report_status' into
# the single key 'reportreport_status', so neither was actually removed.
removeKeys = ['anesthesia', 'carbon_copy', 'closing', 'complications',
              'condition',
              'counts',
              'data_base',
              'description',
              'estimated_blood_loss',
              'identifying_information',
              'instructions',
              'objective_data',
              'orders',
              'problem_list',
              'providers',
              'references',
              'reliability',
              'report',
              'report_status',
              'technique'
              ]
for k in removeKeys:
    listOfSectionHeaders_Dict_CLAMP.pop(k, None)
# Flatten the remaining surface forms into a single keyword list.
unknownValues = []
for value in listOfSectionHeaders_Dict_CLAMP.values():
    unknownValues = unknownValues + value
unknownValues.append('comments')
unknownValues.append('Date of Birth')
unknownValues.append('TO DO/PLAN')
# These would shadow assessment_and_plan headers.  NOTE(review): .remove
# raises ValueError if the map file lacks them -- presumably guaranteed.
unknownValues.remove('recommendations')
unknownValues.remove('Recommendations')
unknownValues.remove('recommendation')
unknownValues = list(set(unknownValues))
# Longest keyword first so the most specific form matches first in getSpans.
# Kept as sort-then-reverse: its ordering among equal-length keywords
# differs from sort(reverse=True), and match priority depends on it.
unknownValues.sort(key=lambda s: len(s))
unknownValues.reverse()
def getBatch(rootString):
    """Classify a note path: 'i2b2' for i2b2 corpora, else its 'batchNN'
    token, else None when neither is present."""
    if "i2b2" in rootString:
        return "i2b2"
    batch_token = re.search(r'batch\d+', rootString)
    if batch_token is not None:
        return batch_token.group(0)
def replaceNE(noteString, dictelmt):
    """Replace whole-word occurrences of dictelmt[0] with dictelmt[1].

    Best-effort: on any realistic failure (invalid regex built from the
    entity text, or a missing/mis-shaped dictelmt) the note is returned
    unchanged.  FIX: narrowed the original bare ``except`` so programming
    errors (e.g. NameError) are no longer silently swallowed.
    """
    try:
        noteString = re.sub(r'(\b({0})\b)'.format(dictelmt[0]), dictelmt[1], noteString)
    except (re.error, IndexError, KeyError, TypeError):
        return noteString
    return noteString
# Load the CLAMP filename -> named-entity map and pick this note's entry.
# FIX: use a context manager -- json.load(open(...)) left the file handle
# to be closed whenever the garbage collector got around to it.
with open(path + "CLamp_Filename2Entity_dict.json") as entities_fp:
    entitiesDict = json.load(entities_fp)
dictelmt = entitiesDict[fileName]
def myFunc(reqTuple):
    """Run NE replacement and section extraction for one note.

    ``reqTuple`` is (filename_key, note_text); returns
    {filename_key: section_text_dict}.  Prints a progress line every 1000
    calls via the module-level counter.
    """
    global dictelmt
    global myFunCount
    fileEntitiesMapKey = reqTuple[0]
    fileText = reqTuple[1]
    # FIX: the return value of replaceNE was previously discarded, so the
    # named-entity replacement never took effect (the commented-out call
    # site near the bottom of the file shows the intended assignment).
    fileText = replaceNE(fileText, dictelmt)
    tempDict = {fileEntitiesMapKey: preprocessAndGetSectionText(fileText)}
    myFunCount += 1
    if (myFunCount % 1000 == 0):
        print(str(myFunCount) + " preprocessAndGetSectionText run")
    return tempDict
def splitAndRemoveEmptys(text):
    """Tokenize ``text`` on common delimiters ('; ', ', ', '*', newline,
    '-', ':', whitespace) and drop the empty strings the split produces."""
    tokens = re.split('; |, |\*|\n|-|:|\s', text)
    return [token for token in tokens if token != '']
def createModelArchitecture(max_len,num_LSTM_Units,vector_dim,num_docs,embedding_matrix,dropout):
    """Build the BiLSTM-CRF section-labelling model.

    Stack: Input(max_len) -> frozen Embedding initialized from the
    precomputed doc2vec matrix (mask_zero=True, so padded index 0 is
    masked) -> Bidirectional LSTM -> TimeDistributed Dense(relu) -> CRF.
    Returns (model, crf); the caller compiles with crf.loss_function and
    loads saved weights, so this layer stack must not change shape.
    """
    doc_input = Input(shape=(max_len,),dtype='float32', name='doc_input')
    #
    ##np.savetxt('embedding_matrix.csv', embedding_matrix, delimiter=',')
    #
    print("Creating Embedding Layer...")
    #embedding layer intialized with the matrix created earlier
    embedded_doc_input = Embedding(output_dim=vector_dim, input_dim=num_docs,weights=[embedding_matrix], trainable=False,mask_zero=True)(doc_input)
    #
    print("Embedding Layer Created, Creating BiDirectional Layer...")
    model=(Bidirectional(LSTM(units=num_LSTM_Units, return_sequences=True,recurrent_dropout=dropout)))(embedded_doc_input) # variational biLSTM
#    model=(Bidirectional(LSTM(units=num_LSTM_Units, return_sequences=True,recurrent_dropout=dropout)))(model) # variational biLSTM
    #
    print("BiDirectional Layer Created, Creating TimeDistributed Layer...")
    model=(TimeDistributed(Dense(vector_dim, activation="relu")))(model) # a dense layer as suggested by neuralNer
    #
    print("TimeDistributed Layer Created, Creating CRF Layer...")
    # NOTE(review): CRF output size is max_len+1 (=31) although the
    # sectionHeaders label list defined below has 23 entries -- presumably
    # this matches the saved checkpoint; confirm before changing, since
    # load_all_weights requires identical layer shapes.
    crf = CRF(max_len+1) # CRF layer
    out = crf(model)
    print("CRF Layer Created, Training...")
    model = Model(doc_input, out)
    return model,crf
# CRF label set: a predicted class c is printed as sectionHeaders[c-1]
# (see the prediction loops below), i.e. class 0 is reserved for padding.
sectionHeaders = ['history_present_illness','activity','discharge_condition','past_medical_history','chief_complaint','follow_up','discharge_instructions','allergies_and_adverse_reactions','admission_date','hospital_course','findings','review_of_systems','family_history','laboratory_and_radiology_data','diagnoses','physical_examination','assessment_and_plan','social_history','medications','history_source','assessment','discharge medications','Unknown']
def printTextPredictionSection(model, X_Test):
    """Print the predicted section label for every timestep of each test
    sequence in ``X_Test``.

    FIX: the inner loop previously reused the outer loop variable ``i``,
    shadowing the sequence index; renamed to ``t`` for clarity.
    """
    for i in range(len(X_Test)):
        p = model.predict(np.array([X_Test[i]]))
        p = np.argmax(p, axis=-1)  # per-timestep class index
        for t in range(len(p[0])):
            # Class c maps to sectionHeaders[c-1]; 0 is the padding class.
            print('-- Belongs to Section --'+sectionHeaders[p[0][t]-1])
# --- Main script: read the note, section it, persist for inspection --------
with open(path+fileName, 'r') as myfile:
    data=myfile.read()
#Replacing the Named Entity with the entity type
#entitiesDict = json.load(open(path + "CLamp_filename2_entity_dict.json"))
#dictelmt = entitiesDict[fileName]
#data = replaceNE(data,dictelmt)
# Run the rule-based sectioner; result is {fileName: section -> text}.
fileCleanedTextDict = myFunc([fileName,data])
with open('fileCleanedTextDict.json', 'w') as fp:
    json.dump(fileCleanedTextDict, fp, sort_keys=True, indent=4)
#json.dump(fileCleanedTextDict,'fileCleanedTextDict.json')
#paras = re.split(r'[\n\n][\w\s]+[:]',data)
# Pretrained doc2vec model used below to infer one vector per section.
model_doc2vec= g.Doc2Vec.load(dco2VecPath+"model_dbow0_pretrainedpubWikiPMC_trained_vd100_full-36.bin")
# ---------------------------------------------------------------------------
# Infer a doc2vec vector for each section, in header-appearance order, and
# store it in embedding_matrix at row numDoc (row 0 stays zero -- it is the
# padding index masked by the Embedding layer).  noteVec accumulates the
# row indices that form this note's input sequence.
# ---------------------------------------------------------------------------
noteVec = []
listOfClinicalNotes_Doc2vec = []
sectionHeaderSequence = fileCleanedTextDict[fileName].get('Order of Section Header Appearence')
unknownTextList = fileCleanedTextDict[fileName].get("Unknown").split("**Unknown**")
unknownSectionCounter = 0
for sectionHeader in sectionHeaderSequence:
    numDoc += 1
    if sectionHeader == "Unknown":
        # FIX: the original rebound ``unknownTextList`` itself to one of its
        # elements here, so a second 'Unknown' section would index into the
        # characters of a string instead of the list of section texts.
        unknownText = unknownTextList[unknownSectionCounter]
        sectionTextSpacesRemoved = re.sub(' +', ' ', unknownText.lower())
        paraTokens = splitAndRemoveEmptys(sectionTextSpacesRemoved)
        vec = model_doc2vec.infer_vector(paraTokens, steps=20, alpha=0.025)
        noteVec.append(numDoc)
        embedding_matrix[numDoc] = vec
        unknownSectionCounter = unknownSectionCounter+1
        continue
    sectionTextSpacesRemoved = re.sub(' +', ' ', fileCleanedTextDict[fileName].get(sectionHeader).lower())
    paraTokens = splitAndRemoveEmptys(sectionTextSpacesRemoved)
    vec = model_doc2vec.infer_vector(paraTokens, steps=20, alpha=0.025)
    noteVec.append(numDoc)
    embedding_matrix[numDoc] = vec
listOfClinicalNotes_Doc2vec.append(noteVec)
# Pad this note's sequence of embedding-row indices to max_len (value 0 is
# the masked padding row of the embedding matrix).
X_test = pad_sequences(maxlen=max_len, dtype='int32',sequences=listOfClinicalNotes_Doc2vec, padding="post", value=0)
model1,crf1=createModelArchitecture(max_len,num_LSTM_Units,vector_dim,num_docs,embedding_matrix,dropout)
model1.compile(loss=crf1.loss_function, optimizer="rmsprop", metrics=[crf1.accuracy])
print("Model Architecture Created")
print("Loading Model")
#modelName = 'KerasModel'+'_lstm'+str(num_LSTM_Units)+'_lr'+str(learning_rate)+'_dropOut'+str(dropout)+'_bSize'+str(batch_size)+'_epochs'+str(epochs)+'_'+embeddingLayerFlag+'_'+embeddingFlag+'_'+str(experiment)+'exp.h5'
save_load_utils.load_all_weights(model1,path+modelName,include_optimizer=False)
# Predict a section label per timestep and print it next to the rule-based
# header sequence ("True" column); class c maps to sectionHeaders[c-1].
for i in range(len(X_test)):
    p = model1.predict(np.array([X_test[i]]))
    p = np.argmax(p, axis=-1)
    print("{:50}{}".format("True", "Pred"))
    print(80 * "=")
    for true,pred in zip(fileCleanedTextDict[fileName].get('Order of Section Header Appearence'),p[0]):
        print("{:50} {}".format(true, sectionHeaders[pred-1]))
#printTextPredictionSection(model1,X_test)