-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlhc2024_accepted_submissions.bib
1308 lines (1271 loc) · 75.2 KB
/
mlhc2024_accepted_submissions.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@proceedings{MLHC-2024,
  booktitle = {Proceedings of the 9th Machine Learning for Healthcare Conference},
  volume    = {252},
  editor    = {Kaivalya Deshpande and Madalina Fiterau and Shalmali Joshi and Zachary Lipton and Rajesh Ranganath and I{\~n}igo Urteaga},
  name      = {Machine Learning for Healthcare Conference},
  shortname = {MLHC},
  conference_number = {9},
  year      = {2024},
  start     = {2024-08-16},
  end       = {2024-08-17},
  published = {2024-11-25},
  url       = {http://mlforhc.org/},
  address   = {Toronto, Canada}
}
@inproceedings{mlhc2024_submission_139,
  title      = {{MALADE}: Orchestration of {LLM}-powered Agents with Retrieval Augmented Generation for Pharmacovigilance},
  author     = {Jihye Choi and Nils Palumbo and Prasad Chalasani and Matthew M. Engelhard and Somesh Jha and Anivarya Kumar and David Page},
  abstract   = {In the era of Large Language Models (LLMs), given their remarkable text understanding and generation abilities, there is an unprecedented opportunity to develop new, LLM-based methods for trustworthy medical knowledge synthesis, extraction, and summarization. This paper focuses on the problem of Pharmacovigilance (PhV), where the significance and challenges lie in identifying Adverse Drug Events (ADEs) from diverse text sources, such as medical literature, clinical notes, and drug labels. Unfortunately, this task is hindered by factors including variations in the terminologies of drugs and outcomes, and ADE descriptions often being buried in large amounts of narrative text. We present MALADE, the first effective collaborative multi-agent system powered by LLM with Retrieval Augmented Generation for ADE extraction from drug label data. This technique involves augmenting a query to an LLM with relevant information extracted from text resources and instructing the LLM to compose a response consistent with the augmented data. MALADE is a general LLM-agnostic architecture, and its unique capabilities are: (1) leveraging a variety of external sources, such as medical literature, drug labels, and FDA tools (e.g., OpenFDA drug information API), (2) extracting drug-outcome association in a structured format along with the strength of the association, and (3) providing explanations for established associations. Instantiated with GPT-4 Turbo or GPT-4o, and FDA drug label data, MALADE demonstrates its efficacy with an Area Under ROC Curve of 0.90 against the OMOP Ground Truth table of ADEs. Our implementation leverages the Langroid multi-agent LLM framework and can be found at https://github.com/jihyechoi77/malade.},
  openreview = {z0SuPuHD7q}
}
@inproceedings{mlhc2024_submission_91,
  title      = {Beyond Clinical Trials: Using Real World Evidence to Investigate Heterogeneous, Time-Varying Treatment Effects},
  author     = {Isabel Chien and Cliff Wong and Zelalem Gero and Jaspreet Bagga and Risa Ueno and Richard E. Turner and Roshanthi K. Weerasinghe and Brian Piening and Tristan Naumann and Carlo Bifulco and Hoifung Poon and Javier Gonz{\'a}lez Hern{\'a}ndez},
  abstract   = {Randomized controlled trials (RCTs), though essential for evaluating the efficacy of novel treatments, are costly and time-intensive. Due to strict eligibility criteria, RCTs may not adequately represent diverse patient populations, leading to equity issues and limited generalizability. Additionally, conventional trial analysis methods are limited by strict assumptions and biases. Real-world evidence (RWE) offers a promising avenue to explore treatment effects beyond trial settings, addressing gaps in representation and providing additional insights into patient outcomes over time. We introduce TRIALSCOPE-X and TRIALSCOPE-XL, machine learning pipelines designed to analyze treatment outcomes using RWE by mitigating biases that arise from observational data and addressing the limitations of conventional methods. We estimate causal, time-varying treatment effects across heterogeneous patient populations and varied timeframes. Preliminary results investigating the treatment benefit of Keytruda, a widely-used cancer immunotherapy drug, demonstrate the utility of our methods in evaluating treatment outcomes under novel settings and uncovering potential disparities. Our findings highlight the potential of RWE-based analysis to provide data-driven insights that inform evidence-based medicine and shape more inclusive and comprehensive clinical research, supplementing traditional clinical trial findings.},
  openreview = {wUruL3DqKB}
}
@inproceedings{mlhc2024_submission_26,
  title      = {General-Purpose Retrieval-Enhanced Medical Prediction Model Using Near-Infinite History},
  author     = {Junu Kim and Chaeeun Shim and Bosco Seong Kyu Yang and Chami Im and Sung Yoon Lim and Han-Gil Jeong and Edward Choi},
  abstract   = {Machine learning (ML) has recently shown promising results in medical predictions using electronic health records (EHRs). However, since ML models typically have a limited capability in terms of input sizes, selecting specific medical events from EHRs for use as input is necessary. This selection process, often relying on expert opinion, can cause bottlenecks in development. We propose Retrieval-Enhanced Medical prediction model (REMed) to address such challenges. REMed can essentially evaluate unlimited medical events, select the relevant ones, and make predictions. This allows for an unrestricted input size, eliminating the need for manual event selection. We verified these properties through experiments involving 27 clinical prediction tasks across four independent cohorts, where REMed outperformed the baselines. Notably, we found that the preferences of REMed align closely with those of medical experts. We expect our approach to significantly expedite the development of EHR prediction models by minimizing clinicians' need for manual involvement.},
  openreview = {veSe9eDmOz}
}
@inproceedings{mlhc2024_submission_182,
  title      = {Localising the Seizure Onset Zone from Single-Pulse Electrical Stimulation Responses with a {CNN} Transformer},
  author     = {Jamie Norris and Aswin Chari and Dorien van Blooijs and Gerald K. Cooray and Karl Friston and Martin M Tisdall and Richard E Rosch},
  abstract   = {Epilepsy is one of the most common neurological disorders, often requiring surgical intervention when medication fails to control seizures. For effective surgical outcomes, precise localisation of the epileptogenic focus - often approximated through the Seizure Onset Zone (SOZ) - is critical yet remains a challenge. Active probing through electrical stimulation is already standard clinical practice for identifying epileptogenic areas. Our study advances the application of deep learning for SOZ localisation using Single-Pulse Electrical Stimulation (SPES) responses, with two key contributions. Firstly, we implement an existing deep learning model to compare two SPES analysis paradigms: divergent and convergent. These paradigms evaluate outward and inward effective connections, respectively. We assess the generalisability of these models to unseen patients and electrode placements using held-out test sets. Our findings reveal a notable improvement in moving from a divergent (AUROC: 0.574) to a convergent approach (AUROC: 0.666), marking the first application of the latter in this context. Secondly, we demonstrate the efficacy of CNN Transformers with cross-channel attention in handling heterogeneous electrode placements, increasing the AUROC to 0.730. These findings represent a significant step in modelling patient-specific intracranial EEG electrode placements in SPES. Future work will explore integrating these models into clinical decision-making processes to bridge the gap between deep learning research and practical healthcare applications.},
  openreview = {vE2LTa2x83}
}
@inproceedings{mlhc2024_submission_163,
  title      = {{PRECISe}: Prototype-Reservation for Explainable Classification under Imbalanced and Scarce-Data Settings},
  author     = {Vaibhav Ganatra and Drishti Goel},
  abstract   = {Deep learning models used for medical image classification tasks are often constrained by the limited amount of training data along with severe class imbalance. Despite these problems, models should be explainable to enable human trust in the models' decisions to ensure wider adoption in high risk situations. In this paper, we propose PRECISe, an explainable-by-design model meticulously constructed to concurrently address all three challenges. Evaluation on 2 imbalanced medical image datasets reveals that PRECISe outperforms the current state-of-the-art methods on data efficient generalization to minority classes, achieving an accuracy of $\sim$87\% in detecting pneumonia in chest x-rays upon training on $<$60 images only. Additionally, a case study is presented to highlight the model's ability to produce easily interpretable predictions, reinforcing its practical utility and reliability for medical imaging tasks.},
  openreview = {u30e7ZQ0To}
}
@inproceedings{mlhc2024_submission_29,
  title      = {The Data Addition Dilemma},
  author     = {Judy Hanwen Shen and Inioluwa Deborah Raji and Irene Y. Chen},
  abstract   = {In many machine learning for healthcare tasks, standard datasets are constructed by amassing data across many, often fundamentally dissimilar, sources. But when does adding more data help, and when does it hinder progress on desired model outcomes in real-world settings? We identify this situation as the Data Addition Dilemma, demonstrating that adding training data in this multi-source scaling context can at times result in reduced overall accuracy, uncertain fairness outcomes and reduced worst-subgroup performance. We find that this possibly arises from an empirically observed trade-off between model performance improvements due to data scaling and model deterioration from distribution shift. We thus establish baseline strategies for navigating this dilemma, introducing distribution shift heuristics to guide decision-making for which data sources to add in order to yield the expected model performance improvements. We conclude with a discussion of the required considerations for data collection and suggestions for studying data composition and scale in the age of increasingly larger models.},
  openreview = {s8WSOR8w3n}
}
@inproceedings{mlhc2024_submission_25,
  title      = {Benchmarking Reliability of Deep Learning Models for Pathological Gait Classification},
  author     = {Abhishek Jaiswal and Nisheeth Srivastava},
  abstract   = {Early detection of neurodegenerative disorders is an important open problem, since early diagnosis and treatment may yield a better prognosis. Researchers have recently sought to leverage advances in machine learning algorithms to detect symptoms of altered gait, possibly corresponding to the emergence of neurodegenerative etiologies. However, while several claims of positive and accurate detection have been made in the recent literature, using a variety of sensors and algorithms, solutions are far from being realized in practice. This paper analyzes existing approaches to identify gaps inhibiting translation. Using a set of experiments across three Kinect-simulated and one real Parkinson's patient datasets, we highlight possible sources of errors and generalization failures in these approaches. Based on these observations, we propose our strong baseline called Asynchronous Multi-Stream Graph Convolutional Network (AMS-GCN) that can reliably differentiate multiple categories of pathological gaits across datasets.},
  openreview = {pCH5UQrKuZ}
}
@inproceedings{mlhc2024_submission_8,
  title      = {To which reference class do you belong? Measuring racial fairness of reference classes with normative modeling},
  author     = {Saige Rutherford and Thomas Wolfers and Charlotte Fraza and Nathaniel G. Harnett and Christian Beckmann and Henricus G. Ruhe and Andre Marquand},
  abstract   = {Reference classes in healthcare establish healthy norms, such as pediatric growth charts of height and weight, and are used to chart deviations from these norms which represent potential clinical risk. How the demographics of the reference class influence clinical interpretation of deviations is unknown. Using normative modeling, a method for building reference classes, we evaluate the fairness (racial bias) in reference models of structural brain images that are widely used in psychiatry and neurology. We test whether including ``race'' in the model creates fairer models. We predict self-reported race using the deviation scores from three different reference class normative models to better understand bias in an integrated, multivariate sense. Across all these tasks, we uncover racial disparities that are not easily addressed with existing data or commonly used modeling techniques. Our work suggests that deviations from the norm could be due to demographic mismatch with the reference class, and assigning clinical meaning to these deviations should be done with caution. Our approach also suggests that acquiring more representative samples is an urgent research priority.},
  openreview = {mako7lcwWF}
}
@inproceedings{mlhc2024_submission_151,
  title      = {Direct Preference Optimization for Suppressing Hallucinated Prior Exams in Radiology Report Generation},
  author     = {Oishi Banerjee and Hong-Yu Zhou and Kay Wu and Subathra Adithan and Stephen Kwak and Pranav Rajpurkar},
  abstract   = {Recent advances in generative vision-language models (VLMs) have exciting potential implications for AI in radiology, yet VLMs are also known to produce hallucinations, nonsensical text, and other unwanted behaviors that can waste clinicians' time and cause patient harm. Drawing on recent work on direct preference optimization (DPO), we propose a simple method for modifying the behavior of pretrained VLMs performing radiology report generation by suppressing unwanted types of generations. We apply our method to the prevention of hallucinations of prior exams, addressing a long-established problem behavior in models performing chest X-ray report generation. Across our experiments, we find that DPO fine-tuning achieves a 3.2--4.8x reduction in lines hallucinating prior exams while maintaining model performance on clinical accuracy metrics. Our work is, to the best of our knowledge, the first work to apply DPO to medical VLMs, providing a data- and compute-efficient way to suppress problem behaviors while maintaining overall clinical accuracy.},
  openreview = {kvYYP1LfRq}
}
@inproceedings{mlhc2024_submission_195,
  title      = {{MedAutoCorrect}: Image-Conditioned Autocorrection in Medical Reporting},
  author     = {Arnold Caleb Asiimwe and Didac Suris Coll-Vinent and Pranav Rajpurkar and Carl Vondrick},
  abstract   = {In medical reporting, the accuracy of radiological reports, whether generated by humans or machine learning algorithms, is critical. We tackle a new task in this paper: image-conditioned autocorrection of inaccuracies within these reports. Using the MIMIC-CXR dataset, we first intentionally introduce a diverse range of errors into reports. Subsequently, we propose a two-stage framework capable of pinpointing these errors and then making corrections, simulating an autocorrection process. This method aims to address the shortcomings of existing automated medical reporting systems, like factual errors and incorrect conclusions, enhancing report reliability in vital healthcare applications. Importantly, our approach could serve as a guardrail, ensuring the accuracy and trustworthiness of automated report generation. Experiments on established datasets and state of the art report generation models validate this method's potential in correcting medical reporting errors.},
  openreview = {iW9ItiwxyC}
}
@inproceedings{mlhc2024_submission_13,
  title      = {Network-Assisted Mediation Analysis with High-Dimensional Neuroimaging Mediators},
  author     = {Baoyi Shi and Ying Liu and Shanghong Xie and Xi Zhu and Yuanjia Wang},
  abstract   = {Mediation analysis is a widely used statistical approach to estimate the causal pathways through which an exposure affects an outcome via intermediate variables, i.e., mediators. In many applications, high-dimensional correlated biomarkers are potential mediators, posing challenges to standard mediation analysis approaches. However, some of these biomarkers, such as neuroimaging measures across brain regions, often exhibit hierarchical network structures that can be leveraged to advance mediation analysis. In this paper, we aim to study how brain cortical thickness, characterized by a star-shaped hierarchical network structure, mediates the effect of maternal smoking on children's cognitive abilities within the adolescent brain cognitive development (ABCD) study. We propose a network-assisted mediation analysis approach based on a conditional Gaussian graphical model to account for the star-shaped network structure of neuroimaging mediators. Within our framework, the joint indirect effect of these mediators is decomposed into the indirect effect through hub mediators and the indirect effects solely through each leaf mediator. This decomposition provides mediator-specific insights and informs efficient intervention designs. Additionally, after accounting for hub mediators, the indirect effects solely through each leaf mediator can be identified and evaluated individually, thereby addressing the challenges of high-dimensional correlated mediators. In our study, our proposed approach identifies a brain region as a significant leaf mediator, a finding that existing approaches cannot discover.},
  openreview = {dZatjnB77L}
}
@inproceedings{mlhc2024_submission_66,
  title      = {A Comprehensive View of Personalized Federated Learning on Heterogeneous Clinical Datasets},
  author     = {Fatemeh Tavakoli and D. B. Emerson and Sana Ayromlou and John Taylor Jewell and Amrit Krishnan and Yuchong Zhang and Amol Verma and Fahad Razak},
  abstract   = {Federated learning (FL) is increasingly being recognized as a key approach to overcoming the data silos that so frequently obstruct the training and deployment of machine-learning models in clinical settings. This work contributes to a growing body of FL research specifically focused on clinical applications along three important directions. First, we expand the FLamby benchmark (du Terrail et al., 2022a) to include a comprehensive evaluation of personalized FL methods and demonstrate substantive performance improvements over the original results. Next, we advocate for a comprehensive checkpointing and evaluation framework for FL to reflect practical settings and provide multiple comparison baselines. To this end, an open-source library aimed at making FL experimentation simpler and more reproducible is released. Finally, we propose an important ablation of PerFCL (Zhang et al., 2022). This ablation results in a natural extension of FENDA (Kim et al., 2016) to the FL setting. Experiments conducted on the FLamby benchmark and GEMINI datasets (Verma et al., 2017) show that the proposed approach is robust to heterogeneous clinical data and often outperforms existing global and personalized FL techniques, including PerFCL.},
  openreview = {btijACJ4QU}
}
@inproceedings{mlhc2024_submission_174,
  title      = {{LLMSYN}: Generating Synthetic Electronic Health Records Without Patient-Level Data},
  author     = {Yijie Hao and Huan He and Joyce C. Ho},
  abstract   = {Recent advancements in large language models (LLMs) have shown promise in tasks like question answering, text summarization, and code generation. However, their effectiveness within the healthcare sector remains uncertain. This study investigates LLMs' potential in generating synthetic Electronic Health Records (EHRs) by assessing their ability to produce structured data. Unfortunately, our preliminary results indicate that employing LLMs directly resulted in poor statistical similarity and utility. Feeding real-world dataset to LLMs could mitigate this issue, but privacy concerns were raised when uploading patients' information to the LLM API. To address these challenges and unleash the potential of LLMs in health data science, we present a new generation pipeline called LLMSYN. This pipeline utilizes only high-level statistical information from datasets and publicly available medical knowledge. The results demonstrate that the generated EHRs by LLMSYN exhibit improved statistical similarity and utility in downstream tasks, achieving predictive performance comparable to training with real data, while presenting minimal privacy risks. Our findings suggest that LLMSYN offers a promising approach to enhance the utility of LLM models in synthetic structured EHR generation.},
  openreview = {Yd3TDXmYVX}
}
@inproceedings{mlhc2024_submission_184,
  title      = {{XDT}-{CXR}: Investigating Cross-Disease Transferability in Zero-Shot Binary Classification of Chest X-Rays},
  author     = {Umaima Rahman and Abhishek Basu and Muhammad Uzair Khattak and Aniq Ur Rahman},
  abstract   = {This study explores the concept of cross-disease transferability (XDT) in medical imaging, focusing on the potential of binary classifiers trained on one disease to perform zero-shot classification on another disease affecting the same organ. Utilizing chest X-rays (CXR) as the primary modality, we investigate whether a model trained on one pulmonary disease can make predictions about another novel pulmonary disease, a scenario with significant implications for medical settings with limited data on emerging diseases. The XDT framework leverages the embedding space of a vision encoder, which, through kernel transformation, aids in distinguishing between diseased and non-diseased classes in the latent space. This capability is especially beneficial in resource-limited environments or in regions with low prevalence of certain diseases, where conventional diagnostic practices may fail. However, the XDT framework is currently limited to binary classification, determining only the presence or absence of a disease rather than differentiating among multiple diseases. This limitation underscores the supplementary role of XDT to traditional diagnostic tests in clinical settings. Furthermore, results show that XDT-CXR as a framework is able to make better predictions compared to other zero-shot learning (ZSL) baselines.},
  openreview = {XvGhPugrKF}
}
@inproceedings{mlhc2024_submission_64,
  title      = {Mixed Type Multimorbidity Variational Autoencoder: A Deep Generative Model for Multimorbidity Analysis},
  author     = {Woojung Kim and Paul A. Jenkins and Christopher Yau},
  abstract   = {This paper introduces the Mixed Type Multimorbidity Variational Autoencoder ($\text{M}^{3}$VAE), a deep probabilistic generative model developed for supervised dimensionality reduction in the context of multimorbidity analysis. The model is designed to overcome the limitations of purely supervised or unsupervised approaches in this field. $\text{M}^{3}$VAE focuses on identifying latent representations of mixed-type health-related attributes essential for predicting patient survival outcomes. It integrates datasets with multiple modalities (by which we mean data of multiple types), encompassing health measurements, demographic details, and (potentially censored) survival outcomes. A key feature of $\text{M}^{3}$VAE is its ability to reconstruct latent representations that exhibit clustering patterns, thereby revealing important patterns in disease co-occurrence. This functionality provides insights for understanding and predicting health outcomes. The efficacy of $\text{M}^{3}$VAE has been demonstrated through experiments with both synthetic and real-world electronic health record data, showing its capability in identifying interpretable morbidity groupings related to future survival outcomes.},
  openreview = {XHifMs5JkN}
}
@inproceedings{mlhc2024_submission_45,
  title      = {Ne{RF}-{US}: Removing Ultrasound Imaging Artifacts from Neural Radiance Fields in the Wild},
  author     = {Rishit Dagli and Atsuhiro Hibi and Rahul Krishnan and Pascal N Tyrrell},
  abstract   = {Current methods for performing 3D reconstruction and novel view synthesis (NVS) in ultrasound imaging data often face severe artifacts when training NeRF-based approaches. The artifacts produced by current approaches differ from NeRF floaters in general scenes because of the unique nature of ultrasound capture. Furthermore, existing models fail to produce reasonable 3D reconstructions when ultrasound data is captured or obtained casually in uncontrolled environments, which is common in clinical settings. Consequently, existing reconstruction and NVS methods struggle to handle ultrasound motion, fail to capture intricate details, and cannot model transparent and reflective surfaces. In this work, we introduced NeRF-US, which incorporates 3D-geometry guidance for border probability and scattering density into NeRF training, while also utilizing ultrasound-specific rendering over traditional volume rendering. These 3D priors are learned through a diffusion model. Through experiments conducted on our new ``Ultrasound in the Wild'' dataset, we observed accurate, clinically plausible, artifact-free reconstructions.},
  openreview = {WNwMLWvzwB}
}
@inproceedings{mlhc2024_submission_194,
  title      = {Leveraging {LLM}s for Multimodal Medical Time Series Analysis},
  author     = {Nimeesha Chan and Felix Parker and William C Bennett and Tianyi Wu and Mung Yao Jia and James Fackler and Kimia Ghobadi},
  abstract   = {The complexity and heterogeneity of data in many real-world applications pose significant challenges for traditional machine learning and signal processing techniques. For instance, in medicine, effective analysis of diverse physiological signals is crucial for patient monitoring and clinical decision-making and yet highly challenging. We introduce MedTsLLM, a general multimodal large language model (LLM) framework that effectively integrates time series data and rich contextual information in the form of text to analyze physiological signals, performing three tasks with clinical relevance: semantic segmentation, boundary detection, and anomaly detection in time series. These critical tasks enable deeper analysis of physiological signals and can provide actionable insights for clinicians. We utilize a reprogramming layer to align embeddings of time series patches with a pretrained LLM's embedding space and make effective use of raw time series, in conjunction with textual context. Given the multivariate nature of medical datasets, we develop methods to handle multiple covariates. We additionally tailor the text prompt to include patient-specific information. Our model outperforms state-of-the-art baselines, including deep learning models, other LLMs, and clinical methods across multiple medical domains, specifically electrocardiograms and respiratory waveforms. MedTsLLM presents a promising step towards harnessing the power of LLMs for medical time series analysis that can elevate data-driven tools for clinicians and improve patient outcomes.},
  openreview = {W4ZxKk14HM}
}
@inproceedings{mlhc2024_submission_131,
  title      = {Automatically Extracting Numerical Results from Randomized Controlled Trials with Large Language Models},
  author     = {Hye Sun Yun and David Pogrebitskiy and Iain James Marshall and Byron C Wallace},
  abstract   = {Meta-analyses statistically aggregate the findings of different randomized controlled trials (RCTs) to assess treatment effectiveness. Because this yields robust estimates of treatment effectiveness, results from meta-analyses are considered the strongest form of evidence. However, rigorous evidence syntheses are time-consuming and labor-intensive, requiring manual extraction of data from individual trials to be synthesized. Ideally, language technologies would permit fully automatic meta-analysis, on demand. This requires accurately extracting numerical results from individual trials, which has been beyond the capabilities of natural language processing (NLP) models to date. In this work, we evaluate whether modern large language models (LLMs) can reliably perform this task. We annotate (and release) a modest but granular evaluation dataset of clinical trial reports with numerical findings attached to interventions, comparators, and outcomes. Using this dataset, we evaluate the performance of seven LLMs applied zero-shot for the task of conditionally extracting numerical findings from trial reports. We find that massive LLMs that can accommodate lengthy inputs are tantalizingly close to realizing fully automatic meta-analysis, especially for dichotomous (binary) outcomes (e.g., mortality). However, LLMs---including ones trained on biomedical texts---perform poorly when the outcome measures are complex and tallying the results requires inference. This work charts a path toward fully automatic meta-analysis of RCTs via LLMs, while also highlighting the limitations of existing models for this aim.},
  openreview = {VUiZ69NyST}
}
% Fixed: glued initials "B.A." -> "B. A." (BibTeX treats "B.A." as a single name token);
% hyphen used as a dash after "(EBCL)" replaced with an em dash.
@inproceedings{mlhc2024_submission_156,
  title      = {Event-Based Contrastive Learning for Medical Time Series},
  author     = {Nassim Oufattole and Hyewon Jeong and Matthew B. A. McDermott and Aparna Balagopalan and Bryan Jangeesingh and Marzyeh Ghassemi and Collin Stultz},
  abstract   = {In clinical practice, one often needs to identify whether a patient is at high risk of adverse outcomes after some key medical event. For example, quantifying the risk of adverse outcomes after an acute cardiovascular event helps healthcare providers identify those patients at the highest risk of poor outcomes; i.e., patients who benefit from invasive therapies that can lower their risk. Assessing the risk of adverse outcomes, however, is challenging due to the complexity, variability, and heterogeneity of longitudinal medical data, especially for individuals suffering from chronic diseases like heart failure. In this paper, we introduce Event-Based Contrastive Learning (EBCL)---a method for learning embeddings of heterogeneous patient data that preserves temporal information before and after key index events. We demonstrate that EBCL can be used to construct models that yield improved performance on important downstream tasks relative to other pretraining methods. We develop and test the method using a cohort of heart failure patients obtained from a large hospital network and the publicly available MIMIC-IV dataset consisting of patients in an intensive care unit at a large tertiary care center. On both cohorts, EBCL pretraining yields models that are performant with respect to a number of downstream tasks, including mortality, hospital readmission, and length of stay. In addition, unsupervised EBCL embeddings effectively cluster heart failure patients into subgroups with distinct outcomes, thereby providing information that helps identify new heart failure phenotypes. The contrastive framework around the index event can be adapted to a wide array of time-series datasets and provides information that can be used to guide personalized care.},
  openreview = {TEG1wypeoD},
}
% Fixed: Markdown emphasis *distinct* converted to LaTeX \emph{distinct} (field text is
% consumed by LaTeX, where the asterisks would print literally); missing period after
% initial in "Andrew J. Goodwin"; hyphen-as-dash after "real-time" replaced with em dash.
@inproceedings{mlhc2024_submission_3,
  title      = {Needles in Needle Stacks: Meaningful Clinical Information Buried in Noisy Sensor Data},
  author     = {Sujay Nagaraj and Andrew J. Goodwin and Dmytro Lopushanskyy and Sebastian David Goodfellow and Danny Eytan and Hadrian Balaci and Robert Greer and Anand Jayarajan and Azadeh Assadi and Mjaye Leslie Mazwi and Anna Goldenberg},
  abstract   = {Central Venous Lines (C-Lines) and Arterial Lines (A-Lines) are routinely used in the Critical Care Unit (CCU) for blood sampling, medication administration, and high-frequency blood pressure measurement. Judiciously accessing these lines is important, as over-utilization is associated with significant in-hospital morbidity and mortality. Documenting the frequency of line-access is an important step in reducing these adverse outcomes. Unfortunately, the current gold-standard for documentation is manual and subject to error, omission, and bias. The high-frequency blood pressure waveform data from sensors in these lines are often noisy and full of artifacts. Standard approaches in signal processing remove noise artifacts before meaningful analysis. However, from bedside observations, we characterized a \emph{distinct} artifact that occurs during each instance of C-Line or A-Line use. These artifacts are buried amongst physiological waveform and extraneous noise. We focus on Machine Learning (ML) models that can detect these artifacts from waveform data in real-time---finding needles in needle stacks, in order to automate the documentation of line-access. We built and evaluated ML classifiers running in real-time at a major children's hospital to achieve this goal. We demonstrate the utility of these tools for reducing documentation burden, increasing available information for bedside clinicians, and informing unit-level initiatives to improve patient safety.},
  openreview = {SPgoUgKr7i},
}
% Fixed: "Yugang jia" -- a lowercase surname in First-Last form is parsed by BibTeX as a
% von-particle (first name "Yugang", von "jia", empty last name), breaking sorting and
% rendering; capitalized to "Yugang Jia".
@inproceedings{mlhc2024_submission_161,
  title      = {Selective Fine-tuning on {LLM}-labeled Data May Reduce Reliance on Human Annotation: A Case Study Using Schedule-of-Event Table Detection},
  author     = {Bhawesh Kumar and Jonathan Amar and Eric Yang and Nan Li and Yugang Jia},
  abstract   = {Large Language Models (LLMs) have demonstrated their efficacy across a broad spectrum of tasks in healthcare applications. However, often LLMs need to be fine-tuned on task specific expert-annotated data to achieve optimal performance, which can be expensive and time consuming. In this study, we fine-tune PaLM-2 with parameter efficient fine-tuning (PEFT) using noisy labels obtained from Gemini-pro 1.0 for the detection of Schedule-of-Event (SoE) tables, which specify care plan in clinical trial protocols. We introduce a filtering mechanism to select high-confidence labels for this table classification task, thereby reducing the noise in the auto-generated labels. We find that the fine-tuned PaLM-2 with filtered labels outperforms Gemini Pro 1.0 and other LLMs on this task and achieves performance close to PaLM-2 fine-tuned on non-expert human annotations. Our results show that leveraging LLM-generated labels, coupled with strategic filtering can be a viable and cost-effective strategy for improving LLM performance on specialized tasks, especially in domains where expert annotations are scarce, expensive, or time-consuming to obtain.},
  openreview = {NNYrPGrpJ1},
}
% MLHC 2024 accepted submission (OpenReview: LvEdt6YqbT).
@inproceedings{mlhc2024_submission_69,
  title      = {A {LUPI} distillation-based approach: Application to predicting Proximal Junctional Kyphosis},
  author     = {Yun Chao Lin and Andrea Clark-Sevilla and Rohith Ravindranath and Fthimnir Hassan and Justin Reyes and Joseph Lombardi and Lawrence G. Lenke and Ansaf Salleb-Aouissi},
  abstract   = {We propose a learning algorithm called XGBoost+, a modified version of the extreme gradient boosting algorithm (XGBoost). The new algorithm utilizes privileged information (PI), data collected after inference time. XGBoost+ incorporates PI into a distillation framework for XGBoost. We also evaluate our proposed method on a real-world clinical dataset about Proximal Junctional Kyphosis (PJK). Our approach outperforms vanilla XGBoost, SVM, and SVM+ on various datasets. Our approach showcases the advantage of using privileged information to improve the performance of machine learning models in healthcare, where data after inference time can be leveraged to build better models.},
  openreview = {LvEdt6YqbT},
}
% MLHC 2024 accepted submission (OpenReview: LdQFhJSgPR).
@inproceedings{mlhc2024_submission_82,
  title      = {{CORE}-{BEHRT}: A Carefully Optimized and Rigorously Evaluated {BEHRT}},
  author     = {Mikkel Fruelund Odgaard and Kiril Vadimovic Klein and Martin Sillesen and Sanne M{\o}ller Thysen and Espen Jimenez-Solem and Mads Nielsen},
  abstract   = {The widespread adoption of Electronic Health Records (EHR) has significantly increased the amount of available healthcare data. This has allowed models inspired by Natural Language Processing (NLP) and Computer Vision, which scale exceptionally well, to be used in EHR research. Particularly, BERT-based models have surged in popularity following the release of BEHRT and Med-BERT. Subsequent models have largely built on these foundations despite the fundamental design choices of these pioneering models remaining underexplored. Through incremental optimization, we study BERT-based EHR modeling and isolate the sources of improvement for key design choices, giving us insights into the effect of data representation, individual technical components, and training procedure. Evaluating this across a set of generic tasks (death, pain treatment, and general infection), we showed that improving data representation can increase the average downstream performance from 0.785 to 0.797 AUROC ($p < 10^{-7}$), primarily when including medication and timestamps. Improving the architecture and training protocol on top of this increased average downstream performance to 0.801 AUROC ($p < 10^{-7}$). We then demonstrated the consistency of our optimization through a rigorous evaluation across 25 diverse clinical prediction tasks. We observed significant performance increases in 17 out of 25 tasks and improvements in 24 tasks, highlighting the generalizability of our results. Our findings provide a strong foundation for future work and aim to increase the trustworthiness of BERT-based EHR models.},
  openreview = {LdQFhJSgPR},
}
% Fixed: I\~nigo -> I{\~n}igo (accent must be a braced BibTeX "special character" for
% correct sorting/labeling); straight double quotes around "noisy labels" replaced with
% LaTeX ``...'' quoting.
@inproceedings{mlhc2024_submission_44,
  title      = {Minimax Risk Classifiers for Mislabeled Data: a Study on Patient Outcome Prediction Tasks},
  author     = {Lucia Filippozzi and Santiago Mazuelas and I{\~n}igo Urteaga},
  abstract   = {Healthcare datasets are often impacted by incorrect or mislabeled data, due to imperfect annotations, data collection issues, ambiguity, and subjective interpretations. Incorrectly classified data, referred to as ``noisy labels'', can significantly degrade the performance of supervised learning models. Namely, noisy labels hinder the algorithm's ability to accurately capture the true underlying patterns from observed data. More importantly, evaluating the performance of a classifier when only noisy test labels are available is a significant complication. We hereby tackle the challenge of trusting the labelling process both in training and testing, as noisy patient outcome labels in healthcare raise methodological and ethical considerations. We propose a novel adaptation of Minimax Risk Classifiers (MRCs) for data subject to noisy labels, both in training and evaluation. We show that the upper bound of the MRC's expected loss can serve as a useful estimator for the classifier's performance, especially in situations where clean test data is not available. We demonstrate the benefits of the proposed methodology in healthcare tasks where patient outcomes are predicted from mislabeled data. The proposed technique is accurate and stable, avoiding overly optimistic assessments of prediction error, a significantly harmful burden in patient outcome prediction tasks in healthcare.},
  openreview = {LXcAXGtIkx},
}
% Fixed: unescaped percent sign in "approximately 20%" -- when this field is emitted into
% a LaTeX document, % starts a comment and silently swallows the rest of the line;
% escaped as 20\%.
@inproceedings{mlhc2024_submission_179,
  title      = {Predicting Long-Term Allograft Survival in Liver Transplant Recipients},
  author     = {Xiang Gao and Michael Cooper and Maryam Naghibzadeh and Amirhossein Azhie and Mamatha Bhat and Rahul Krishnan},
  abstract   = {Liver allograft failure occurs in approximately 20\% of liver transplant recipients within five years post-transplant, leading to mortality or the need for retransplantation. Providing an accurate and interpretable model for individualized risk estimation of graft failure is essential for improving post-transplant care. To this end, we introduce the Model for Allograft Survival (MAS), a simple linear risk score that outperforms other advanced survival models. Using longitudinal patient follow-up data from the United States (U.S.), we develop our models on 82,959 liver transplant recipients and conduct multi-site evaluations on 11 regions. Additionally, by testing on a separate non-U.S. cohort, we explore the out-of-distribution generalization performance of various models without additional fine-tuning, a crucial property for clinical deployment. We find that the most complex models are also the ones most vulnerable to distribution shifts despite achieving the best in-distribution performance. Our findings not only provide a strong risk score for predicting long-term graft failure but also suggest that the routine machine learning pipeline with only in-distribution held-out validation could create harmful consequences for patients at deployment.},
  openreview = {JhvatSLKhG},
}
% MLHC 2024 accepted submission (OpenReview: IiPw5miLHY).
@inproceedings{mlhc2024_submission_150,
  title      = {FineRadScore: A Radiology Report Line-by-Line Evaluation Technique Generating Corrections with Severity Scores},
  author     = {Alyssa Huang and Oishi Banerjee and Kay Wu and Eduardo Pontes Reis and Pranav Rajpurkar},
  abstract   = {The current gold standard for evaluating generated chest x-ray (CXR) reports is through radiologist annotations. However, this process can be extremely time-consuming and costly, especially when evaluating large numbers of reports. In this work, we present FineRadScore, a Large Language Model (LLM)-based automated evaluation metric for generated CXR reports. Given a candidate report and a ground-truth report, FineRadScore gives the minimum number of line-by-line corrections required to go from the candidate to the ground-truth report. Additionally, FineRadScore provides an error severity rating with each correction and generates comments explaining why the correction was needed. We demonstrate that FineRadScore's corrections and error severity scores align with radiologist opinions. We also show that, when used to judge the quality of the report as a whole, FineRadScore aligns with radiologists as well as current state-of-the-art automated CXR evaluation metrics. Finally, we analyze FineRadScore's shortcomings to provide suggestions for future improvements.},
  openreview = {IiPw5miLHY},
}
% MLHC 2024 accepted submission (OpenReview: HIvOO1fF4I).
@inproceedings{mlhc2024_submission_164,
  title      = {{DOSSIER}: Fact Checking in Electronic Health Records while Preserving Patient Privacy},
  author     = {Haoran Zhang and Supriya Nagesh and Milind Shyani and Nina Mishra},
  abstract   = {Given a particular claim about a specific document, the fact checking problem is to determine if the claim is true and, if so, provide corroborating evidence. The problem is motivated by contexts where a document is too lengthy to quickly read and find an answer. This paper focuses on electronic health records, or a medical dossier, where a physician has a pointed claim to make about the record. Prior methods that rely on directly prompting an LLM may suffer from hallucinations and violate privacy constraints. We present a system, DOSSIER, that verifies claims related to the tabular data within a document. For a clinical record, the tables include timestamped vital signs, medications, and labs. DOSSIER weaves together methods for tagging medical entities within a claim, converting natural language to SQL, and utilizing biomedical knowledge graphs, in order to identify rows across multiple tables that prove the answer. A distinguishing and desirable characteristic of DOSSIER is that no private medical records are shared with an LLM. An extensive experimental evaluation is conducted over a large corpus of medical records demonstrating improved accuracy over five baselines. Our methods provide hope that physicians can privately, quickly, and accurately fact check a claim in an evidence-based fashion.},
  openreview = {HIvOO1fF4I},
}
@inproceedings{mlhc2024_submission_160,
title = {Risk stratification through class-conditional
conformal estimation: A strategy that improves the
rule-out performance of {MACE} in the prehospital
setting},
author = {Juan Jose Garcia and Nikhil Sarin and Rebecca
R. Kitzmiller and Ashok Krishnamurthy and Jessica
K. Z\`egre-Hemsey},
abstract = {Accurate risk stratification of clinical scores is
important to mitigate adverse outcomes in patient
care. In this study we explore whether
class-conditional conformal estimation can yield
better risk stratification cutoffs, as measured by
rule-out and rule-in performance. In the binary
setting, the cutoffs are chosen to theoretically
bound the false positive rate (FPR) and the false
negative rate (FNR). We showcase rule-out
performance improvements for the task of 30-day
major adverse cardiac event (MACE) prediction in the
prehospital setting over standard of care HEART and