-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlhc2024_accepted_submissions.bib
1308 lines (1271 loc) · 75.2 KB
/
mlhc2024_accepted_submissions.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@proceedings{MLHC-2024,
  booktitle = {Proceedings of the 9th Machine Learning for Healthcare Conference},
  volume    = {252},
  editor    = {Kaivalya Deshpande and Madalina Fiterau and Shalmali Joshi and Zachary Lipton and Rajesh Ranganath and I{\~n}igo Urteaga},
  name      = {Machine Learning for Healthcare Conference},
  shortname = {MLHC},
  conference_number = {9},
  year      = {2024},
  start     = {2024-08-16},
  end       = {2024-08-17},
  published = {2024-11-25},
  url       = {http://mlforhc.org/},
  address   = {Toronto, Canada}
}
@inproceedings{mlhc2024_submission_139,
  title      = {{MALADE}: Orchestration of {LLM}-powered Agents with Retrieval Augmented Generation for Pharmacovigilance},
  author     = {Jihye Choi and Nils Palumbo and Prasad Chalasani and Matthew M. Engelhard and Somesh Jha and Anivarya Kumar and David Page},
  abstract   = {In the era of Large Language Models (LLMs), given their remarkable text understanding and generation abilities, there is an unprecedented opportunity to develop new, LLM-based methods for trustworthy medical knowledge synthesis, extraction, and summarization. This paper focuses on the problem of Pharmacovigilance (PhV), where the significance and challenges lie in identifying Adverse Drug Events (ADEs) from diverse text sources, such as medical literature, clinical notes, and drug labels. Unfortunately, this task is hindered by factors including variations in the terminologies of drugs and outcomes, and ADE descriptions often being buried in large amounts of narrative text. We present MALADE, the first effective collaborative multi-agent system powered by LLM with Retrieval Augmented Generation for ADE extraction from drug label data. This technique involves augmenting a query to an LLM with relevant information extracted from text resources and instructing the LLM to compose a response consistent with the augmented data. MALADE is a general LLM-agnostic architecture, and its unique capabilities are: (1) leveraging a variety of external sources, such as medical literature, drug labels, and FDA tools (e.g., OpenFDA drug information API), (2) extracting drug-outcome association in a structured format along with the strength of the association, and (3) providing explanations for established associations. Instantiated with GPT-4 Turbo or GPT-4o, and FDA drug label data, MALADE demonstrates its efficacy with an Area Under ROC Curve of 0.90 against the OMOP Ground Truth table of ADEs. Our implementation leverages the Langroid multi-agent LLM framework and can be found at https://github.com/jihyechoi77/malade.},
  openreview = {z0SuPuHD7q}
}
@inproceedings{mlhc2024_submission_91,
  title      = {Beyond Clinical Trials: Using Real World Evidence to Investigate Heterogeneous, Time-Varying Treatment Effects},
  author     = {Isabel Chien and Cliff Wong and Zelalem Gero and Jaspreet Bagga and Risa Ueno and Richard E. Turner and Roshanthi K. Weerasinghe and Brian Piening and Tristan Naumann and Carlo Bifulco and Hoifung Poon and Javier Gonz{\'a}lez Hern{\'a}ndez},
  abstract   = {Randomized controlled trials (RCTs), though essential for evaluating the efficacy of novel treatments, are costly and time-intensive. Due to strict eligibility criteria, RCTs may not adequately represent diverse patient populations, leading to equity issues and limited generalizability. Additionally, conventional trial analysis methods are limited by strict assumptions and biases. Real-world evidence (RWE) offers a promising avenue to explore treatment effects beyond trial settings, addressing gaps in representation and providing additional insights into patient outcomes over time. We introduce TRIALSCOPE-X and TRIALSCOPE-XL, machine learning pipelines designed to analyze treatment outcomes using RWE by mitigating biases that arise from observational data and addressing the limitations of conventional methods. We estimate causal, time-varying treatment effects across heterogeneous patient populations and varied timeframes. Preliminary results investigating the treatment benefit of Keytruda, a widely-used cancer immunotherapy drug, demonstrate the utility of our methods in evaluating treatment outcomes under novel settings and uncovering potential disparities. Our findings highlight the potential of RWE-based analysis to provide data-driven insights that inform evidence-based medicine and shape more inclusive and comprehensive clinical research, supplementing traditional clinical trial findings.},
  openreview = {wUruL3DqKB}
}
@inproceedings{mlhc2024_submission_26,
  title      = {General-Purpose Retrieval-Enhanced Medical Prediction Model Using Near-Infinite History},
  author     = {Junu Kim and Chaeeun Shim and Bosco Seong Kyu Yang and Chami Im and Sung Yoon Lim and Han-Gil Jeong and Edward Choi},
  abstract   = {Machine learning (ML) has recently shown promising results in medical predictions using electronic health records (EHRs). However, since ML models typically have a limited capability in terms of input sizes, selecting specific medical events from EHRs for use as input is necessary. This selection process, often relying on expert opinion, can cause bottlenecks in development. We propose Retrieval-Enhanced Medical prediction model (REMed) to address such challenges. REMed can essentially evaluate unlimited medical events, select the relevant ones, and make predictions. This allows for an unrestricted input size, eliminating the need for manual event selection. We verified these properties through experiments involving 27 clinical prediction tasks across four independent cohorts, where REMed outperformed the baselines. Notably, we found that the preferences of REMed align closely with those of medical experts. We expect our approach to significantly expedite the development of EHR prediction models by minimizing clinicians' need for manual involvement.},
  openreview = {veSe9eDmOz}
}
@inproceedings{mlhc2024_submission_182,
  title      = {Localising the Seizure Onset Zone from Single-Pulse Electrical Stimulation Responses with a {CNN} Transformer},
  author     = {Jamie Norris and Aswin Chari and Dorien van Blooijs and Gerald K. Cooray and Karl Friston and Martin M Tisdall and Richard E Rosch},
  abstract   = {Epilepsy is one of the most common neurological disorders, often requiring surgical intervention when medication fails to control seizures. For effective surgical outcomes, precise localisation of the epileptogenic focus - often approximated through the Seizure Onset Zone (SOZ) - is critical yet remains a challenge. Active probing through electrical stimulation is already standard clinical practice for identifying epileptogenic areas. Our study advances the application of deep learning for SOZ localisation using Single-Pulse Electrical Stimulation (SPES) responses, with two key contributions. Firstly, we implement an existing deep learning model to compare two SPES analysis paradigms: divergent and convergent. These paradigms evaluate outward and inward effective connections, respectively. We assess the generalisability of these models to unseen patients and electrode placements using held-out test sets. Our findings reveal a notable improvement in moving from a divergent (AUROC: 0.574) to a convergent approach (AUROC: 0.666), marking the first application of the latter in this context. Secondly, we demonstrate the efficacy of CNN Transformers with cross-channel attention in handling heterogeneous electrode placements, increasing the AUROC to 0.730. These findings represent a significant step in modelling patient-specific intracranial EEG electrode placements in SPES. Future work will explore integrating these models into clinical decision-making processes to bridge the gap between deep learning research and practical healthcare applications.},
  openreview = {vE2LTa2x83}
}
@inproceedings{mlhc2024_submission_163,
  title      = {{PRECISe}: Prototype-Reservation for Explainable Classification under Imbalanced and Scarce-Data Settings},
  author     = {Vaibhav Ganatra and Drishti Goel},
  abstract   = {Deep learning models used for medical image classification tasks are often constrained by the limited amount of training data along with severe class imbalance. Despite these problems, models should be explainable to enable human trust in the models' decisions to ensure wider adoption in high risk situations. In this paper, we propose PRECISe, an explainable-by-design model meticulously constructed to concurrently address all three challenges. Evaluation on 2 imbalanced medical image datasets reveals that PRECISe outperforms the current state-of-the-art methods on data efficient generalization to minority classes, achieving an accuracy of $\sim$87\% in detecting pneumonia in chest x-rays upon training on $<$60 images only. Additionally, a case study is presented to highlight the model's ability to produce easily interpretable predictions, reinforcing its practical utility and reliability for medical imaging tasks.},
  openreview = {u30e7ZQ0To}
}
@inproceedings{mlhc2024_submission_29,
  title      = {The Data Addition Dilemma},
  author     = {Judy Hanwen Shen and Inioluwa Deborah Raji and Irene Y. Chen},
  abstract   = {In many machine learning for healthcare tasks, standard datasets are constructed by amassing data across many, often fundamentally dissimilar, sources. But when does adding more data help, and when does it hinder progress on desired model outcomes in real-world settings? We identify this situation as the Data Addition Dilemma, demonstrating that adding training data in this multi-source scaling context can at times result in reduced overall accuracy, uncertain fairness outcomes and reduced worst-subgroup performance. We find that this possibly arises from an empirically observed trade-off between model performance improvements due to data scaling and model deterioration from distribution shift. We thus establish baseline strategies for navigating this dilemma, introducing distribution shift heuristics to guide decision-making for which data sources to add in order to yield the expected model performance improvements. We conclude with a discussion of the required considerations for data collection and suggestions for studying data composition and scale in the age of increasingly larger models.},
  openreview = {s8WSOR8w3n}
}
@inproceedings{mlhc2024_submission_25,
  title      = {Benchmarking Reliability of Deep Learning Models for Pathological Gait Classification},
  author     = {Abhishek Jaiswal and Nisheeth Srivastava},
  abstract   = {Early detection of neurodegenerative disorders is an important open problem, since early diagnosis and treatment may yield a better prognosis. Researchers have recently sought to leverage advances in machine learning algorithms to detect symptoms of altered gait, possibly corresponding to the emergence of neurodegenerative etiologies. However, while several claims of positive and accurate detection have been made in the recent literature, using a variety of sensors and algorithms, solutions are far from being realized in practice. This paper analyzes existing approaches to identify gaps inhibiting translation. Using a set of experiments across three Kinect-simulated and one real Parkinson's patient datasets, we highlight possible sources of errors and generalization failures in these approaches. Based on these observations, we propose our strong baseline called Asynchronous Multi-Stream Graph Convolutional Network (AMS-GCN) that can reliably differentiate multiple categories of pathological gaits across datasets.},
  openreview = {pCH5UQrKuZ}
}
@inproceedings{mlhc2024_submission_8,
  title      = {To which reference class do you belong? Measuring racial fairness of reference classes with normative modeling},
  author     = {Saige Rutherford and Thomas Wolfers and Charlotte Fraza and Nathaniel G. Harnett and Christian Beckmann and Henricus G. Ruhe and Andre Marquand},
  abstract   = {Reference classes in healthcare establish healthy norms, such as pediatric growth charts of height and weight, and are used to chart deviations from these norms which represent potential clinical risk. How the demographics of the reference class influence clinical interpretation of deviations is unknown. Using normative modeling, a method for building reference classes, we evaluate the fairness (racial bias) in reference models of structural brain images that are widely used in psychiatry and neurology. We test whether including ``race'' in the model creates fairer models. We predict self-reported race using the deviation scores from three different reference class normative models to better understand bias in an integrated, multivariate sense. Across all these tasks, we uncover racial disparities that are not easily addressed with existing data or commonly used modeling techniques. Our work suggests that deviations from the norm could be due to demographic mismatch with the reference class, and assigning clinical meaning to these deviations should be done with caution. Our approach also suggests that acquiring more representative samples is an urgent research priority.},
  openreview = {mako7lcwWF}
}
@inproceedings{mlhc2024_submission_151,
  title      = {Direct Preference Optimization for Suppressing Hallucinated Prior Exams in Radiology Report Generation},
  author     = {Oishi Banerjee and Hong-Yu Zhou and Kay Wu and Subathra Adithan and Stephen Kwak and Pranav Rajpurkar},
  abstract   = {Recent advances in generative vision-language models (VLMs) have exciting potential implications for AI in radiology, yet VLMs are also known to produce hallucinations, nonsensical text, and other unwanted behaviors that can waste clinicians' time and cause patient harm. Drawing on recent work on direct preference optimization (DPO), we propose a simple method for modifying the behavior of pretrained VLMs performing radiology report generation by suppressing unwanted types of generations. We apply our method to the prevention of hallucinations of prior exams, addressing a long-established problem behavior in models performing chest X-ray report generation. Across our experiments, we find that DPO fine-tuning achieves a 3.2--4.8x reduction in lines hallucinating prior exams while maintaining model performance on clinical accuracy metrics. Our work is, to the best of our knowledge, the first work to apply DPO to medical VLMs, providing a data- and compute-efficient way to suppress problem behaviors while maintaining overall clinical accuracy.},
  openreview = {kvYYP1LfRq}
}
@inproceedings{mlhc2024_submission_195,
  title      = {{MedAutoCorrect}: Image-Conditioned Autocorrection in Medical Reporting},
  author     = {Arnold Caleb Asiimwe and Didac Suris Coll-Vinent and Pranav Rajpurkar and Carl Vondrick},
  abstract   = {In medical reporting, the accuracy of radiological reports, whether generated by humans or machine learning algorithms, is critical. We tackle a new task in this paper: image-conditioned autocorrection of inaccuracies within these reports. Using the MIMIC-CXR dataset, we first intentionally introduce a diverse range of errors into reports. Subsequently, we propose a two-stage framework capable of pinpointing these errors and then making corrections, simulating an autocorrection process. This method aims to address the shortcomings of existing automated medical reporting systems, like factual errors and incorrect conclusions, enhancing report reliability in vital healthcare applications. Importantly, our approach could serve as a guardrail, ensuring the accuracy and trustworthiness of automated report generation. Experiments on established datasets and state of the art report generation models validate this method's potential in correcting medical reporting errors.},
  openreview = {iW9ItiwxyC}
}
@inproceedings{mlhc2024_submission_13,
  title      = {Network-Assisted Mediation Analysis with High-Dimensional Neuroimaging Mediators},
  author     = {Baoyi Shi and Ying Liu and Shanghong Xie and Xi Zhu and Yuanjia Wang},
  abstract   = {Mediation analysis is a widely used statistical approach to estimate the causal pathways through which an exposure affects an outcome via intermediate variables, i.e., mediators. In many applications, high-dimensional correlated biomarkers are potential mediators, posing challenges to standard mediation analysis approaches. However, some of these biomarkers, such as neuroimaging measures across brain regions, often exhibit hierarchical network structures that can be leveraged to advance mediation analysis. In this paper, we aim to study how brain cortical thickness, characterized by a star-shaped hierarchical network structure, mediates the effect of maternal smoking on children's cognitive abilities within the adolescent brain cognitive development (ABCD) study. We propose a network-assisted mediation analysis approach based on a conditional Gaussian graphical model to account for the star-shaped network structure of neuroimaging mediators. Within our framework, the joint indirect effect of these mediators is decomposed into the indirect effect through hub mediators and the indirect effects solely through each leaf mediator. This decomposition provides mediator-specific insights and informs efficient intervention designs. Additionally, after accounting for hub mediators, the indirect effects solely through each leaf mediator can be identified and evaluated individually, thereby addressing the challenges of high-dimensional correlated mediators. In our study, our proposed approach identifies a brain region as a significant leaf mediator, a finding that existing approaches cannot discover.},
  openreview = {dZatjnB77L}
}
@inproceedings{mlhc2024_submission_66,
  title      = {A Comprehensive View of Personalized Federated Learning on Heterogeneous Clinical Datasets},
  author     = {Fatemeh Tavakoli and D. B. Emerson and Sana Ayromlou and John Taylor Jewell and Amrit Krishnan and Yuchong Zhang and Amol Verma and Fahad Razak},
  abstract   = {Federated learning (FL) is increasingly being recognized as a key approach to overcoming the data silos that so frequently obstruct the training and deployment of machine-learning models in clinical settings. This work contributes to a growing body of FL research specifically focused on clinical applications along three important directions. First, we expand the FLamby benchmark (du Terrail et al., 2022a) to include a comprehensive evaluation of personalized FL methods and demonstrate substantive performance improvements over the original results. Next, we advocate for a comprehensive checkpointing and evaluation framework for FL to reflect practical settings and provide multiple comparison baselines. To this end, an open-source library aimed at making FL experimentation simpler and more reproducible is released. Finally, we propose an important ablation of PerFCL (Zhang et al., 2022). This ablation results in a natural extension of FENDA (Kim et al., 2016) to the FL setting. Experiments conducted on the FLamby benchmark and GEMINI datasets (Verma et al., 2017) show that the proposed approach is robust to heterogeneous clinical data and often outperforms existing global and personalized FL techniques, including PerFCL.},
  openreview = {btijACJ4QU}
}
@inproceedings{mlhc2024_submission_174,
  title      = {{LLMSYN}: Generating Synthetic Electronic Health Records Without Patient-Level Data},
  author     = {Yijie Hao and Huan He and Joyce C. Ho},
  abstract   = {Recent advancements in large language models (LLMs) have shown promise in tasks like question answering, text summarization, and code generation. However, their effectiveness within the healthcare sector remains uncertain. This study investigates LLMs' potential in generating synthetic Electronic Health Records (EHRs) by assessing their ability to produce structured data. Unfortunately, our preliminary results indicate that employing LLMs directly resulted in poor statistical similarity and utility. Feeding real-world dataset to LLMs could mitigate this issue, but privacy concerns were raised when uploading patients' information to the LLM API. To address these challenges and unleash the potential of LLMs in health data science, we present a new generation pipeline called LLMSYN. This pipeline utilizes only high-level statistical information from datasets and publicly available medical knowledge. The results demonstrate that the generated EHRs by LLMSYN exhibit improved statistical similarity and utility in downstream tasks, achieving predictive performance comparable to training with real data, while presenting minimal privacy risks. Our findings suggest that LLMSYN offers a promising approach to enhance the utility of LLM models in synthetic structured EHR generation.},
  openreview = {Yd3TDXmYVX}
}
@inproceedings{mlhc2024_submission_184,
  title      = {{XDT}-{CXR}: Investigating Cross-Disease Transferability in Zero-Shot Binary Classification of Chest X-Rays},
  author     = {Umaima Rahman and Abhishek Basu and Muhammad Uzair Khattak and Aniq Ur Rahman},
  abstract   = {This study explores the concept of cross-disease transferability (XDT) in medical imaging, focusing on the potential of binary classifiers trained on one disease to perform zero-shot classification on another disease affecting the same organ. Utilizing chest X-rays (CXR) as the primary modality, we investigate whether a model trained on one pulmonary disease can make predictions about another novel pulmonary disease, a scenario with significant implications for medical settings with limited data on emerging diseases. The XDT framework leverages the embedding space of a vision encoder, which, through kernel transformation, aids in distinguishing between diseased and non-diseased classes in the latent space. This capability is especially beneficial in resource-limited environments or in regions with low prevalence of certain diseases, where conventional diagnostic practices may fail. However, the XDT framework is currently limited to binary classification, determining only the presence or absence of a disease rather than differentiating among multiple diseases. This limitation underscores the supplementary role of XDT to traditional diagnostic tests in clinical settings. Furthermore, results show that XDT-CXR as a framework is able to make better predictions compared to other zero-shot learning (ZSL) baselines.},
  openreview = {XvGhPugrKF}
}
@inproceedings{mlhc2024_submission_64,
  title      = {Mixed Type Multimorbidity Variational Autoencoder: A Deep Generative Model for Multimorbidity Analysis},
  author     = {Woojung Kim and Paul A. Jenkins and Christopher Yau},
  abstract   = {This paper introduces the Mixed Type Multimorbidity Variational Autoencoder ($\text{M}^{3}$VAE), a deep probabilistic generative model developed for supervised dimensionality reduction in the context of multimorbidity analysis. The model is designed to overcome the limitations of purely supervised or unsupervised approaches in this field. $\text{M}^{3}$VAE focuses on identifying latent representations of mixed-type health-related attributes essential for predicting patient survival outcomes. It integrates datasets with multiple modalities (by which we mean data of multiple types), encompassing health measurements, demographic details, and (potentially censored) survival outcomes. A key feature of $\text{M}^{3}$VAE is its ability to reconstruct latent representations that exhibit clustering patterns, thereby revealing important patterns in disease co-occurrence. This functionality provides insights for understanding and predicting health outcomes. The efficacy of $\text{M}^{3}$VAE has been demonstrated through experiments with both synthetic and real-world electronic health record data, showing its capability in identifying interpretable morbidity groupings related to future survival outcomes.},
  openreview = {XHifMs5JkN}
}
@inproceedings{mlhc2024_submission_45,
  title      = {Ne{RF}-{US}: Removing Ultrasound Imaging Artifacts from Neural Radiance Fields in the Wild},
  author     = {Rishit Dagli and Atsuhiro Hibi and Rahul Krishnan and Pascal N Tyrrell},
  abstract   = {Current methods for performing 3D reconstruction and novel view synthesis (NVS) in ultrasound imaging data often face severe artifacts when training NeRF-based approaches. The artifacts produced by current approaches differ from NeRF floaters in general scenes because of the unique nature of ultrasound capture. Furthermore, existing models fail to produce reasonable 3D reconstructions when ultrasound data is captured or obtained casually in uncontrolled environments, which is common in clinical settings. Consequently, existing reconstruction and NVS methods struggle to handle ultrasound motion, fail to capture intricate details, and cannot model transparent and reflective surfaces. In this work, we introduced NeRF-US, which incorporates 3D-geometry guidance for border probability and scattering density into NeRF training, while also utilizing ultrasound-specific rendering over traditional volume rendering. These 3D priors are learned through a diffusion model. Through experiments conducted on our new ``Ultrasound in the Wild'' dataset, we observed accurate, clinically plausible, artifact-free reconstructions.},
  openreview = {WNwMLWvzwB}
}
@inproceedings{mlhc2024_submission_194,
  title      = {Leveraging {LLM}s for Multimodal Medical Time Series Analysis},
  author     = {Nimeesha Chan and Felix Parker and William C Bennett and Tianyi Wu and Mung Yao Jia and James Fackler and Kimia Ghobadi},
  abstract   = {The complexity and heterogeneity of data in many real-world applications pose significant challenges for traditional machine learning and signal processing techniques. For instance, in medicine, effective analysis of diverse physiological signals is crucial for patient monitoring and clinical decision-making and yet highly challenging. We introduce MedTsLLM, a general multimodal large language model (LLM) framework that effectively integrates time series data and rich contextual information in the form of text to analyze physiological signals, performing three tasks with clinical relevance: semantic segmentation, boundary detection, and anomaly detection in time series. These critical tasks enable deeper analysis of physiological signals and can provide actionable insights for clinicians. We utilize a reprogramming layer to align embeddings of time series patches with a pretrained LLM's embedding space and make effective use of raw time series, in conjunction with textual context. Given the multivariate nature of medical datasets, we develop methods to handle multiple covariates. We additionally tailor the text prompt to include patient-specific information. Our model outperforms state-of-the-art baselines, including deep learning models, other LLMs, and clinical methods across multiple medical domains, specifically electrocardiograms and respiratory waveforms. MedTsLLM presents a promising step towards harnessing the power of LLMs for medical time series analysis that can elevate data-driven tools for clinicians and improve patient outcomes.},
  openreview = {W4ZxKk14HM}
}
@inproceedings{mlhc2024_submission_131,
  title      = {Automatically Extracting Numerical Results from Randomized Controlled Trials with Large Language Models},
  author     = {Hye Sun Yun and David Pogrebitskiy and Iain James Marshall and Byron C Wallace},
  abstract   = {Meta-analyses statistically aggregate the findings of different randomized controlled trials (RCTs) to assess treatment effectiveness. Because this yields robust estimates of treatment effectiveness, results from meta-analyses are considered the strongest form of evidence. However, rigorous evidence syntheses are time-consuming and labor-intensive, requiring manual extraction of data from individual trials to be synthesized. Ideally, language technologies would permit fully automatic meta-analysis, on demand. This requires accurately extracting numerical results from individual trials, which has been beyond the capabilities of natural language processing (NLP) models to date. In this work, we evaluate whether modern large language models (LLMs) can reliably perform this task. We annotate (and release) a modest but granular evaluation dataset of clinical trial reports with numerical findings attached to interventions, comparators, and outcomes. Using this dataset, we evaluate the performance of seven LLMs applied zero-shot for the task of conditionally extracting numerical findings from trial reports. We find that massive LLMs that can accommodate lengthy inputs are tantalizingly close to realizing fully automatic meta-analysis, especially for dichotomous (binary) outcomes (e.g., mortality). However, LLMs---including ones trained on biomedical texts---perform poorly when the outcome measures are complex and tallying the results requires inference. This work charts a path toward fully automatic meta-analysis of RCTs via LLMs, while also highlighting the limitations of existing models for this aim.},
  openreview = {VUiZ69NyST}
}
% Fixed: glued initials "B.A." -> "B. A." (BibTeX treats "B.A." as a single name token);
% hyphen used as a dash after "(EBCL)" replaced with an em dash.
@inproceedings{mlhc2024_submission_156,
  title      = {Event-Based Contrastive Learning for Medical Time Series},
  author     = {Nassim Oufattole and Hyewon Jeong and Matthew B. A. McDermott and Aparna Balagopalan and Bryan Jangeesingh and Marzyeh Ghassemi and Collin Stultz},
  abstract   = {In clinical practice, one often needs to identify whether a patient is at high risk of adverse outcomes after some key medical event. For example, quantifying the risk of adverse outcomes after an acute cardiovascular event helps healthcare providers identify those patients at the highest risk of poor outcomes; i.e., patients who benefit from invasive therapies that can lower their risk. Assessing the risk of adverse outcomes, however, is challenging due to the complexity, variability, and heterogeneity of longitudinal medical data, especially for individuals suffering from chronic diseases like heart failure. In this paper, we introduce Event-Based Contrastive Learning (EBCL)---a method for learning embeddings of heterogeneous patient data that preserves temporal information before and after key index events. We demonstrate that EBCL can be used to construct models that yield improved performance on important downstream tasks relative to other pretraining methods. We develop and test the method using a cohort of heart failure patients obtained from a large hospital network and the publicly available MIMIC-IV dataset consisting of patients in an intensive care unit at a large tertiary care center. On both cohorts, EBCL pretraining yields models that are performant with respect to a number of downstream tasks, including mortality, hospital readmission, and length of stay. In addition, unsupervised EBCL embeddings effectively cluster heart failure patients into subgroups with distinct outcomes, thereby providing information that helps identify new heart failure phenotypes. The contrastive framework around the index event can be adapted to a wide array of time-series datasets and provides information that can be used to guide personalized care.},
  openreview = {TEG1wypeoD},
}
% Fixed: Markdown emphasis *distinct* converted to LaTeX \emph{distinct} (field text is
% consumed by LaTeX, where the asterisks would print literally); missing period after
% initial in "Andrew J. Goodwin"; hyphen-as-dash after "real-time" replaced with em dash.
@inproceedings{mlhc2024_submission_3,
  title      = {Needles in Needle Stacks: Meaningful Clinical Information Buried in Noisy Sensor Data},
  author     = {Sujay Nagaraj and Andrew J. Goodwin and Dmytro Lopushanskyy and Sebastian David Goodfellow and Danny Eytan and Hadrian Balaci and Robert Greer and Anand Jayarajan and Azadeh Assadi and Mjaye Leslie Mazwi and Anna Goldenberg},
  abstract   = {Central Venous Lines (C-Lines) and Arterial Lines (A-Lines) are routinely used in the Critical Care Unit (CCU) for blood sampling, medication administration, and high-frequency blood pressure measurement. Judiciously accessing these lines is important, as over-utilization is associated with significant in-hospital morbidity and mortality. Documenting the frequency of line-access is an important step in reducing these adverse outcomes. Unfortunately, the current gold-standard for documentation is manual and subject to error, omission, and bias. The high-frequency blood pressure waveform data from sensors in these lines are often noisy and full of artifacts. Standard approaches in signal processing remove noise artifacts before meaningful analysis. However, from bedside observations, we characterized a \emph{distinct} artifact that occurs during each instance of C-Line or A-Line use. These artifacts are buried amongst physiological waveform and extraneous noise. We focus on Machine Learning (ML) models that can detect these artifacts from waveform data in real-time---finding needles in needle stacks, in order to automate the documentation of line-access. We built and evaluated ML classifiers running in real-time at a major children's hospital to achieve this goal. We demonstrate the utility of these tools for reducing documentation burden, increasing available information for bedside clinicians, and informing unit-level initiatives to improve patient safety.},
  openreview = {SPgoUgKr7i},
}
% Fixed: "Yugang jia" -- a lowercase surname in First-Last form is parsed by BibTeX as a
% von-particle (first name "Yugang", von "jia", empty last name), breaking sorting and
% rendering; capitalized to "Yugang Jia".
@inproceedings{mlhc2024_submission_161,
  title      = {Selective Fine-tuning on {LLM}-labeled Data May Reduce Reliance on Human Annotation: A Case Study Using Schedule-of-Event Table Detection},
  author     = {Bhawesh Kumar and Jonathan Amar and Eric Yang and Nan Li and Yugang Jia},
  abstract   = {Large Language Models (LLMs) have demonstrated their efficacy across a broad spectrum of tasks in healthcare applications. However, often LLMs need to be fine-tuned on task specific expert-annotated data to achieve optimal performance, which can be expensive and time consuming. In this study, we fine-tune PaLM-2 with parameter efficient fine-tuning (PEFT) using noisy labels obtained from Gemini-pro 1.0 for the detection of Schedule-of-Event (SoE) tables, which specify care plan in clinical trial protocols. We introduce a filtering mechanism to select high-confidence labels for this table classification task, thereby reducing the noise in the auto-generated labels. We find that the fine-tuned PaLM-2 with filtered labels outperforms Gemini Pro 1.0 and other LLMs on this task and achieves performance close to PaLM-2 fine-tuned on non-expert human annotations. Our results show that leveraging LLM-generated labels, coupled with strategic filtering can be a viable and cost-effective strategy for improving LLM performance on specialized tasks, especially in domains where expert annotations are scarce, expensive, or time-consuming to obtain.},
  openreview = {NNYrPGrpJ1},
}
% MLHC 2024 accepted submission (OpenReview: LvEdt6YqbT).
@inproceedings{mlhc2024_submission_69,
  title      = {A {LUPI} distillation-based approach: Application to predicting Proximal Junctional Kyphosis},
  author     = {Yun Chao Lin and Andrea Clark-Sevilla and Rohith Ravindranath and Fthimnir Hassan and Justin Reyes and Joseph Lombardi and Lawrence G. Lenke and Ansaf Salleb-Aouissi},
  abstract   = {We propose a learning algorithm called XGBoost+, a modified version of the extreme gradient boosting algorithm (XGBoost). The new algorithm utilizes privileged information (PI), data collected after inference time. XGBoost+ incorporates PI into a distillation framework for XGBoost. We also evaluate our proposed method on a real-world clinical dataset about Proximal Junctional Kyphosis (PJK). Our approach outperforms vanilla XGBoost, SVM, and SVM+ on various datasets. Our approach showcases the advantage of using privileged information to improve the performance of machine learning models in healthcare, where data after inference time can be leveraged to build better models.},
  openreview = {LvEdt6YqbT},
}
% MLHC 2024 accepted submission (OpenReview: LdQFhJSgPR).
@inproceedings{mlhc2024_submission_82,
  title      = {{CORE}-{BEHRT}: A Carefully Optimized and Rigorously Evaluated {BEHRT}},
  author     = {Mikkel Fruelund Odgaard and Kiril Vadimovic Klein and Martin Sillesen and Sanne M{\o}ller Thysen and Espen Jimenez-Solem and Mads Nielsen},
  abstract   = {The widespread adoption of Electronic Health Records (EHR) has significantly increased the amount of available healthcare data. This has allowed models inspired by Natural Language Processing (NLP) and Computer Vision, which scale exceptionally well, to be used in EHR research. Particularly, BERT-based models have surged in popularity following the release of BEHRT and Med-BERT. Subsequent models have largely built on these foundations despite the fundamental design choices of these pioneering models remaining underexplored. Through incremental optimization, we study BERT-based EHR modeling and isolate the sources of improvement for key design choices, giving us insights into the effect of data representation, individual technical components, and training procedure. Evaluating this across a set of generic tasks (death, pain treatment, and general infection), we showed that improving data representation can increase the average downstream performance from 0.785 to 0.797 AUROC ($p < 10^{-7}$), primarily when including medication and timestamps. Improving the architecture and training protocol on top of this increased average downstream performance to 0.801 AUROC ($p < 10^{-7}$). We then demonstrated the consistency of our optimization through a rigorous evaluation across 25 diverse clinical prediction tasks. We observed significant performance increases in 17 out of 25 tasks and improvements in 24 tasks, highlighting the generalizability of our results. Our findings provide a strong foundation for future work and aim to increase the trustworthiness of BERT-based EHR models.},
  openreview = {LdQFhJSgPR},
}
% Fixed: I\~nigo -> I{\~n}igo (accent must be a braced BibTeX "special character" for
% correct sorting/labeling); straight double quotes around "noisy labels" replaced with
% LaTeX ``...'' quoting.
@inproceedings{mlhc2024_submission_44,
  title      = {Minimax Risk Classifiers for Mislabeled Data: a Study on Patient Outcome Prediction Tasks},
  author     = {Lucia Filippozzi and Santiago Mazuelas and I{\~n}igo Urteaga},
  abstract   = {Healthcare datasets are often impacted by incorrect or mislabeled data, due to imperfect annotations, data collection issues, ambiguity, and subjective interpretations. Incorrectly classified data, referred to as ``noisy labels'', can significantly degrade the performance of supervised learning models. Namely, noisy labels hinder the algorithm's ability to accurately capture the true underlying patterns from observed data. More importantly, evaluating the performance of a classifier when only noisy test labels are available is a significant complication. We hereby tackle the challenge of trusting the labelling process both in training and testing, as noisy patient outcome labels in healthcare raise methodological and ethical considerations. We propose a novel adaptation of Minimax Risk Classifiers (MRCs) for data subject to noisy labels, both in training and evaluation. We show that the upper bound of the MRC's expected loss can serve as a useful estimator for the classifier's performance, especially in situations where clean test data is not available. We demonstrate the benefits of the proposed methodology in healthcare tasks where patient outcomes are predicted from mislabeled data. The proposed technique is accurate and stable, avoiding overly optimistic assessments of prediction error, a significantly harmful burden in patient outcome prediction tasks in healthcare.},
  openreview = {LXcAXGtIkx},
}
% Fixed: unescaped percent sign in "approximately 20%" -- when this field is emitted into
% a LaTeX document, % starts a comment and silently swallows the rest of the line;
% escaped as 20\%.
@inproceedings{mlhc2024_submission_179,
  title      = {Predicting Long-Term Allograft Survival in Liver Transplant Recipients},
  author     = {Xiang Gao and Michael Cooper and Maryam Naghibzadeh and Amirhossein Azhie and Mamatha Bhat and Rahul Krishnan},
  abstract   = {Liver allograft failure occurs in approximately 20\% of liver transplant recipients within five years post-transplant, leading to mortality or the need for retransplantation. Providing an accurate and interpretable model for individualized risk estimation of graft failure is essential for improving post-transplant care. To this end, we introduce the Model for Allograft Survival (MAS), a simple linear risk score that outperforms other advanced survival models. Using longitudinal patient follow-up data from the United States (U.S.), we develop our models on 82,959 liver transplant recipients and conduct multi-site evaluations on 11 regions. Additionally, by testing on a separate non-U.S. cohort, we explore the out-of-distribution generalization performance of various models without additional fine-tuning, a crucial property for clinical deployment. We find that the most complex models are also the ones most vulnerable to distribution shifts despite achieving the best in-distribution performance. Our findings not only provide a strong risk score for predicting long-term graft failure but also suggest that the routine machine learning pipeline with only in-distribution held-out validation could create harmful consequences for patients at deployment.},
  openreview = {JhvatSLKhG},
}
% MLHC 2024 accepted submission (OpenReview: IiPw5miLHY).
@inproceedings{mlhc2024_submission_150,
  title      = {FineRadScore: A Radiology Report Line-by-Line Evaluation Technique Generating Corrections with Severity Scores},
  author     = {Alyssa Huang and Oishi Banerjee and Kay Wu and Eduardo Pontes Reis and Pranav Rajpurkar},
  abstract   = {The current gold standard for evaluating generated chest x-ray (CXR) reports is through radiologist annotations. However, this process can be extremely time-consuming and costly, especially when evaluating large numbers of reports. In this work, we present FineRadScore, a Large Language Model (LLM)-based automated evaluation metric for generated CXR reports. Given a candidate report and a ground-truth report, FineRadScore gives the minimum number of line-by-line corrections required to go from the candidate to the ground-truth report. Additionally, FineRadScore provides an error severity rating with each correction and generates comments explaining why the correction was needed. We demonstrate that FineRadScore's corrections and error severity scores align with radiologist opinions. We also show that, when used to judge the quality of the report as a whole, FineRadScore aligns with radiologists as well as current state-of-the-art automated CXR evaluation metrics. Finally, we analyze FineRadScore's shortcomings to provide suggestions for future improvements.},
  openreview = {IiPw5miLHY},
}
% MLHC 2024 accepted submission (OpenReview: HIvOO1fF4I).
@inproceedings{mlhc2024_submission_164,
  title      = {{DOSSIER}: Fact Checking in Electronic Health Records while Preserving Patient Privacy},
  author     = {Haoran Zhang and Supriya Nagesh and Milind Shyani and Nina Mishra},
  abstract   = {Given a particular claim about a specific document, the fact checking problem is to determine if the claim is true and, if so, provide corroborating evidence. The problem is motivated by contexts where a document is too lengthy to quickly read and find an answer. This paper focuses on electronic health records, or a medical dossier, where a physician has a pointed claim to make about the record. Prior methods that rely on directly prompting an LLM may suffer from hallucinations and violate privacy constraints. We present a system, DOSSIER, that verifies claims related to the tabular data within a document. For a clinical record, the tables include timestamped vital signs, medications, and labs. DOSSIER weaves together methods for tagging medical entities within a claim, converting natural language to SQL, and utilizing biomedical knowledge graphs, in order to identify rows across multiple tables that prove the answer. A distinguishing and desirable characteristic of DOSSIER is that no private medical records are shared with an LLM. An extensive experimental evaluation is conducted over a large corpus of medical records demonstrating improved accuracy over five baselines. Our methods provide hope that physicians can privately, quickly, and accurately fact check a claim in an evidence-based fashion.},
  openreview = {HIvOO1fF4I},
}
@inproceedings{mlhc2024_submission_160,
title = {Risk stratification through class-conditional
conformal estimation: A strategy that improves the
rule-out performance of {MACE} in the prehospital
setting},
author = {Juan Jose Garcia and Nikhil Sarin and Rebecca
R. Kitzmiller and Ashok Krishnamurthy and Jessica
K. Z\`egre-Hemsey},
abstract = {Accurate risk stratification of clinical scores is
important to mitigate adverse outcomes in patient
care. In this study we explore whether
class-conditional conformal estimation can yield
better risk stratification cutoffs, as measured by
rule-out and rule-in performance. In the binary
setting, the cutoffs are chosen to theoretically
bound the false positive rate (FPR) and the false
negative rate (FNR). We showcase rule-out
performance improvements for the task of 30-day
major adverse cardiac event (MACE) prediction in the
prehospital setting over standard of care HEART and