Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paper #399

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open

Paper #399

Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/draft-pdf.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Builds a draft PDF of the JOSS paper and uploads it as a build artifact.
# Runs on pushes that touch the paper sources or this workflow file.
name: Draft PDF
on:
  push:
    paths:
      - paper/**
      - .github/workflows/draft-pdf.yml

jobs:
  paper:
    runs-on: ubuntu-latest
    name: Paper Draft
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build draft PDF
        uses: openjournals/openjournals-draft-action@master
        with:
          journal: joss
          # This should be the path to the paper within your repo.
          paper-path: paper/paper.md
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: paper
          # This is the output path where Pandoc will write the compiled
          # PDF. Note, this should be the same directory as the input
          # paper.md
          path: paper/paper.pdf
161 changes: 161 additions & 0 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
% The mlr3 book (edited volume, hence `editor` instead of `author`).
% Whole-word braces keep "mlr3" and "R" from being recased by the style.
@book{Bischl2024,
  editor    = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
  title     = {Applied Machine Learning Using {mlr3} in {R}},
  publisher = {CRC Press},
  year      = {2024},
  isbn      = {9781032507545},
  url       = {https://mlr3book.mlr-org.com}
}

% JOSS article introducing mlr3.
% Only the package name and "R" are brace-protected; `month` uses the
% predefined macro so the style decides how to print it.
@article{Lang2019,
  author    = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd},
  title     = {{mlr3}: A modern object-oriented machine learning framework in {R}},
  journal   = {Journal of Open Source Software},
  volume    = {4},
  number    = {44},
  pages     = {1903},
  month     = dec,
  year      = {2019},
  publisher = {The Open Journal},
  issn      = {2475-9066},
  doi       = {10.21105/joss.01903},
  url       = {https://joss.theoj.org/papers/10.21105/joss.01903}
}

% Reference for the R language itself; the double braces make the
% corporate author a single, indivisible name.
@manual{R,
  author       = {{R Core Team}},
  title        = {R: A Language and Environment for Statistical Computing},
  organization = {R Foundation for Statistical Computing},
  address      = {Vienna, Austria},
  year         = {2019},
  url          = {https://www.R-project.org/}
}

% JMLR article for the original mlr package.
% Page ranges use a double hyphen; braces keep "mlr" and "R" from being
% recased by sentence-casing styles.
@article{mlr,
  author  = {Bernd Bischl and Michel Lang and Lars Kotthoff and Julia Schiffner and Jakob Richter and Erich Studerus and Giuseppe Casalicchio and Zachary M. Jones},
  title   = {{mlr}: Machine Learning in {R}},
  journal = {Journal of Machine Learning Research},
  volume  = {17},
  number  = {170},
  pages   = {1--5},
  year    = {2016},
  url     = {http://jmlr.org/papers/v17/15-066.html}
}

% CRAN manual entry for the data.table package; the package version is
% recorded in `note`.
@manual{data.table,
  author = {Matt Dowle and Arun Srinivasan},
  title  = {data.table: Extension of `data.frame`},
  note   = {R package version 1.12.6},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=data.table}
}

% JMLR article for scikit-learn.
@article{sk-learn,
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and
             Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and
             Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  title   = {Scikit-learn: Machine Learning in {P}ython},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
  url     = {http://jmlr.org/papers/v12/pedregosa11a.html}
}

% Journal of Statistical Software article for the caret package.
% The journal name is stored without the ", Articles" section suffix that
% the publisher's export appends; "R" is protected from recasing.
@article{caret,
  author  = {Max Kuhn},
  title   = {Building Predictive Models in {R} Using the caret Package},
  journal = {Journal of Statistical Software},
  volume  = {28},
  number  = {5},
  pages   = {1--26},
  year    = {2008},
  doi     = {10.18637/jss.v028.i05},
  url     = {https://www.jstatsoft.org/v028/i05}
}

% SIGKDD Explorations article for WEKA.
% Journal names are not recased by styles, so the name is stored in its
% proper capitalisation; acronyms are brace-protected.
@article{weka,
  author    = {Hall, M. and Frank, E. and Holmes, G. and Pfahringer, B. and Reutemann, P. and Witten, I. H.},
  title     = {The {WEKA} Data Mining Software: An Update},
  journal   = {{ACM} {SIGKDD} Explorations Newsletter},
  volume    = {11},
  number    = {1},
  pages     = {10--18},
  year      = {2009},
  publisher = {ACM},
  doi       = {10.1145/1656274.1656278}
}

% Manual entry for the tidymodels meta-package.
% The title has no trailing period: the bibliography style adds the
% terminating punctuation itself.
@manual{tidymodels,
  author = {Max Kuhn and Hadley Wickham},
  title  = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles},
  year   = {2020},
  url    = {https://www.tidymodels.org}
}

% Zenodo software release of MLJ.jl (v0.5.3).
% The repository identifier is brace-protected so "MLJ.jl" keeps its case.
@software{mlj,
  author    = {Anthony Blaom and Franz Kiraly and Thibaut Lienart and Sebastian Vollmer},
  title     = {{alan-turing-institute/MLJ.jl}: v0.5.3},
  version   = {v0.5.3},
  publisher = {Zenodo},
  month     = nov,
  year      = {2019},
  doi       = {10.5281/zenodo.3541506},
  url       = {https://doi.org/10.5281/zenodo.3541506}
}

% R Journal article for the checkmate package.
% Only the package name and "R" are brace-protected, so the style controls
% the casing of the rest of the title.
@article{checkmate,
  author  = {Michel Lang},
  title   = {{checkmate}: Fast Argument Checks for Defensive {R} Programming},
  journal = {The {R} Journal},
  volume  = {9},
  number  = {1},
  pages   = {437--445},
  year    = {2017},
  doi     = {10.32614/RJ-2017-028}
}

% CRAN manual entry for the paradox package; the package version is
% recorded in `note`.
@manual{paradox,
  author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder},
  title  = {paradox: Define and Work with Parameter Spaces for Complex Algorithms},
  note   = {R package version 1.0.1},
  year   = {2024},
  url    = {https://CRAN.R-project.org/package=paradox}
}

% JMLR article for mlr3pipelines.
% Only the package name and "R" are brace-protected, so the style controls
% the casing of the rest of the title.
@article{mlr3pipelines2021,
  author  = {Binder, Martin and Pfisterer, Florian and Lang, Michel and Schneider, Lennart and Kotthoff, Lars and Bischl, Bernd},
  title   = {{mlr3pipelines} - Flexible Machine Learning Pipelines in {R}},
  journal = {Journal of Machine Learning Research},
  volume  = {22},
  number  = {184},
  pages   = {1--7},
  year    = {2021},
  url     = {http://jmlr.org/papers/v22/21-0281.html}
}

% Bioinformatics article for mlr3proba (survival analysis in mlr3).
% `month` uses the predefined macro; only the package name and "R" are
% brace-protected in the title.
@article{Sonabend2021,
  author    = {Sonabend, Raphael and Kir{\'{a}}ly, Franz J. and Bender, Andreas and Bischl, Bernd and Lang, Michel},
  title     = {{mlr3proba}: an {R} package for machine learning in survival analysis},
  journal   = {Bioinformatics},
  volume    = {37},
  number    = {17},
  pages     = {2789--2791},
  month     = sep,
  year      = {2021},
  publisher = {Oxford Academic},
  issn      = {1367-4803},
  doi       = {10.1093/bioinformatics/btab039},
  url       = {https://academic.oup.com/bioinformatics/article/37/17/2789/6125361}
}

% Chapter "Large-Scale Benchmarking" of the mlr3 book (edited volume).
% Whole-word braces keep "mlr3" and "R" from being recased in `booktitle`.
@incollection{benchlargescale,
  author    = {Sebastian Fischer and Michel Lang and Marc Becker},
  title     = {Large-Scale Benchmarking},
  booktitle = {Applied Machine Learning Using {mlr3} in {R}},
  editor    = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
  publisher = {CRC Press},
  year      = {2024},
  url       = {https://mlr3book.mlr-org.com/large-scale_benchmarking.html}
}
158 changes: 158 additions & 0 deletions paper/paper.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
---
title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3'
Copy link
Collaborator

@bblodfon bblodfon Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternative suggestions:

  1. mlr3extralearners: Expanding the mlr3 Ecosystem with Community-Driven Learner Integration (personal favorite)
  2. mlr3extralearners: A Unified Interface for Diverse Machine Learning Algorithms in the mlr3 framework

tags:
sebffischer marked this conversation as resolved.
Show resolved Hide resolved
- R
- machine learning
- community
- FAIR
authors:
- name: Sebastian Fischer
orcid: 0000-0002-9609-3197
affiliation: "2, 3"
- name: John Zobolas
orcid: 0000-0002-3609-8674
affiliation: 4
- name: Raphael Sonabend
orcid: 0000-0001-9225-4654
- name: Marc Becker
orcid: 0000-0002-8115-0400
affiliation: 2
- name: Michel Lang
orcid: 0000-0001-9754-0393
affiliation: "1, 2"
- name: Martin Binder
affiliation: 2
- name: Lennart Schneider
orcid: 0000-0003-4152-5308
affiliation: "2, 3"
- name: Lukas Burk
orcid: 0000-0001-7528-3795
affiliation: "2, 3"
- name: Patrick Schratz
orcid: 0000-0003-0748-6624
affiliation: 2
- name: Byron C. Jaeger
orcid: 0000-0001-7399-2299
affiliation: 13
- name: Stephen A. Lauer
orcid:
affiliation: 7
- name: Lorenz A. Kapsner
orcid:
affiliation: 8
- name: Maximilian Mücke
orcid: 0009-0000-9432-9795
affiliation: 2
- name: Zezhi Wang
orcid:
affiliation: 9
- name: Keenan Ganz
orcid: 0000-0002-8486-3959
affiliation: 10
- name: Henri Funk
orcid: 0009-0007-0949-8385
affiliation: "3, 11, 12"
- name: Philipp Kopper
orcid: 0000-0002-5037-7135
affiliation: 3
- name: Andreas Bender
orcid: 0000-0001-5628-8611
affiliation: "2, 3"
- name: Bernd Bischl
orcid: 0000-0001-6002-6980
affiliation: "2, 3, 5, 6"
affiliations:
- name: TU Dortmund University, Germany
index: 1
- name: Department of Statistics, LMU Munich, Germany
index: 2
- name: Munich Center for Machine Learning (MCML), Germany
index: 3
- name: Department of Cancer Genetics, Institute for Cancer Research, Oslo University Hospital, Norway
index: 4
- name: Leibniz Institute for Prevention Research and Epidemiology (BIPS), Bremen, Germany
index: 5
- name: Faculty of Mathematics and Computer Science, University of Bremen, Germany
index: 6
- name: Certilytics, Inc., Louisville, Kentucky
index: 7
- name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany
index: 8
- name: Department of Statistics and Finance/International Institute of Finance, School of Management, University of Science and Technology of China, Hefei, Anhui, China
index: 9
- name: School of Environmental and Forest Sciences, University of Washington, Seattle
index: 10
- name: Department of Geography, LMU Munich, Germany
index: 11
- name: Statistical Consulting Unit StaBLab, LMU Munich, Germany
index: 12
- name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health Sciences, Winston-Salem, North Carolina
index: 13
date: XXX December 2024
bibliography: paper.bib
---

# Summary

The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem.
The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024].
At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework.
The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis.
This enables users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale], leveraging the `mlr3` framework's parallelization and optimization capabilities.
An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html).

Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem.
This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes.
Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages.
sebffischer marked this conversation as resolved.
Show resolved Hide resolved
In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/).
sebffischer marked this conversation as resolved.
Show resolved Hide resolved

- Comparison with other packages:
- parsnip tidymodels
- ???

# Statement of Need

In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types and modeling tasks.
For this reason, it is essential for the `mlr3` ecosystem to offer a wide variety of learners, such that users can choose the most appropriate learner for their specific problem.
While connecting a new learner to `mlr3` is straightforward and can be done on a per-need basis, integrating learners into `mlr3extralearners` also makes them available to other users and avoids duplication of effort.
Furthermore, contributing to `mlr3extralearners` has the added benefit that the learners are reviewed by the maintainers of the package, ensuring that they are correct and work as expected.

Besides the advantage for users of machine learning methods, `mlr3extralearners` also offers benefits for package developers.
After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps [@mlr3pipelines2021].

# Features

The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem.
By doing so, many different learning algorithms can be used with the same syntax and standardized interface.
However, the benefits of `mlr3extralearners` do not stop at mere integration.

## Metadata

One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata.
For one, the parameter spaces of learners are specified as parameter sets, as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox].
Parameters are explicitly typed, and their ranges or lists of available values are annotated; this information is used both to check for valid configurations and to allow for easier parameter tuning.
Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have.
The latter are described via a set of standardized properties, which include, e.g., the ability to do feature selection, to assign importance scores to features, or to handle missing values.

## Functional Correctness

One problem that manifests when integrating learning algorithms from different R packages is that their API can change.
The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated.
In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly.

In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners.
These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so.

## Templates for new Learners

In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself, as well as associated test files.
These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that are filled out as far as possible and clearly indicate what still needs to be added by the user.
The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list of [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors.

# Acknowledgements

Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research
Foundation) – 460135501 (NFDI project MaRDI).
John Zobolas received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 101016851, project PANCAIM.

# References
Loading