Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paper #399

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open

Paper #399

Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/draft-pdf.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Builds a draft PDF of the JOSS paper and uploads it as a workflow artifact.
name: Draft PDF

# Run only when the paper sources or this workflow file change.
on:
  push:
    paths:
      - paper/**
      - .github/workflows/draft-pdf.yml

jobs:
  paper:
    runs-on: ubuntu-latest
    name: Paper Draft
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build draft PDF
        uses: openjournals/openjournals-draft-action@master
        with:
          journal: joss
          # This should be the path to the paper within your repo.
          paper-path: paper/paper.md
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: paper
          # This is the output path where Pandoc will write the compiled
          # PDF. Note, this should be the same directory as the input
          # paper.md
          path: paper/paper.pdf
186 changes: 186 additions & 0 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
% Edited book accompanying the mlr3 ecosystem.
@book{Bischl2024,
  editor    = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
  title     = {Applied Machine Learning Using {m}lr3 in {R}},
  publisher = {CRC Press},
  year      = {2024},
  isbn      = {9781032507545},
  url       = {https://mlr3book.mlr-org.com}
}

% JOSS article introducing mlr3.
% The month uses the bare three-letter macro; the DOI is lower-cased to match the URL.
@article{Lang2019,
  author    = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd},
  title     = {{mlr3}: A modern object-oriented machine learning framework in {R}},
  journal   = {Journal of Open Source Software},
  publisher = {The Open Journal},
  volume    = {4},
  number    = {44},
  pages     = {1903},
  month     = dec,
  year      = {2019},
  issn      = {2475-9066},
  doi       = {10.21105/joss.01903},
  url       = {https://joss.theoj.org/papers/10.21105/joss.01903}
}

% The R language itself; the corporate author is double-braced so it is kept as one name.
@manual{R,
  author       = {{R Core Team}},
  title        = {R: A Language and Environment for Statistical Computing},
  organization = {R Foundation for Statistical Computing},
  address      = {Vienna, Austria},
  year         = {2019},
  url          = {https://www.R-project.org/}
}

% JMLR article on the mlr package.
% The page range uses a double hyphen; the package name and "R" are protected from style recasing.
@article{mlr,
  author  = {Bernd Bischl and Michel Lang and Lars Kotthoff and Julia Schiffner and Jakob Richter and Erich Studerus and Giuseppe Casalicchio and Zachary M. Jones},
  title   = {{mlr}: Machine Learning in {R}},
  journal = {Journal of Machine Learning Research},
  volume  = {17},
  number  = {170},
  pages   = {1--5},
  year    = {2016},
  url     = {http://jmlr.org/papers/v17/15-066.html}
}

% CRAN manual entry for the data.table package.
@manual{data.table,
  author = {Matt Dowle and Arun Srinivasan},
  title  = {data.table: Extension of `data.frame`},
  note   = {R package version 1.12.6},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=data.table}
}

% JMLR article on scikit-learn.
@article{sk-learn,
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  title   = {Scikit-learn: Machine Learning in {P}ython},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
  url     = {http://jmlr.org/papers/v12/pedregosa11a.html}
}

% Journal of Statistical Software article on the caret package.
% Journal name normalised (export suffix ", Articles" dropped), empty keywords field
% removed, and "R" / "caret" protected from style recasing.
@article{caret,
  author  = {Max Kuhn},
  title   = {Building Predictive Models in {R} Using the {caret} Package},
  journal = {Journal of Statistical Software},
  volume  = {28},
  number  = {5},
  pages   = {1--26},
  year    = {2008},
  doi     = {10.18637/jss.v028.i05},
  url     = {https://www.jstatsoft.org/v028/i05}
}

% SIGKDD Explorations article on WEKA.
% Only the acronyms are brace-protected instead of the whole title; the stray
% trailing space at the end of the author list is removed.
@article{weka,
  author    = {Hall, M. and Frank, E. and Holmes, G. and Pfahringer, B. and Reutemann, P. and Witten, I. H.},
  title     = {The {WEKA} Data Mining Software: An Update},
  journal   = {{ACM} {SIGKDD} Explorations Newsletter},
  publisher = {ACM},
  volume    = {11},
  number    = {1},
  pages     = {10--18},
  year      = {2009},
  doi       = {10.1145/1656274.1656278}
}

% Manual entry for the tidymodels meta-package.
% The title carries no trailing period: the bibliography style supplies
% punctuation, so a hard-coded one renders doubled.
@manual{tidymodels,
  author = {Max Kuhn and Hadley Wickham},
  title  = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles},
  year   = {2020},
  url    = {https://www.tidymodels.org}
}

% Manual entry for the parsnip package.
@manual{parsnip,
  author = {Max Kuhn and Davis Vaughan},
  title  = {parsnip: A Common API to Modeling and Analysis Functions},
  note   = {R package version 1.2.1, https://parsnip.tidymodels.org/},
  year   = {2024},
  url    = {https://github.com/tidymodels/parsnip}
}

% Manual entry for the mlr3tuning package.
@manual{mlr3tuning,
  author = {Becker, Marc and Lang, Michel and Richter, Jakob and Bischl, Bernd and Schalk, Daniel},
  title  = {mlr3tuning: Hyperparameter Optimization for 'mlr3'},
  note   = {R package version 1.2.1, https://mlr3tuning.mlr-org.com/},
  year   = {2024},
  url    = {https://github.com/mlr-org/mlr3tuning}
}

% Duplicate definition of the citation key "caret" removed here.
% The key is already defined earlier in this file with fuller metadata
% (number, doi, url); a repeated key triggers a "repeated entry" error and
% makes it ambiguous which record a citation resolves to.

% Zenodo software record for MLJ.jl v0.5.3.
@software{mlj,
  author    = {Anthony Blaom and Franz Kiraly and Thibaut Lienart and Sebastian Vollmer},
  title     = {alan-turing-institute/MLJ.jl: v0.5.3},
  version   = {v0.5.3},
  publisher = {Zenodo},
  month     = nov,
  year      = 2019,
  doi       = {10.5281/zenodo.3541506},
  url       = {https://doi.org/10.5281/zenodo.3541506}
}

% R Journal article on the checkmate package.
% Title and journal are no longer wrapped wholesale in a second pair of braces;
% only the package name and "R" are protected so the style can apply its casing.
@article{checkmate,
  author  = {Michel Lang},
  title   = {{checkmate}: Fast Argument Checks for Defensive {R} Programming},
  journal = {The {R} Journal},
  volume  = {9},
  number  = {1},
  pages   = {437--445},
  year    = {2017},
  doi     = {10.32614/RJ-2017-028}
}

% CRAN manual entry for the paradox package.
@manual{paradox,
  author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder},
  title  = {paradox: Define and Work with Parameter Spaces for Complex Algorithms},
  note   = {R package version 1.0.1},
  year   = {2024},
  url    = {https://CRAN.R-project.org/package=paradox}
}

% JMLR article on mlr3pipelines.
% Only the package name and "R" are brace-protected instead of the whole title.
@article{mlr3pipelines2021,
  author  = {Binder, Martin and Pfisterer, Florian and Lang, Michel and Schneider, Lennart and Kotthoff, Lars and Bischl, Bernd},
  title   = {{mlr3pipelines} - Flexible Machine Learning Pipelines in {R}},
  journal = {Journal of Machine Learning Research},
  volume  = {22},
  number  = {184},
  pages   = {1--7},
  year    = {2021},
  url     = {http://jmlr.org/papers/v22/21-0281.html}
}

% Bioinformatics article on mlr3proba.
% The month uses the bare three-letter macro, the DOI is lower-cased, and only
% the package name and "R" are brace-protected in the title.
@article{Sonabend2021,
  author    = {Sonabend, Raphael and Kir{\'{a}}ly, Franz J. and Bender, Andreas and Bischl, Bernd and Lang, Michel},
  title     = {{mlr3proba}: an {R} package for machine learning in survival analysis},
  journal   = {Bioinformatics},
  publisher = {Oxford Academic},
  volume    = {37},
  number    = {17},
  pages     = {2789--2791},
  month     = sep,
  year      = {2021},
  issn      = {1367-4803},
  doi       = {10.1093/bioinformatics/btab039},
  url       = {https://academic.oup.com/bioinformatics/article/37/17/2789/6125361}
}

% Chapter on large-scale benchmarking in the mlr3 book.
@incollection{benchlargescale,
  author    = {Sebastian Fischer and Michel Lang and Marc Becker},
  title     = {Large-Scale Benchmarking},
  booktitle = {Applied Machine Learning Using {m}lr3 in {R}},
  editor    = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
  publisher = {CRC Press},
  year      = {2024},
  url       = {https://mlr3book.mlr-org.com/large-scale_benchmarking.html}
}
163 changes: 163 additions & 0 deletions paper/paper.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
---
title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3'
Copy link
Collaborator

@bblodfon bblodfon Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternative suggestions:

  1. mlr3extralearners: Expanding the mlr3 Ecosystem with Community-Driven Learner Integration (personal favorite)
  2. mlr3extralearners: A Unified Interface for Diverse Machine Learning Algorithms in the mlr3 framework

tags:
sebffischer marked this conversation as resolved.
Show resolved Hide resolved
- R
- machine learning
- community
- FAIR
# Author list of the paper front matter. Each entry has a display name, an
# optional ORCID iD under the key `orcid`, and the index (or comma-separated
# indices) of the matching entries in the `affiliations` list.
authors:
  - name: Sebastian Fischer
    orcid: 0000-0002-9609-3197
    affiliation: "2, 3"
  - name: John Zobolas
    orcid: 0000-0002-3609-8674
    affiliation: 4
  # NOTE(review): no affiliation is listed for this author — confirm before submission.
  - name: Raphael Sonabend
    orcid: 0000-0001-9225-4654
  - name: Marc Becker
    orcid: 0000-0002-8115-0400
    affiliation: 2
  - name: Michel Lang
    orcid: 0000-0001-9754-0393
    affiliation: "1, 2"
  - name: Martin Binder
    affiliation: 2
  - name: Lennart Schneider
    orcid: 0000-0003-4152-5308
    affiliation: "2, 3"
  - name: Lukas Burk
    orcid: 0000-0001-7528-3795
    affiliation: "2, 3"
  - name: Patrick Schratz
    orcid: 0000-0003-0748-6624
    affiliation: 2
  - name: Byron C. Jaeger
    orcid: 0000-0001-7399-2299
    affiliation: 13
  - name: Stephen A. Lauer
    orcid: 0000-0003-2948-630X
    affiliation: 7
  - name: Lorenz A. Kapsner
    # orcid: not provided
    affiliation: 8
  - name: Maximilian Mücke
    orcid: 0009-0000-9432-9795
    affiliation: 2
  - name: Zezhi Wang
    # orcid: not provided
    affiliation: 9
  - name: Keenan Ganz
    orcid: 0000-0002-8486-3959
    affiliation: 10
  - name: Henri Funk
    orcid: 0009-0007-0949-8385
    affiliation: "3, 11, 12"
  - name: Philipp Kopper
    orcid: 0000-0002-5037-7135
    affiliation: 3
  - name: Andreas Bender
    orcid: 0000-0001-5628-8611
    affiliation: "2, 3"
  - name: Bernd Bischl
    orcid: 0000-0001-6002-6980
    affiliation: "2, 3, 5, 6"
affiliations:
- name: TU Dortmund University, Germany
index: 1
- name: Department of Statistics, LMU Munich, Germany
index: 2
- name: Munich Center for Machine Learning (MCML), Germany
index: 3
- name: Department of Cancer Genetics, Institute for Cancer Research, Oslo University Hospital, Norway
index: 4
- name: Leibniz Institute for Prevention Research and Epidemiology (BIPS), Bremen, Germany
index: 5
- name: Faculty of Mathematics and Computer Science, University of Bremen, Germany
index: 6
- name: Certilytics, Inc., 9200 Shelbyville Rd, Louisville, KY, 40222, USA
index: 7
- name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany
index: 8
- name: Department of Statistics and Finance/International Institute of Finance, School of Management, University of Science and Technology of China, Hefei, Anhui, China
index: 9
- name: School of Environmental and Forest Sciences, University of Washington, Seattle
index: 10
- name: Department of Geography, LMU Munich, Germany
index: 11
- name: Statistical Consulting Unit StaBLab, LMU Munich, Germany
index: 12
- name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health Sciences, Winston-Salem, North Carolina
index: 13
date: XXX December 2024
bibliography: paper.bib
---

# Summary

The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem.
The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024].
At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework.
The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis.
This enables users to seamlessly access and utilize these learners directly within their workflows.
It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale].
An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html).

Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem.
This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes.
Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages.
sebffischer marked this conversation as resolved.
Show resolved Hide resolved
In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/).
sebffischer marked this conversation as resolved.
Show resolved Hide resolved

# Statement of Need
sebffischer marked this conversation as resolved.
Show resolved Hide resolved

Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as performance, interpretability, or compatibility with specific data types and tasks.
To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels].
For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax.
Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in R.

Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external machine learning algorithms integrated into the `mlr3` framework.
This ensures that users can access a wide variety of learners to meet their specific needs,
and choose the most appropriate learner for their specific problem.
While connecting new learners to `mlr3` is straightforward and can be done on a per-need basis, integrating them into `mlr3extralearners` benefits the broader community by avoiding redundant effort and ensuring accessibility for all users.
Additionally, contributions to `mlr3extralearners` are reviewed by the package maintainers, providing a layer of quality assurance.
This review process ensures that integrated learners function as expected and adhere to the high standards of the `mlr3` ecosystem.

Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of machine learning packages.
By integrating a new algorithm into the `mlr3` ecosystem, developers can immediately make their methods accessible to a wider audience.
This integration facilitates seamless tuning [@mlr3tuning] and preprocessing [@mlr3pipelines2021] through the broader `mlr3` framework, enhancing the usability and impact of their work.

# Features

The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem.
By doing so, many different learning algorithms can be used with the same syntax and standardized interface.
However, the benefits of `mlr3extralearners` do not stop at mere integration.

## Metadata

One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata.
For one, the parameter space of each learner is specified as a parameter set using the [`paradox` package](https://paradox.mlr-org.com/) [@paradox].
Parameters are explicitly typed and their ranges or lists of available values are annotated; this information is used both to check for valid configurations and to allow for easier parameter tuning.
Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have.
The latter are described via a set of standardized properties, which include, e.g., the ability to do feature selection, to assign importance scores to features, or to handle missing values.

## Functional Correctness

One problem that manifests when integrating learning algorithms from different R packages is that their API can change.
The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated.
In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly.

In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners.
These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so.

## Templates for new Learners

In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself, as well as associated test files.
These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that are filled out as much as possible and clearly indicate what needs to be added by the user.
The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list of [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors.

# Acknowledgements

Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research
Foundation) – 460135501 (NFDI project MaRDI).
John Zobolas received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 101016851, project PANCAIM.

# References
Loading