mlr-org · sebffischer · Dec 3, 2024 · Dec 3, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml
@@ -0,0 +1,28 @@
+name: Draft PDF
+on:
+  push:
+    paths:
+      - paper/**
+      - .github/workflows/draft-pdf.yml
+
+jobs:
+  paper:
+    runs-on: ubuntu-latest
+    name: Paper Draft
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build draft PDF
+        uses: openjournals/openjournals-draft-action@master
+        with:
+          journal: joss
+          # This should be the path to the paper within your repo.
+          paper-path: paper/paper.md
+      - name: Upload
+        uses: actions/upload-artifact@v4
+        with:
+          name: paper
+          # This is the output path where Pandoc will write the compiled
+          # PDF. Note, this should be the same directory as the input
+          # paper.md
+          path: paper/paper.pdf
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -0,0 +1,161 @@
+@book{Bischl2024,
+    title = {Applied Machine Learning Using {m}lr3 in {R}},
+    editor = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
+    url = {https://mlr3book.mlr-org.com},
+    year = {2024},
+    isbn = {9781032507545},
+    publisher = {CRC Press}
+}
+
+@article{Lang2019,
+    author = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd},
+    doi = {10.21105/JOSS.01903},
+    issn = {2475-9066},
+    journal = {Journal of Open Source Software},
+    month = {dec},
+    number = {44},
+    pages = {1903},
+    publisher = {The Open Journal},
+    title = {{mlr3: A modern object-oriented machine learning framework in R}},
+    url = {https://joss.theoj.org/papers/10.21105/joss.01903},
+    volume = {4},
+    year = {2019}
+}
+
+@Manual{R,
+    title = {R: A Language and Environment for Statistical Computing},
+    author = {{R Core Team}},
+    organization = {R Foundation for Statistical Computing},
+    address = {Vienna, Austria},
+    year = {2019},
+    url = {https://www.R-project.org/},
+}
+
+@article{mlr,
+    author  = {Bernd Bischl and Michel Lang and Lars Kotthoff and Julia Schiffner and Jakob Richter and Erich Studerus and Giuseppe Casalicchio and Zachary M. Jones},
+    title   = {mlr: Machine Learning in R},
+    journal = {Journal of Machine Learning Research},
+    year    = {2016},
+    volume  = {17},
+    number  = {170},
+    pages   = {1--5},
+    url     = {http://jmlr.org/papers/v17/15-066.html}
+}
+
+@Manual{data.table,
+    title = {data.table: Extension of `data.frame`},
+    author = {Matt Dowle and Arun Srinivasan},
+    year = {2019},
+    note = {R package version 1.12.6},
+    url = {https://CRAN.R-project.org/package=data.table},
+}
+
+@article{sk-learn,
+    title={Scikit-learn: Machine Learning in {P}ython},
+    author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+        and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+            and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+            Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+    journal={Journal of Machine Learning Research},
+    volume={12},
+    pages={2825--2830},
+    year={2011},
+    url = {http://jmlr.org/papers/v12/pedregosa11a.html}
+}
+
+@article{caret,
+   author = {Max Kuhn},
+   title = {Building Predictive Models in R Using the caret Package},
+   journal = {Journal of Statistical Software},
+   volume = {28},
+   number = {5},
+   year = {2008},
+   keywords = {},
+   pages = {1--26},
+   doi = {10.18637/jss.v028.i05},
+   url = {https://www.jstatsoft.org/v028/i05}
+}
+
+@article{weka,
+  title       = {{The WEKA Data Mining Software: An Update}},
+  author      = {Hall, M. and Frank, E. and Holmes, G. and Pfahringer, B. and Reutemann, P. and Witten, I. H. },
+  journal     = {ACM SIGKDD explorations newsletter},
+  volume      = {11},
+  number      = {1},
+  pages       = {10--18},
+  year        = {2009},
+  publisher   = {ACM},
+  doi         = {10.1145/1656274.1656278}
+}
+
+@Manual{tidymodels,
+  title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.},
+  author = {Max Kuhn and Hadley Wickham},
+  url = {https://www.tidymodels.org},
+  year = {2020},
+}
+
+@software{mlj,
+  author       = {Anthony Blaom and Franz Kiraly and Thibaut Lienart and Sebastian Vollmer},
+  title        = {alan-turing-institute/MLJ.jl: v0.5.3},
+  month        = nov,
+  year         = 2019,
+  publisher    = {Zenodo},
+  version      = {v0.5.3},
+  doi          = {10.5281/zenodo.3541506},
+  url          = {https://doi.org/10.5281/zenodo.3541506}
+}
+
+@article{checkmate,
+  author = {Michel Lang},
+  title = {{checkmate: Fast Argument Checks for Defensive R Programming}},
+  year = {2017},
+  journal = {{The R Journal}},
+  doi = {10.32614/RJ-2017-028},
+  pages = {437--445},
+  volume = {9},
+  number = {1}
+}
+
+@Manual{paradox,
+  title = {paradox: Define and Work with Parameter Spaces for Complex Algorithms},
+  author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder},
+  year = {2024},
+  note = {R package version 1.0.1},
+  url = {https://CRAN.R-project.org/package=paradox}
+}
+
+@article{mlr3pipelines2021,
+  author = {Binder, Martin and Pfisterer, Florian and Lang, Michel and Schneider, Lennart and Kotthoff, Lars and Bischl, Bernd},
+  journal = {Journal of Machine Learning Research},
+  number = {184},
+  pages = {1--7},
+  title = {{mlr3pipelines - Flexible Machine Learning Pipelines in R}},
+  url = {http://jmlr.org/papers/v22/21-0281.html},
+  volume = {22},
+  year = {2021}
+}
+
+@article{Sonabend2021,
+  author = {Sonabend, Raphael and Kir{\'{a}}ly, Franz J. and Bender, Andreas and Bischl, Bernd and Lang, Michel},
+  doi = {10.1093/BIOINFORMATICS/BTAB039},
+  issn = {1367-4803},
+  journal = {Bioinformatics},
+  month = {sep},
+  number = {17},
+  pages = {2789--2791},
+  publisher = {Oxford Academic},
+  title = {{mlr3proba: an R package for machine learning in survival analysis}},
+  url = {https://academic.oup.com/bioinformatics/article/37/17/2789/6125361},
+  volume = {37},
+  year = {2021}
+}
+
+@incollection{benchlargescale,
+  author = "Sebastian Fischer and Michel Lang and Marc Becker",
+  title = "Large-Scale Benchmarking",
+  booktitle = "Applied Machine Learning Using {m}lr3 in {R}",
+  publisher = "CRC Press", year = "2024",
+  editor = "Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang",
+  url = "https://mlr3book.mlr-org.com/large-scale_benchmarking.html"
+}
diff --git a/paper/paper.md b/paper/paper.md
@@ -0,0 +1,158 @@
+---
+title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3'
+tags:
+  - R
+  - machine learning
+  - community
+  - FAIR
+authors:
+  - name: Sebastian Fischer
+    orcid: 0000-0002-9609-3197
+    affiliation: "2, 3"
+  - name: John Zobolas
+    orcid: 0000-0002-3609-8674
+    affiliation: 4
+  - name: Raphael Sonabend
+    orcid: 0000-0001-9225-4654
+  - name: Marc Becker
+    orcid: 0000-0002-8115-0400
+    affiliation: 2
+  - name: Michel Lang
+    orcid: 0000-0001-9754-0393
+    affiliation: "1, 2"
+  - name: Martin Binder
+    affiliation: 2
+  - name: Lennart Schneider
+    orcid: 0000-0003-4152-5308
+    affiliation: "2, 3"
+  - name: Lukas Burk
+    orcid: 0000-0001-7528-3795
+    affiliation: "2, 3"
+  - name: Patrick Schratz
+    orcid: 0000-0003-0748-6624
+    affiliation: 2
+  - name: Byron C. Jaeger
+    orcid: 0000-0001-7399-2299
+    affiliation: 13
+  - name: Stephen A. Lauer
+    orchid: 
+    affiliation: 7
+  - name: Lorenz A. Kapsner
+    orchid: 
+    affiliation: 8
+  - name: Maximilian Mücke
+    orcid: 0009-0000-9432-9795
+    affiliation: 2
+  - name: Zezhi Wang 
+    orchid: 
+    affiliation: 9
+  - name: Keenan Ganz
+    orcid: 0000-0002-8486-3959
+    affiliation: 10
+  - name: Henri Funk
+    orcid: 0009-0007-0949-8385
+    affiliation: "3, 11, 12"
+  - name: Philipp Kopper
+    orcid: 0000-0002-5037-7135
+    affiliation: 3
+  - name: Andreas Bender
+    orcid: 0000-0001-5628-8611
+    affiliation: "2, 3"
+  - name: Bernd Bischl
+    orcid: 0000-0001-6002-6980
+    affiliation: "2, 3, 5, 6"
+affiliations:
+ - name: TU Dortmund University, Germany
+   index: 1
+ - name: Department of Statistics, LMU Munich, Germany
+   index: 2
+ - name: Munich Center for Machine Learning (MCML), Germany
+   index: 3
+ - name: Department of Cancer Genetics, Institute for Cancer Research, Oslo University Hospital, Norway
+   index: 4
+ - name: Leibniz Institute for Prevention Research and Epidemiology (BIPS), Bremen, Germany
+   index: 5
+ - name: Faculty of Mathematics and Computer Science, University of Bremen, Germany
+   index: 6
+ - name: Certilytics, Inc., Louisville, Kentucky
+   index: 7
+ - name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany
+   index: 8
+ - name: Department of Statistics and Finance/International Institute of Finance, School of Management, University of Science and Technology of China, Hefei, Anhui, China
+   index: 9
+ - name: School of Environmental and Forest Sciences, University of Washington, Seattle
+   index: 10
+ - name: Department of Geography, LMU Munich, Germany
+   index: 11
+ - name: Statistical Consulting Unit StaBLab, LMU Munich, Germany
+   index: 12
+ - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health Sciences, Winston-Salem, North Carolina
+   index: 13
+date: XXX December 2024
+bibliography: paper.bib
+---
+
+# Summary
+
+The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem.
+The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024].
+At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework.
+The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis.
+This enables users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale], leveraging the `mlr3` framework's parallelization and optimization capabilities.
+An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html).
+
+Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem.
+This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes.
+Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages.
+In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/).
+
+- Comparison with other packages:
+  - parsnip tidymodels
+  - ???
+
+# Statement of Need
+
+In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types and modeling tasks.
+For this reason, it is essential for the `mlr3` ecosystem to offer a wide variety of learners, such that users can choose the most appropriate learner for their specific problem.
+While connecting a new learner to `mlr3` is straightforward and can be done on an as-needed basis, integrating learners into `mlr3extralearners` also makes them available to other users and avoids duplication of effort.
+Furthermore, contributing to `mlr3extralearners` has the added benefit that the learners are reviewed by the maintainers of the package, ensuring that they are correct and work as expected.
+
+Besides the advantage for users of machine learning methods, `mlr3extralearners` also offers benefits for package developers.
+After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps [@mlr3pipelines2021].
+
+# Features
+
+The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem.
+By doing so, many different learning algorithms can be used with the same syntax and standardized interface.
+However, the benefits of `mlr3extralearners` do not stop at mere integration.
+
+## Metadata
+
+One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata.
+For one, the parameter spaces of learners are specified as parameter sets, as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox].
+Parameters are explicitly typed, and their ranges or lists of available values are annotated; this information is used both to check for valid configurations and to allow for easier parameter tuning.
+Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have.
+The latter are described via a set of standardized properties, which include, e.g., the ability to do feature selection, to assign importance scores to features, or to handle missing values.
+
+## Functional Correctness
+
+One problem that manifests when integrating learning algorithms from different R packages is that their API can change.
+The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated.
+In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly.
+
+In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners.
+These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so.
+
+## Templates for new Learners
+
+In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself and the associated test files.
+These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that are filled out as far as possible and clearly indicate what still needs to be added by the user.
+The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list of [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors.
+
+# Acknowledgements
+
+Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research
+Foundation) – 460135501 (NFDI project MaRDI).
+John Zobolas received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 101016851, project PANCAIM.
+
+# References