mlr-org · sebffischer · Dec 3, 2024 · Dec 3, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml
@@ -0,0 +1,28 @@
+name: Draft PDF
+on:
+  push:
+    paths:
+      - 'paper/**'
+      - '.github/workflows/draft-pdf.yml'
+
+jobs:
+  paper:
+    name: Paper Draft
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build draft PDF
+        uses: openjournals/openjournals-draft-action@master
+        with:
+          journal: joss
+          # Repository-relative location of the paper source file.
+          paper-path: paper/paper.md
+      - name: Upload
+        uses: actions/upload-artifact@v4
+        with:
+          name: paper
+          # Location of the compiled PDF that gets uploaded. Pandoc
+          # writes it next to the input, so this must be the same
+          # directory as paper.md.
+          path: paper/paper.pdf
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -0,0 +1,186 @@
+@book{Bischl2024,
+    title = {Applied Machine Learning Using {m}lr3 in {R}},
+    editor = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang},
+    url = {https://mlr3book.mlr-org.com},
+    year = {2024},
+    isbn = {9781032507545},
+    publisher = {CRC Press}
+}
+
+@article{Lang2019,
+    author = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd},
+    doi = {10.21105/JOSS.01903},
+    issn = {2475-9066},
+    journal = {Journal of Open Source Software},
+    month = {dec},
+    number = {44},
+    pages = {1903},
+    publisher = {The Open Journal},
+    title = {{mlr3: A modern object-oriented machine learning framework in R}},
+    url = {https://joss.theoj.org/papers/10.21105/joss.01903},
+    volume = {4},
+    year = {2019}
+}
+
+@Manual{R,
+    title = {R: A Language and Environment for Statistical Computing},
+    author = {{R Core Team}},
+    organization = {R Foundation for Statistical Computing},
+    address = {Vienna, Austria},
+    year = {2019},
+    url = {https://www.R-project.org/},
+}
+
+@article{mlr,
+    author  = {Bernd Bischl and Michel Lang and Lars Kotthoff and Julia Schiffner and Jakob Richter and Erich Studerus and Giuseppe Casalicchio and Zachary M. Jones},
+    title   = {mlr: Machine Learning in R},
+    journal = {Journal of Machine Learning Research},
+    year    = {2016},
+    volume  = {17},
+    number  = {170},
+    pages   = {1-5},
+    url     = {http://jmlr.org/papers/v17/15-066.html}
+}
+
+@Manual{data.table,
+    title = {data.table: Extension of `data.frame`},
+    author = {Matt Dowle and Arun Srinivasan},
+    year = {2019},
+    note = {R package version 1.12.6},
+    url = {https://CRAN.R-project.org/package=data.table},
+}
+
+@article{sk-learn,
+    title={Scikit-learn: Machine Learning in {P}ython},
+    author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+        and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+            and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+            Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+    journal={Journal of Machine Learning Research},
+    volume={12},
+    pages={2825--2830},
+    year={2011},
+    url = {http://jmlr.org/papers/v12/pedregosa11a.html}
+}
+
+@article{caret,
+   author = {Max Kuhn},
+   title = {Building Predictive Models in R Using the caret Package},
+   journal = {Journal of Statistical Software, Articles},
+   volume = {28},
+   number = {5},
+   year = {2008},
+   keywords = {},
+   pages = {1--26},
+   doi = {10.18637/jss.v028.i05},
+   url = {https://www.jstatsoft.org/v028/i05}
+}
+
+@article{weka,
+  title       = {{The WEKA Data Mining Software: An Update}},
+  author      = {Hall, M. and Frank, E. and Holmes, G. and Pfahringer, B. and Reutemann, P. and Witten, I. H. },
+  journal     = {ACM SIGKDD explorations newsletter},
+  volume      = {11},
+  number      = {1},
+  pages       = {10--18},
+  year        = {2009},
+  publisher   = {ACM},
+  doi         = {10.1145/1656274.1656278}
+}
+
+@Manual{tidymodels,
+  title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.},
+  author = {Max Kuhn and Hadley Wickham},
+  url = {https://www.tidymodels.org},
+  year = {2020}
+}
+
+@Manual{parsnip,
+  title = {parsnip: A Common API to Modeling and Analysis Functions},
+  author = {Max Kuhn and Davis Vaughan},
+  year = {2024},
+  note = {R package version 1.2.1, https://parsnip.tidymodels.org/},
+  url = {https://github.com/tidymodels/parsnip}
+}
+
+@Manual{mlr3tuning,
+  title = {mlr3tuning: Hyperparameter Optimization for 'mlr3'},
+  author = {Becker, Marc and Lang, Michel and Richter, Jakob and Bischl, Bernd and Schalk, Daniel},
+  year = {2024},
+  note = {R package version 1.2.1, https://mlr3tuning.mlr-org.com/},
+  url = {https://github.com/mlr-org/mlr3tuning}
+}
+
+@comment{caret,
+  title={Building predictive models in R using the caret package},
+  author={Kuhn, Max},
+  journal={Journal of statistical software},
+  volume={28},
+  pages={1--26},
+  year={2008}
+}
+
+@software{mlj,
+  author       = {Anthony Blaom and Franz Kiraly and Thibaut Lienart and Sebastian Vollmer},
+  title        = {alan-turing-institute/MLJ.jl: v0.5.3},
+  month        = nov,
+  year         = 2019,
+  publisher    = {Zenodo},
+  version      = {v0.5.3},
+  doi          = {10.5281/zenodo.3541506},
+  url          = {https://doi.org/10.5281/zenodo.3541506}
+}
+
+@article{checkmate,
+  author = {Michel Lang},
+  title = {{checkmate: Fast Argument Checks for Defensive R Programming}},
+  year = {2017},
+  journal = {{The R Journal}},
+  doi = {10.32614/RJ-2017-028},
+  pages = {437--445},
+  volume = {9},
+  number = {1}
+}
+
+@Manual{paradox,
+  title = {paradox: Define and Work with Parameter Spaces for Complex Algorithms},
+  author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder},
+  year = {2024},
+  note = {R package version 1.0.1},
+  url = {https://CRAN.R-project.org/package=paradox}
+}
+
+@article{mlr3pipelines2021,
+  author = {Binder, Martin and Pfisterer, Florian and Lang, Michel and Schneider, Lennart and Kotthoff, Lars and Bischl, Bernd},
+  journal = {Journal of Machine Learning Research},
+  number = {184},
+  pages = {1--7},
+  title = {{mlr3pipelines - Flexible Machine Learning Pipelines in R}},
+  url = {http://jmlr.org/papers/v22/21-0281.html},
+  volume = {22},
+  year = {2021}
+}
+
+@article{Sonabend2021,
+  author = {Sonabend, Raphael and Kir{\'{a}}ly, Franz J. and Bender, Andreas and Bischl, Bernd and Lang, Michel},
+  doi = {10.1093/BIOINFORMATICS/BTAB039},
+  issn = {1367-4803},
+  journal = {Bioinformatics},
+  month = {sep},
+  number = {17},
+  pages = {2789--2791},
+  publisher = {Oxford Academic},
+  title = {{mlr3proba: an R package for machine learning in survival analysis}},
+  url = {https://academic.oup.com/bioinformatics/article/37/17/2789/6125361},
+  volume = {37},
+  year = {2021}
+}
+
+@incollection{benchlargescale,
+  author = "Sebastian Fischer and Michel Lang and Marc Becker",
+  title = "Large-Scale Benchmarking",
+  booktitle = "Applied Machine Learning Using {m}lr3 in {R}",
+  publisher = "CRC Press", year = "2024",
+  editor = "Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang",
+  url = "https://mlr3book.mlr-org.com/large-scale_benchmarking.html"
+}
diff --git a/paper/paper.md b/paper/paper.md
@@ -0,0 +1,163 @@
+---
+title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3'
+tags:
+  - R
+  - machine learning
+  - community
+  - FAIR
+authors:
+  - name: Sebastian Fischer
+    orcid: 0000-0002-9609-3197
+    affiliation: "2, 3"
+  - name: John Zobolas
+    orcid: 0000-0002-3609-8674
+    affiliation: 4
+  - name: Raphael Sonabend
+    orcid: 0000-0001-9225-4654
+  - name: Marc Becker
+    orcid: 0000-0002-8115-0400
+    affiliation: 2
+  - name: Michel Lang
+    orcid: 0000-0001-9754-0393
+    affiliation: "1, 2"
+  - name: Martin Binder
+    affiliation: 2
+  - name: Lennart Schneider
+    orcid: 0000-0003-4152-5308
+    affiliation: "2, 3"
+  - name: Lukas Burk
+    orcid: 0000-0001-7528-3795
+    affiliation: "2, 3"
+  - name: Patrick Schratz
+    orcid: 0000-0003-0748-6624
+    affiliation: 2
+  - name: Byron C. Jaeger
+    orcid: 0000-0001-7399-2299
+    affiliation: 13
+  - name: Stephen A. Lauer
+    orcid: 0000-0003-2948-630X
+    affiliation: 7
+  - name: Lorenz A. Kapsner
+    # orcid: not provided yet (fill in, or remove this line)
+    affiliation: 8
+  - name: Maximilian Mücke
+    orcid: 0009-0000-9432-9795
+    affiliation: 2
+  - name: Zezhi Wang
+    # orcid: not provided yet (fill in, or remove this line)
+    affiliation: 9
+  - name: Keenan Ganz
+    orcid: 0000-0002-8486-3959
+    affiliation: 10
+  - name: Henri Funk
+    orcid: 0009-0007-0949-8385
+    affiliation: "3, 11, 12"
+  - name: Philipp Kopper
+    orcid: 0000-0002-5037-7135
+    affiliation: 3
+  - name: Andreas Bender
+    orcid: 0000-0001-5628-8611
+    affiliation: "2, 3"
+  - name: Bernd Bischl
+    orcid: 0000-0001-6002-6980
+    affiliation: "2, 3, 5, 6"
+affiliations:
+ - name: TU Dortmund University, Germany
+   index: 1
+ - name: Department of Statistics, LMU Munich, Germany
+   index: 2
+ - name: Munich Center for Machine Learning (MCML), Germany
+   index: 3
+ - name: Department of Cancer Genetics, Institute for Cancer Research, Oslo University Hospital, Norway
+   index: 4
+ - name: Leibniz Institute for Prevention Research and Epidemiology (BIPS), Bremen, Germany
+   index: 5
+ - name: Faculty of Mathematics and Computer Science, University of Bremen, Germany
+   index: 6
+ - name: Certilytics, Inc., 9200 Shelbyville Rd, Louisville, KY, 40222, USA
+   index: 7
+ - name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany
+   index: 8
+ - name: Department of Statistics and Finance/International Institute of Finance, School of Management, University of Science and Technology of China, Hefei, Anhui, China
+   index: 9
+ - name: School of Environmental and Forest Sciences, University of Washington, Seattle
+   index: 10
+ - name: Department of Geography, LMU Munich, Germany
+   index: 11
+ - name: Statistical Consulting Unit StaBLab, LMU Munich, Germany
+   index: 12
+ - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health Sciences, Winston-Salem, North Carolina
+   index: 13
+date: XXX December 2024
+bibliography: paper.bib
+---
+
+# Summary
+
+The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem.
+The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024].
+At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework.
+The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis.
+This enables users to seamlessly access and utilize these learners directly within their workflows.
+It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale].
+An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html).
+
+Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem.
+This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes.
+Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages.
+In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/).
+
+# Statement of Need
+
+Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as performance, interpretability, or compatibility with specific data types and tasks.
+To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels].
+For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax.
+Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in R.
+
+Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external machine learning algorithms integrated into the `mlr3` framework.
+This ensures that users can access a wide variety of learners to meet their specific needs,
+and choose the most appropriate learner for their specific problem.
+While connecting new learners to `mlr3` is straightforward and can be done on a per-need basis, integrating them into `mlr3extralearners` benefits the broader community by avoiding redundant effort and ensuring accessibility for all users.
+Additionally, contributions to `mlr3extralearners` are reviewed by the package maintainers, providing a layer of quality assurance.
+This review process ensures that integrated learners function as expected and adhere to the high standards of the `mlr3` ecosystem.
+
+Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of machine learning packages.
+By integrating a new algorithm into the `mlr3` ecosystem, developers can immediately make their methods accessible to a wider audience.
+This integration facilitates seamless tuning [@mlr3tuning] and preprocessing [@mlr3pipelines2021] through the broader `mlr3` framework, enhancing the usability and impact of their work.
+
+# Features
+
+The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem.
+By doing so, many different learning algorithms can be used with the same syntax and standardized interface.
+However, the benefits of `mlr3extralearners` do not stop at mere integration.
+
+## Metadata
+
+One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata.
+For one, the parameter spaces of learners are defined as parameter sets using the [`paradox` package](https://paradox.mlr-org.com/) [@paradox].
+Parameters are explicitly typed, and their ranges or lists of available values are annotated; this information is used both to check for valid configurations and to allow for easier parameter tuning.
+Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have.
+The latter are described via a set of standardized properties, which include, e.g., the ability to do feature selection, to assign importance scores to features, or to handle missing values.
+
+## Functional Correctness
+
+One problem that manifests when integrating learning algorithms from different R packages is that their API can change.
+The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated.
+In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly.
+
+In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners.
+These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so.
+
+## Templates for new Learners
+
+In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself, as well as associated test files.
+These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that are filled out as much as possible and clearly indicate what still needs to be added by the user.
+The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list of [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors.
+
+# Acknowledgements
+
+Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research
+Foundation) – 460135501 (NFDI project MaRDI).
+John Zobolas received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 101016851, project PANCAIM.
+
+# References