Merge pull request #31 from boost-R/refactor_noncyc

Refactor noncyclical fitting code
boost-R · Apr 4, 2017 · bb9a1da · bb9a1da
2 parents 4f21eb6 + 6309410
commit bb9a1da
Show file tree

Hide file tree

Showing 12 changed files with 292 additions and 463 deletions.
diff --git a/R/AAA.R b/R/AAA.R
@@ -3,3 +3,6 @@
     ### the variance of the negative gradient
     options(gamboostLSS_stab_ngrad = FALSE)
 }
+
+# get rid of NOTEs in R CMD check for "undefined global functions or variables"
+globalVariables(c("ngradient", "y", "combined_risk"))
diff --git a/R/cvrisk.nc_mboostLSS.R b/R/cvrisk.nc_mboostLSS.R
@@ -99,7 +99,7 @@ plot.nc_cvriskLSS <- function(x, type = "lines",
                               main = attr(x, "type"),
                               ...) {
     if (type != "lines")
-        warning("Only ", sQuote('type = "lines"'), " supported for non-cyclical fitting")
+        warning("Only ", sQuote('type = "lines"'), " supported for noncyclical fitting")
     plot.cvriskLSS(x = x, type = "lines", xlab = xlab, ylab = ylab, 
                     ylim = ylim, main = main, ...)
 }

diff --git a/R/helpers.R b/R/helpers.R
@@ -119,20 +119,20 @@ rescale_weights <- function(w) {
 
 ## helper function in a modified version based on mboost_2.2-3
 ## print trace of boosting iterations
-do_trace <- function(current, mstart, risk,
-                     linebreak = options("width")$width / 2, mstop = 1000) {
-    current <- current - mstart
+do_trace <- function(current, risk, mstart,
+                     linebreak = round(options("width")$width / 2), mstop = 1000) {
+
+    current <- current - mstart  
+
     if (current != mstop) {
-        if ((current - 1) %/% linebreak == (current - 1) / linebreak) {
+        if (current %% linebreak == 1) {
             mchr <- formatC(current + mstart, format = "d",
                             width = nchar(mstop) + 1, big.mark = "'")
             cat(paste("[", mchr, "] ",sep = ""))
-        } else {
-            if ((current %/% linebreak != current / linebreak)) {
-                cat(".")
-            } else {
-                cat(" -- risk:", risk[current + mstart], "\n")
-            }
+        }
+        cat(".")
+        if (current %% linebreak == 0) {
+          cat(" -- risk:", risk[current + mstart], "\n")
         }
     } else {
         cat("\nFinal risk:", risk[current + mstart], "\n")

diff --git a/R/mboostLSS.R b/R/mboostLSS.R
diff --git a/R/methods.R b/R/methods.R
@@ -1,7 +1,7 @@
 ## Methods
 
 "[.mboostLSS" <- function(x, i, return = TRUE, ...) {
-    stopifnot((length(i) == 1 | length(i) == length(x)) && i > 0)
+    stopifnot(length(i) == 1 | length(i) == length(x))
     attr(x, "subset")(i)
     if (return) return(x)
     invisible(NULL)
@@ -455,8 +455,7 @@ stabsel.mboostLSS <- function(x, cutoff, q, PFER,
             if (nsel >= q)
                 break
         }
-        #this changes nothing for method = "cycling" but fixes mstop for
-        #method = "inner" or "outer"
+        #this changes nothing for method = "cyclic" but fixes mstop for method = "noncyclic"
         mstop <- check(mstop, "mstop", names(x))
         ## complete paths
         if (any(sapply(xs, length) < mstop)) {
@@ -476,7 +475,7 @@ stabsel.mboostLSS <- function(x, cutoff, q, PFER,
         ret <- unlist(ret)
 
         ## compute selection paths
-        #merging for method cycling
+        #merging for method cyclic
         if(!inherits(x, "nc_mboostLSS")){
             sequences <- lapply(1:length(xs), function(i) {
                 res <- matrix(FALSE, nrow = length(nms[[i]]), ncol = mstop[[i]])

diff --git a/README.md b/README.md
@@ -22,6 +22,8 @@ For installation instructions see below.
 Instructions on how to use `gamboostLSS` can be found here:
 - [gamboostLSS tutorial](https://www.jstatsoft.org/article/view/v074i01).
 
+Details on the noncyclical fitting method can be found here:
+- [noncyclical fitting](https://arxiv.org/abs/1611.10171); This is a preleminary version currently under review.
 
 ## Issues & Feature Requests
 

diff --git a/appveyor.yml b/appveyor.yml
@@ -12,21 +12,13 @@ install:
   - ps: Bootstrap
 
 # Adapt as necessary starting from here
-
-# environment:
-#   matrix:
-#   - TEST_DIR: patch
-#   - TEST_DIR: pkg
-
 before_build:
-#   - cp ../travis-tool.sh ./travis-tool.sh
-#   - cp travis-tool.sh.cmd %TEST_DIR%/travis-tool.sh.cmd
-#   - cd %TEST_DIR%
   - bash -c "echo '^travis-tool\.sh\.cmd$' >> .Rbuildignore"
-#   - dir
 
 build_script:
   - travis-tool.sh install_deps
+  - travis-tool.sh install_github hofnerb/stabs
+  - travis-tool.sh install_github boost-R/mboost
 
 test_script:
   - travis-tool.sh run_tests

diff --git a/inst/CITATION b/inst/CITATION
@@ -59,4 +59,24 @@ citEntry(
 )
 
 
+citEntry(
+  entry= "TechReport",
+  title = "Stability selection for component-wise gradient boosting in multiple dimensions",
+  author   =  personList(as.person("Janek Thomas"),
+                         as.person("Andreas Mayr"),
+			 as.person("Bernd Bischl"),
+			 as.person("Matthias Schmid"),
+			 as.person("Adam Smith"),
+                         as.person("Benjamin Hofner")),
+  year	   = "2016",
+  institution = "ArXiv",
+  header   = "To cite the noncyclical fitting method of 'gamboostLSS' use:",
+  url = "https://arxiv.org/abs/1611.10171",
+  textVersion =
+    paste("Thomas, J., Mayr, A., Bischl, B., Schmid, M., Smith, A., and Hofner, B. (2016).",
+          "Stability selection for component-wise gradient boosting in multiple dimensions.",
+          "arXiv preprint arXiv:1611.10171.")
+)
+
+
 citFooter('\nUse ', sQuote('toBibtex(citation("gamboostLSS"))'), ' to extract BibTeX references.')
diff --git a/man/cvrisk.Rd b/man/cvrisk.Rd
@@ -40,13 +40,13 @@ make.grid(max, length.out = 10, min = NULL, log = TRUE,
     bootstrap samples.
   }
   \item{grid}{
-    If the model was fitted with \code{method = "cycling"}, grid is
+    If the model was fitted with \code{method = "cyclic"}, grid is
     a matrix of stopping parameters the empirical risk is to be evaluated for. 
     Each row represents a parameter combination. The number of columns must be 
-    equal to the number of     parameters of the GAMLSS family. Per default, 
+    equal to the number of parameters of the GAMLSS family. Per default, 
     \code{make.grid(mstop(object))} is used.
 
-    Otherwise (i.e., for \code{method = "inner"} or \code{method = "outer"}) grid
+    Otherwise (i.e., for \code{method = "noncyclic"}) grid
     is a vector of mstop values. Per default all steps up to the intial stopping 
     iteration, i.e., \code{1:mstop(object)} are used.
   }
@@ -132,8 +132,7 @@ make.grid(max, length.out = 10, min = NULL, log = TRUE,
   fold to fold. The heatmap shows only the average risk but in a nicer
   fashion.
 
-  For the noncyclical fitting methods (i.e. \code{method = "inner"} or 
-  \code{method = "outer"}) only the line plot for exists.
+  For the \code{method = "noncyclic"} only the line plot for exists.
 
   Hofner et al. (2015) provide a detailed description of
   cross-validation for \code{\link{gamboostLSS}} models and show a

diff --git a/man/mboostLSS.Rd b/man/mboostLSS.Rd
@@ -18,13 +18,17 @@
 }
 \usage{
 mboostLSS(formula, data = list(), families = GaussianLSS(),
-          control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...)
+          control = boost_control(), weights = NULL, 
+          method = c("cyclic", "noncyclic"), ...)
 glmboostLSS(formula, data = list(), families = GaussianLSS(),
-            control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...)
+            control = boost_control(), weights = NULL, 
+            method = c("cyclic", "noncyclic"), ...)
 gamboostLSS(formula, data = list(), families = GaussianLSS(),
-            control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...)
+            control = boost_control(), weights = NULL, 
+            method = c("cyclic", "noncyclic"), ...)
 blackboostLSS(formula, data = list(), families = GaussianLSS(),
-              control = boost_control(), weights = NULL,method = c("cycling", "inner", "outer"), ...)
+              control = boost_control(), weights = NULL,
+              method = c("cyclic", "noncyclic"), ...)
 
 ## fit function:
 mboostLSS_fit(formula, data = list(), families = GaussianLSS(),
@@ -58,9 +62,9 @@ mboostLSS_fit(formula, data = list(), families = GaussianLSS(),
   \item{call}{ used to forward the call from \code{mboostLSS},
     \code{glmboostLSS}, \code{gamboostLSS} and \code{blackboostLSS}.
     This argument should not be directly specified by users!}
-  \item{method}{ fitting method, currently three methods are supported:
-  \code{"cycling"}, \code{"inner"} and \code{"outer"}. The latter two require a
-  one dimensional \code{mstop} value.}
+  \item{method}{ fitting method, currently two methods are supported:
+  \code{"cyclic"} and \code{"noncycli"}. The latter two requires a one dimensional \code{mstop} 
+    value.}
   \item{\dots}{Further arguments to be passed to \code{mboostLSS_fit}.
     In  \code{mboostLSS_fit}, \code{\dots} represent further arguments to be
     passed to \code{\link{mboost}} and \code{\link{mboost_fit}}. So
@@ -99,7 +103,7 @@ mboostLSS_fit(formula, data = list(), families = GaussianLSS(),
   function without \code{LSS}. For further possible arguments see
   these functions as well as \code{\link{mboost_fit}}.
 
-  For \code{method = "cycling"} it is possible to specify one or
+  For \code{method = "cyclic"} it is possible to specify one or
   multiple \code{mstop} and \code{nu} values via
   \code{\link{boost_control}}. In the case of one single value, this
   value is used for all distribution parameters of the GAMLSS model. 
@@ -113,12 +117,11 @@ mboostLSS_fit(formula, data = list(), families = GaussianLSS(),
   specify, e.g., \code{mstop = 100} via \code{\link{boost_control}}. For
   more-dimensional stopping, one can specify, e.g., \code{mstop =
     list(mu = 100, sigma = 200)} (see examples).
-  If \code{method} is set to \code{"inner"} or \code{"outer"}, \code{mstop} has
+  If \code{method} is set to \code{"noncyclic"}, \code{mstop} has
   to be a one dimensional integer. Instead of cycling through all distribution 
-  parameters, in each iteration only the best baselearner is used. For 
-  \code{"inner"} one baselearner of every parameter is selected via RSS, the 
-  distribution parameter is then chosen via the loss. For \code{"outer"} the
-  selection is solely based on the (outer) loss.
+  parameters, in each iteration only the best baselearner is used. One baselearner of every 
+  parameter is selected via RSS, the distribution parameter is then chosen via the loss.
+  For details on the noncyclic fitting method see Thomas et. al. (2016).
 
   To (potentially) stabilize the model estimation by standardizing the
   negative gradients one can use the argument \code{stabilization} of
@@ -152,6 +155,10 @@ Statistical Society, Series C (Applied Statistics), 54, 507-554.
 
 Buehlmann, P. and Hothorn, T. (2007), Boosting algorithms: Regularization,
 prediction and model fitting. Statistical Science, 22(4), 477--505.
+
+Thomas, J., Mayr, A., Bischl, B., Schmid, M., Smith, A., and Hofner, B. (2016), 
+Stability selection for component-wise gradient boosting in multiple dimensions. 
+arXiv preprint arXiv:1611.10171.
 }
 
 \seealso{

diff --git a/man/stabsel.mboostLSS.Rd b/man/stabsel.mboostLSS.Rd
@@ -127,7 +127,7 @@ dat <- data.frame(x1, x2, x3, x4, x5, x6, y)
 ### linear model with y ~ . for both components: 400 boosting iterations
 model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat,
                      control = boost_control(mstop = 400),
-                     center = TRUE, method = "inner")
+                     center = TRUE, method = "noncyclic")
 
 \donttest{### Do not test the following code per default on CRAN as it takes some time to run: