diff --git a/Changes.md b/Changes.md
index 732f0de19ac7..333619f7f537 100644
--- a/Changes.md
+++ b/Changes.md
@@ -2,6 +2,88 @@

 ## H2O

+### 3.46.0.5 - 8/28/2024
+
+Download at: http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/5/index.html
+
+#### Bug Fix
+- [[#16328]](https://github.com/h2oai/h2o-3/issues/16328) - Updated how ModelSelection handles categorical predictors to preserve the best categorical predictor when the best categorical level performs well relative to other predictors.
+- [[#16120]](https://github.com/h2oai/h2o-3/issues/16120) - Ensured MOJO works for Isolation Forest and Extended Isolation Forest across all implemented versions.
+
+#### New Feature
+- [[#16327]](https://github.com/h2oai/h2o-3/issues/16327) - Ensured H2O-3 can load data from Snowflake using the JDBC connector.
+
+#### Docs
+- [[#16215]](https://github.com/h2oai/h2o-3/issues/16215) - Updated the following user guide pages to adhere to style guide updates: Algorithms, Supported data types, Quantiles, and Early stopping.
+- [[#16207]](https://github.com/h2oai/h2o-3/issues/16207) - Updated the Starting H2O user guide page to adhere to style guide updates.
+- [[#15989]](https://github.com/h2oai/h2o-3/issues/15989) - Updated the Python documentation for the Decision Tree algorithm.
+
+#### Security
+- [[#16349]](https://github.com/h2oai/h2o-3/issues/16349) - Addressed sonatype-2024-0171 by upgrading jackson-databind to 2.17.2.
+- [[#16342]](https://github.com/h2oai/h2o-3/issues/16342) - Addressed SNYK-JAVA-DNSJAVA-7547403, SNYK-JAVA-DNSJAVA-7547404, SNYK-JAVA-DNSJAVA-7547405, and CVE-2024-25638 by upgrading dnsjava to 3.6.0.
+
+### 3.46.0.4 - 7/9/2024
+
+Download at: http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/4/index.html
+
+#### Docs
+- [[#16212]](https://github.com/h2oai/h2o-3/issues/16212) - Updated the H2O Clients user guide page.
+- [[#16214]](https://github.com/h2oai/h2o-3/issues/16214) - Updated the Data Manipulation user guide page.
+- [[#16213]](https://github.com/h2oai/h2o-3/issues/16213) - Updated the Getting data into your H2O cluster user guide page.
+
+#### Security
+- [[#15748]](https://github.com/h2oai/h2o-3/issues/15748) - Addressed PRISMA-2023-0067 by upgrading jackson-databind.
+
+### 3.46.0.3 - 6/11/2024
+
+Download at: http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/3/index.html
+
+#### Bug Fix
+- [[#16274]](https://github.com/h2oai/h2o-3/issues/16274) - Fixed plotting for H2O Explainability by resolving an issue in the matplotlib wrapper.
+- [[#16192]](https://github.com/h2oai/h2o-3/issues/16192) - Fixed `h2o.findSynonyms` failing if the `word` parameter is unknown to the Word2Vec model.
+- [[#15947]](https://github.com/h2oai/h2o-3/issues/15947) - Fixed a `skipped_columns` error caused by a mismatch during the call to `parse_setup` when constructing an `H2OFrame`.
+
+#### Improvement
+- [[#16278]](https://github.com/h2oai/h2o-3/issues/16278) - Added a flag to enable `use_multi_thread` automatically when using `as_data_frame` (a usage sketch follows this section).
+
+#### New Feature
+- [[#16284]](https://github.com/h2oai/h2o-3/issues/16284) - Added support for WebSockets to steam.jar.
+
+#### Docs
+- [[#16189]](https://github.com/h2oai/h2o-3/issues/16189) - Updated the Downloading & Installing H2O user guide page.
+- [[#16288]](https://github.com/h2oai/h2o-3/issues/16288) - Fixed the GBM Python example in the user guide.
+- [[#16188]](https://github.com/h2oai/h2o-3/issues/16188) - Updated the API-related changes page to adhere to style guide requirements.
+- [[#16016]](https://github.com/h2oai/h2o-3/issues/16016) - Added examples to the Python documentation for Uplift DRF.
+- [[#15988]](https://github.com/h2oai/h2o-3/issues/15988) - Added examples to the Python documentation for Isotonic Regression.
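For the #16278 improvement above, a minimal sketch of the conversion path it speeds up. This is illustrative only: it assumes the `use_multi_thread` keyword on `H2OFrame.as_data_frame` in the Python client of this release line, and that a multi-threaded conversion backend is installed; verify both against your client version.

```python
# Minimal sketch of multi-threaded H2OFrame -> pandas conversion (#16278).
# Assumption: the Python client exposes a `use_multi_thread` keyword on
# `as_data_frame`; check the signature in your installed version.
import h2o

h2o.init()
frame = h2o.H2OFrame({"x": [1, 2, 3, 4], "y": ["a", "b", "c", "d"]})

# Single-threaded conversion (the long-standing default).
df_single = frame.as_data_frame()

# Multi-threaded conversion; the release note above adds a flag that can
# turn this on automatically instead of passing it on every call.
df_multi = frame.as_data_frame(use_multi_thread=True)
print(df_multi.head())
```

The multi-threaded path is mainly intended to pay off on wide or very large frames, where funneling the download through a single thread dominates conversion time.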
+
+### 3.46.0.2 - 5/13/2024
+
+Download at: http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/2/index.html
+
+#### Bug Fix
+- [[#16161]](https://github.com/h2oai/h2o-3/issues/16161) - Fixed Parquet export throwing NPEs when column types are strings.
+- [[#16149]](https://github.com/h2oai/h2o-3/issues/16149) - Fixed GAM models failing on datasets of certain sizes by rebalancing the dataset to avoid collisions.
+- [[#16130]](https://github.com/h2oai/h2o-3/issues/16130) - Removed the `distutils` version check to stop deprecation warnings with Python 3.12.
+- [[#16026]](https://github.com/h2oai/h2o-3/issues/16026) - Removed `custom_metric_func` from ModelSelection.
+- [[#15697]](https://github.com/h2oai/h2o-3/issues/15697) - Fixed the MOJO failing to recognize `fold_column` and therefore using a wrong index for the `offset_column`.
+
+#### Improvement
+- [[#16116]](https://github.com/h2oai/h2o-3/issues/16116) - Implemented a warning that the output will not be monotone if you use monotone splines for GAM without setting `non_negative=True`.
+- [[#16066]](https://github.com/h2oai/h2o-3/issues/16066) - Added support to XGBoost for all `gblinear` parameters.
+- [[#6722]](https://github.com/h2oai/h2o-3/issues/6722) - Added linear constraint support to the GLM toolbox (see the sketch after this section).
+
+#### New Feature
+- [[#16146]](https://github.com/h2oai/h2o-3/issues/16146) - Added ZSTD compression format support.
+
+#### Docs
+- [[#16193]](https://github.com/h2oai/h2o-3/issues/16193) - Added mapr7.0 to the download page under the Install on Hadoop tab.
+- [[#16180]](https://github.com/h2oai/h2o-3/issues/16180) - Updated the Index page to adhere to style guide requirements.
+- [[#16131]](https://github.com/h2oai/h2o-3/issues/16131) - Added the 3.46 release blog to the user guide.
+
+#### Security
+- [[#16170]](https://github.com/h2oai/h2o-3/issues/16170) - Addressed CVE-2024-21634 by upgrading aws-java-sdk-*.
+- [[#16135]](https://github.com/h2oai/h2o-3/issues/16135) - Addressed CVE-2024-29131 by upgrading commons-configuration2.
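To make the #6722 constraint support above concrete, a hedged Python sketch. The constraints-frame layout (`names`, `values`, `types`, `constraint_numbers` columns, with the constant term named `constant`) and the `linear_constraints` parameter follow my reading of the 3.46 GLM docs and the GLM.java changes later in this diff; treat every name here as an assumption to verify, not the definitive API.

```python
# Hedged sketch of a constrained GLM fit (#6722). Column and parameter names
# below are assumptions drawn from the 3.46 docs; verify locally.
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator

h2o.init()
train = h2o.H2OFrame({
    "x1": [0.5, 1.2, 2.3, 3.1, 4.8, 5.0],
    "x2": [1.0, 0.7, 2.2, 2.9, 3.3, 4.1],
    "y":  [1.1, 2.0, 4.4, 5.8, 8.0, 9.2],
})

# One "less than or equal" constraint (number 0): 1*x1 + 2*x2 - 1 <= 0.
# Each row is one term; rows sharing a constraint_number form one constraint.
constraints = h2o.H2OFrame({
    "names": ["x1", "x2", "constant"],   # "constant" = constant term (assumed name)
    "values": [1.0, 2.0, -1.0],
    "types": ["LessThanEqual"] * 3,
    "constraint_numbers": [0, 0, 0],
})

glm = H2OGeneralizedLinearEstimator(
    family="gaussian",
    solver="IRLSM",       # constrained GLM is implemented for the IRLSM solver
    lambda_=0.0,          # regularization is rejected for constrained GLM
    linear_constraints=constraints,
)
glm.train(x=["x1", "x2"], y="y", training_frame=train)
print(glm.coef())
```

Note how `lambda_=0.0` lines up with the sharpened error message in the GLM.java hunk later in this diff ("Regularization is not allowed for constrained GLM. Set lambda to 0.0.").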
+ ### 3.46.0.1 - 3/13/2024 Download at: http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/1/index.html diff --git a/build.gradle b/build.gradle index 9f34251638c6..57107b67810f 100644 --- a/build.gradle +++ b/build.gradle @@ -179,7 +179,7 @@ ext { // Versions of libraries shared cross all projects // junitVersion = '4.12' - awsJavaSdkVersion = '1.12.268' + awsJavaSdkVersion = '1.12.705' // // Optional H2O modules which can be included h2o.jar assembly diff --git a/docker/Jenkinsfile-build-docker b/docker/Jenkinsfile-build-docker index c6ae3654059f..61f541ee9575 100644 --- a/docker/Jenkinsfile-build-docker +++ b/docker/Jenkinsfile-build-docker @@ -22,7 +22,7 @@ IMAGE_NAME_PREFIX = 'harbor.h2o.ai/opsh2oai/h2o-3' JDK_VERSIONS = ['8', '10', '11', '12', '13', '14', '15', '16', '17'] JDK_VERSIONS_PY_R = ['8', '11', '17'] // stable, last-supported, latest PYTHON_VERSIONS = ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] -R_VERSION = ['3.3.3', '3.4.1', '3.5.3', '3.6.2', '4.0.2'] +R_VERSION = ['3.3.3', '3.4.1', '3.5.3', '3.6.2', '4.0.2', '4.4.0'] def pipelineContext diff --git a/docker/Makefile b/docker/Makefile index 1680a8c1f93b..9847d489c01c 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -276,6 +276,22 @@ ifeq ($(shell echo $(PUSH) | tr [:upper:] [:lower:] ),true) docker push harbor.h2o.ai/opsh2oai/h2o-3/dev-r-4.0.2-jdk-$*:$(VERSION) endif +ifneq ($(CI), 1) +dev-r-4.4.0-jdk-%: dev-r-4.4.0 +endif +dev-r-4.4.0-jdk-%: + docker build -t harbor.h2o.ai/opsh2oai/h2o-3/dev-r-4.4.0-jdk-$*:$(VERSION) \ + $(NO_CACHE) \ + -f jenkins-images/Dockerfile-jdk-others \ + --build-arg FROM_VERSION=$(VERSION) \ + --build-arg FROM_IMAGE=harbor.h2o.ai/opsh2oai/h2o-3/dev-r-4.4.0 \ + --build-arg INSTALL_JAVA_VERSION=$* \ + --build-arg H2O_BRANCH=$(H2O_BRANCH) \ + . +ifeq ($(shell echo $(PUSH) | tr [:upper:] [:lower:] ),true) + docker push harbor.h2o.ai/opsh2oai/h2o-3/dev-r-4.4.0-jdk-$*:$(VERSION) +endif + ifneq ($(CI), 1) dev-r-%: dev-r-base endif diff --git a/docker/jenkins-images/Dockerfile-r b/docker/jenkins-images/Dockerfile-r index e9b592580a9f..4acda2697da8 100644 --- a/docker/jenkins-images/Dockerfile-r +++ b/docker/jenkins-images/Dockerfile-r @@ -9,7 +9,8 @@ ENV R_VERSION=${R_VERSION} # In C, global variables with multiple tentative definitions now result in linker errors. With -fcommon such definitions # are silently merged during linking.). I tried using -fcommon but without much luck so due to the time constrain I # decided to use gcc-9 instead. 
-RUN apt-get update && apt-get install -y gcc-9 gfortran-9 g++-9 tcl-dev tk-dev && apt-get clean && apt-get autoremove -y && \ +RUN apt-get update && apt-get install -y gcc-9 gfortran-9 g++-9 tcl-dev tk-dev tidy && \ + apt-get clean && apt-get autoremove -y && \ rm -rf /var/cache/apt/* &&\ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99 --slave /usr/bin/g++ g++ /usr/bin/g++-9 \ --slave /usr/bin/gcov gcov /usr/bin/gcov-9 && update-alternatives --install /usr/bin/gfortran f95 /usr/bin/gfortran-9 99 diff --git a/docker/prisma/Dockerfile b/docker/prisma/Dockerfile index f0cf92efca0c..60eb3b381dac 100644 --- a/docker/prisma/Dockerfile +++ b/docker/prisma/Dockerfile @@ -8,6 +8,4 @@ RUN for dir in $DIRECTORIES; do \ chown -R 2117:2117 /$dir; \ done -RUN npm install snyk -g - CMD ["/bin/bash"] diff --git a/docker/scripts/install_R_version b/docker/scripts/install_R_version index bdc9d89e5a13..193e226e7ddc 100755 --- a/docker/scripts/install_R_version +++ b/docker/scripts/install_R_version @@ -111,7 +111,8 @@ function getAndInstallFromCRAN(){ NAME="$1" VERSION="$2" getAndInstall "https://cran.r-project.org/src/contrib/Archive/${NAME}/${NAME}_${VERSION}.tar.gz" "$NAME" "$VERSION" || \ - getAndInstall "https://cran.r-project.org/src/contrib/${NAME}_${VERSION}.tar.gz" "$NAME" "$VERSION" || exit 1 + getAndInstall "https://cran.r-project.org/src/contrib/${NAME}_${VERSION}.tar.gz" "$NAME" "$VERSION" || \ + getAndInstall "https://cloud.r-project.org/src/contrib/Archive/${NAME}/${NAME}_${VERSION}.tar.gz" "$NAME" "$VERSION" || exit 1 set -x } @@ -175,214 +176,452 @@ function getAndInstallFromCRAN(){ # Install dependencies echo "Installing dependencies for R ${R_VERSION}" -getAndInstallFromCRAN abind 1.4-5 -getAndInstallFromCRAN acepack 1.4.1 -getAndInstallFromCRAN AUC 0.3.0 -getAndInstallFromCRAN backports 1.3.0 -getAndInstallFromCRAN base64enc 0.1-3 -getAndInstallFromCRAN bit 4.0.4 -getAndInstallFromCRAN bitops 1.0-7 -getAndInstallFromCRAN boot 1.3-18 -getAndInstallFromCRAN brew 1.0-6 -getAndInstallFromCRAN brio 1.1.2 -getAndInstallFromCRAN clipr 0.7.1 -getAndInstallFromCRAN cluster 2.0.5 -getAndInstallFromCRAN codetools 0.2-15 -getAndInstallFromCRAN colorspace 2.0-2 -getAndInstallFromCRAN commonmark 1.7 -getAndInstallFromCRAN cpp11 0.4.1 -getAndInstallFromCRAN crayon 1.4.2 -getAndInstallFromCRAN curl 4.3.2 -getAndInstallFromCRAN data.table 1.14.2 -getAndInstallFromCRAN DEoptimR 1.0-9 -getAndInstallFromCRAN digest 0.6.28 -getAndInstallFromCRAN diptest 0.76-0 -getAndInstallFromCRAN evaluate 0.14 -getAndInstallFromCRAN fansi 0.5.0 -getAndInstallFromCRAN farver 2.1.0 -getAndInstallFromCRAN fastmap 1.1.0 -getAndInstallFromCRAN foreign 0.8-67 -getAndInstallFromCRAN formatR 1.11 -getAndInstallFromCRAN Formula 1.2-4 -getAndInstallFromCRAN fs 1.5.0 -getAndInstallFromCRAN futile.options 1.0.1 -getAndInstallFromCRAN generics 0.1.1 -getAndInstallFromCRAN getopt 1.20.3 -getAndInstallFromCRAN gitcreds 0.1.1 -getAndInstallFromCRAN glue 1.5.0 -getAndInstallFromCRAN gtable 0.3.0 -getAndInstallFromCRAN gtools 3.9.2 -getAndInstallFromCRAN HDtweedie 1.1 -getAndInstallFromCRAN highlight 0.5.0 -getAndInstallFromCRAN ini 0.3.1 -getAndInstallFromCRAN isoband 0.2.5 -getAndInstallFromCRAN iterators 1.0.13 -getAndInstallFromCRAN jsonlite 1.7.2 -getAndInstallFromCRAN kernlab 0.9-29 -getAndInstallFromCRAN KernSmooth 2.23-15 -getAndInstallFromCRAN labeling 0.4.2 -getAndInstallFromCRAN lattice 0.20-34 -getAndInstallFromCRAN lazyeval 0.2.2 -getAndInstallFromCRAN LiblineaR 1.94-2 -getAndInstallFromCRAN magrittr 2.0.1 
-getAndInstallFromCRAN MASS 7.3-45 -getAndInstallFromCRAN mclust 5.4.8 -getAndInstallFromCRAN mime 0.12 -getAndInstallFromCRAN misc3d 0.9-1 -getAndInstallFromCRAN mlbench 2.1-3 -getAndInstallFromCRAN MLmetrics 1.0.0 -getAndInstallFromCRAN modeltools 0.2-23 -getAndInstallFromCRAN mvtnorm 1.0-0 -getAndInstallFromCRAN nnet 7.3-12 -getAndInstallFromCRAN packrat 0.7.0 -getAndInstallFromCRAN pixmap 0.4-12 -getAndInstallFromCRAN pkgconfig 2.0.3 -getAndInstallFromCRAN praise 1.0.0 -getAndInstallFromCRAN prettyunits 1.1.1 -getAndInstallFromCRAN proxy 0.4-20 -getAndInstallFromCRAN ps 1.6.0 -getAndInstallFromCRAN R.methodsS3 1.8.1 -getAndInstallFromCRAN R6 2.5.1 -getAndInstallFromCRAN randomForest 4.6-14 -getAndInstallFromCRAN rappdirs 0.3.3 -getAndInstallFromCRAN RColorBrewer 1.1-2 -getAndInstallFromCRAN Rcpp 1.0.12 -getAndInstallFromCRAN RcppParallel 5.1.4 -getAndInstallFromCRAN remotes 2.4.1 -getAndInstallFromCRAN rlang 1.0.0 -getAndInstallFromCRAN rpart 4.1-10 -getAndInstallFromCRAN rprojroot 2.0.2 -getAndInstallFromCRAN rstudioapi 0.13 -getAndInstallFromCRAN RUnit 0.4.32 -getAndInstallFromCRAN slam 0.1-40 -getAndInstallFromCRAN sourcetools 0.1.7 -getAndInstallFromCRAN SparseM 1.81 -getAndInstallFromCRAN sparsepp 1.22 -getAndInstallFromCRAN spatial 7.3-11 -getAndInstallFromCRAN statmod 1.4.36 -getAndInstallFromCRAN stringi 1.7.5 -getAndInstallFromCRAN svd 0.5 -getAndInstallFromCRAN sys 3.4 -getAndInstallFromCRAN systemfonts 0.2.3 -getAndInstallFromCRAN tweedie 2.3.5 -getAndInstallFromCRAN utf8 1.2.2 -getAndInstallFromCRAN uuid 1.0-3 -getAndInstallFromCRAN versions 0.3 -getAndInstallFromCRAN viridisLite 0.4.0 -getAndInstallFromCRAN whisker 0.4 -getAndInstallFromCRAN withr 2.4.2 -getAndInstallFromCRAN xfun 0.28 -getAndInstallFromCRAN xml2 1.3.2 -getAndInstallFromCRAN xtable 1.8-4 -getAndInstallFromCRAN yaml 2.2.1 -getAndInstallFromCRAN zip 2.2.0 -getAndInstallFromCRAN checkmate 2.0.0 -getAndInstallFromCRAN bit64 4.0.5 -getAndInstallFromCRAN caTools 1.17.1.4 -getAndInstallFromCRAN RCurl 1.95-4.12 -getAndInstallFromCRAN munsell 0.5.0 -getAndInstallFromCRAN diffobj 0.3.5 -getAndInstallFromCRAN robustbase 0.93-7 -getAndInstallFromCRAN lambda.r 1.2.4 -getAndInstallFromCRAN optparse 1.7.1 -getAndInstallFromCRAN cli 3.1.0 -getAndInstallFromCRAN gridExtra 2.3 -getAndInstallFromCRAN foreach 1.5.1 -getAndInstallFromCRAN Matrix 1.2-8 -getAndInstallFromCRAN nlme 3.1-131 -getAndInstallFromCRAN sp 1.4-5 -getAndInstallFromCRAN class 7.3-14 -getAndInstallFromCRAN prabclus 2.3-2 -getAndInstallFromCRAN plot3D 1.4 -getAndInstallFromCRAN flexmix 2.3-17 -getAndInstallFromCRAN R.oo 1.24.0 -getAndInstallFromCRAN processx 3.5.2 -getAndInstallFromCRAN latticeExtra 0.6-28 -getAndInstallFromCRAN RcppArmadillo 0.10.7.3.0 -getAndInstallFromCRAN cachem 1.0.6 -getAndInstallFromCRAN ellipsis 0.3.2 -getAndInstallFromCRAN htmltools 0.5.2 -getAndInstallFromCRAN later 1.3.0 -getAndInstallFromCRAN lifecycle 1.0.1 -getAndInstallFromCRAN purrr 0.3.4 -getAndInstallFromCRAN desc 1.4.0 -getAndInstallFromCRAN stringr 1.4.0 -getAndInstallFromCRAN askpass 1.1 -getAndInstallFromCRAN highr 0.9 -getAndInstallFromCRAN tinytex 0.35 -getAndInstallFromCRAN rversions 2.1.1 -getAndInstallFromCRAN gplots 3.1.1 -getAndInstallFromCRAN futile.logger 1.4.3 -getAndInstallFromCRAN sessioninfo 1.2.1 -getAndInstallFromCRAN glmnet 2.0-2 -getAndInstallFromCRAN irlba 2.3.3 -getAndInstallFromCRAN survival 2.44-1.1 -getAndInstallFromCRAN xgboost 1.0.0.2 -getAndInstallFromCRAN mgcv 1.8-17 -getAndInstallFromCRAN ade4 1.7-18 -getAndInstallFromCRAN e1071 1.7-9 
-getAndInstallFromCRAN flexclust 1.4-0 -getAndInstallFromCRAN fpc 2.2-9 -getAndInstallFromCRAN R.utils 2.11.0 -getAndInstallFromCRAN callr 3.7.0 -getAndInstallFromCRAN xopen 1.0.0 -getAndInstallFromCRAN memoise 2.0.0 -getAndInstallFromCRAN vctrs 0.3.8 -getAndInstallFromCRAN crosstalk 1.2.0 -getAndInstallFromCRAN fontawesome 0.2.2 -getAndInstallFromCRAN htmlwidgets 1.5.4 -getAndInstallFromCRAN jquerylib 0.1.4 -getAndInstallFromCRAN sass 0.4.0 -getAndInstallFromCRAN promises 1.2.0.1 -getAndInstallFromCRAN scales 1.1.1 -getAndInstallFromCRAN pkgload 1.2.3 -getAndInstallFromCRAN openssl 1.4.5 -getAndInstallFromCRAN knitr 1.36 -getAndInstallFromCRAN ROCR 1.0-7 -getAndInstallFromCRAN text2vec 0.5.0 -getAndInstallFromCRAN coin 1.0-0 -getAndInstallFromCRAN gbm 2.1.3 -getAndInstallFromCRAN penalized 0.9-51 -getAndInstallFromCRAN RItools 0.1-17 -getAndInstallFromCRAN pkgbuild 1.2.0 -getAndInstallFromCRAN webshot 0.5.2 -getAndInstallFromCRAN pillar 1.6.4 -getAndInstallFromCRAN tidyselect 1.1.1 -getAndInstallFromCRAN bslib 0.3.1 -getAndInstallFromCRAN httpuv 1.6.3 -getAndInstallFromCRAN credentials 1.3.0 -getAndInstallFromCRAN httr 1.4.2 -getAndInstallFromCRAN rsconnect 0.8.24 -getAndInstallFromCRAN htmlTable 2.3.0 -getAndInstallFromCRAN rmarkdown 2.11 -getAndInstallFromCRAN roxygen2 7.2.2 -getAndInstallFromCRAN tables 0.9.6 -getAndInstallFromCRAN rcmdcheck 1.4.0 -getAndInstallFromCRAN repr 1.1.3 -getAndInstallFromCRAN tibble 3.1.6 -getAndInstallFromCRAN shiny 1.7.1 -getAndInstallFromCRAN gert 1.3.0 -getAndInstallFromCRAN gh 1.3.0 -getAndInstallFromCRAN uplift 0.3.5 -getAndInstallFromCRAN IRdisplay 1.0 -getAndInstallFromCRAN dplyr 1.0.7 -getAndInstallFromCRAN ggplot2 3.3.5 -getAndInstallFromCRAN rematch2 2.1.2 -getAndInstallFromCRAN miniUI 0.1.1.1 -getAndInstallFromCRAN shinyjs 2.0.0 -getAndInstallFromCRAN usethis 2.0.1 -getAndInstallFromCRAN tidyr 1.1.4 -getAndInstallFromCRAN viridis 0.6.2 -getAndInstallFromCRAN pkgdown 1.5.1 -getAndInstallFromCRAN waldo 0.3.1 -getAndInstallFromCRAN manipulateWidget 0.11.1 -getAndInstallFromCRAN plotly 4.10.0 -getAndInstallFromCRAN Hmisc 4.3-0 -getAndInstallFromCRAN testthat 3.1.0 -getAndInstallFromCRAN rgl 0.100.54 -getAndInstallFromCRAN devtools 2.4.2 -getAndInstallFromCRAN plot3Drgl 1.0.2 +if [ "$R_VERSION" = "4.4.0" ]; then + getAndInstallFromCRAN abind 1.4-5 + getAndInstallFromCRAN acepack 1.4.2 + getAndInstallFromCRAN AUC 0.3.2 + getAndInstallFromCRAN backports 1.5.0 + getAndInstallFromCRAN base64enc 0.1-3 + getAndInstallFromCRAN bit 4.0.5 + getAndInstallFromCRAN bitops 1.0-7 + getAndInstallFromCRAN boot 1.3-30 + getAndInstallFromCRAN brew 1.0-10 + getAndInstallFromCRAN brio 1.1.5 + getAndInstallFromCRAN cli 3.6.2 + getAndInstallFromCRAN clipr 0.8.0 + getAndInstallFromCRAN cluster 2.1.6 + getAndInstallFromCRAN codetools 0.2-20 + getAndInstallFromCRAN colorspace 2.1-0 + getAndInstallFromCRAN commonmark 1.9.1 + getAndInstallFromCRAN cpp11 0.4.7 + getAndInstallFromCRAN crayon 1.5.2 + getAndInstallFromCRAN curl 5.2.1 + getAndInstallFromCRAN data.table 1.15.4 + getAndInstallFromCRAN deldir 2.0-4 + getAndInstallFromCRAN DEoptimR 1.1-3 + getAndInstallFromCRAN digest 0.6.35 + getAndInstallFromCRAN diptest 0.77-1 + getAndInstallFromCRAN evaluate 0.23 + getAndInstallFromCRAN fansi 1.0.6 + getAndInstallFromCRAN farver 2.1.2 + getAndInstallFromCRAN fastmap 1.2.0 + getAndInstallFromCRAN float 0.3-2 + getAndInstallFromCRAN foreign 0.8-86 + getAndInstallFromCRAN formatR 1.14 + getAndInstallFromCRAN Formula 1.2-5 + getAndInstallFromCRAN fs 1.6.4 + getAndInstallFromCRAN 
futile.options 1.0.1 + getAndInstallFromCRAN generics 0.1.3 + getAndInstallFromCRAN getopt 1.20.4 + getAndInstallFromCRAN gitcreds 0.1.2 + getAndInstallFromCRAN glue 1.7.0 + getAndInstallFromCRAN gtools 3.9.5 + getAndInstallFromCRAN HDtweedie 1.2 + getAndInstallFromCRAN highlight 0.5.1 + getAndInstallFromCRAN ini 0.3.1 + getAndInstallFromCRAN isoband 0.2.7 + getAndInstallFromCRAN iterators 1.0.14 + getAndInstallFromCRAN jpeg 0.1-10 + getAndInstallFromCRAN jsonlite 1.8.8 + getAndInstallFromCRAN kernlab 0.9-32 + getAndInstallFromCRAN KernSmooth 2.23-24 + getAndInstallFromCRAN labeling 0.4.3 + getAndInstallFromCRAN lattice 0.22-6 + getAndInstallFromCRAN lazyeval 0.2.2 + getAndInstallFromCRAN LiblineaR 2.10-23 + getAndInstallFromCRAN magrittr 2.0.3 + getAndInstallFromCRAN MASS 7.3-60.2 + getAndInstallFromCRAN matrixStats 1.3.0 + getAndInstallFromCRAN mclust 6.1.1 + getAndInstallFromCRAN mime 0.12 + getAndInstallFromCRAN misc3d 0.9-1 + getAndInstallFromCRAN mlbench 2.1-5 + getAndInstallFromCRAN modeltools 0.2-23 + getAndInstallFromCRAN mvtnorm 1.2-5 + getAndInstallFromCRAN nnet 7.3-19 + getAndInstallFromCRAN packrat 0.9.2 + getAndInstallFromCRAN pixmap 0.4-13 + getAndInstallFromCRAN pkgconfig 2.0.3 + getAndInstallFromCRAN png 0.1-8 + getAndInstallFromCRAN praise 1.0.0 + getAndInstallFromCRAN prettyunits 1.2.0 + getAndInstallFromCRAN proxy 0.4-27 + getAndInstallFromCRAN ps 1.7.6 + getAndInstallFromCRAN R.methodsS3 1.8.2 + getAndInstallFromCRAN R6 2.5.1 + getAndInstallFromCRAN randomForest 4.7-1.1 + getAndInstallFromCRAN rappdirs 0.3.3 + getAndInstallFromCRAN RColorBrewer 1.1-3 + getAndInstallFromCRAN Rcpp 1.0.12 + getAndInstallFromCRAN RcppParallel 5.1.7 + getAndInstallFromCRAN remotes 2.5.0 + getAndInstallFromCRAN renv 1.0.7 + getAndInstallFromCRAN RhpcBLASctl 0.23-42 + getAndInstallFromCRAN rlang 1.1.3 + getAndInstallFromCRAN rpart 4.1.23 + getAndInstallFromCRAN rprojroot 2.0.4 + getAndInstallFromCRAN rstudioapi 0.16.0 + getAndInstallFromCRAN RUnit 0.4.33 + getAndInstallFromCRAN shape 1.4.6.1 + getAndInstallFromCRAN slam 0.1-50 + getAndInstallFromCRAN sourcetools 0.1.7-1 + getAndInstallFromCRAN SparseM 1.83 + getAndInstallFromCRAN sparsepp 1.22 + getAndInstallFromCRAN spatial 7.3-17 + getAndInstallFromCRAN statmod 1.5.0 + getAndInstallFromCRAN stringi 1.8.4 + getAndInstallFromCRAN svd 0.5.5 + getAndInstallFromCRAN sys 3.4.2 + getAndInstallFromCRAN tweedie 2.3.5 + getAndInstallFromCRAN utf8 1.2.4 + getAndInstallFromCRAN uuid 1.2-0 + getAndInstallFromCRAN versions 0.3 + getAndInstallFromCRAN viridisLite 0.4.2 + getAndInstallFromCRAN whisker 0.4.1 + getAndInstallFromCRAN withr 3.0.0 + getAndInstallFromCRAN xfun 0.44 + getAndInstallFromCRAN xtable 1.8-4 + getAndInstallFromCRAN yaml 2.3.8 + getAndInstallFromCRAN zip 2.3.1 + getAndInstallFromCRAN checkmate 2.3.1 + getAndInstallFromCRAN PKI 0.1-12 + getAndInstallFromCRAN bit64 4.0.5 + getAndInstallFromCRAN caTools 1.18.2 + getAndInstallFromCRAN RCurl 1.98-1.14 + getAndInstallFromCRAN sessioninfo 1.2.2 + getAndInstallFromCRAN munsell 0.5.1 + getAndInstallFromCRAN diffobj 0.3.5 + getAndInstallFromCRAN robustbase 0.99-2 + getAndInstallFromCRAN lambda.r 1.2.4 + getAndInstallFromCRAN optparse 1.7.5 + getAndInstallFromCRAN foreach 1.5.2 + getAndInstallFromCRAN Matrix 1.7-0 + getAndInstallFromCRAN nlme 3.1-164 + getAndInstallFromCRAN sp 2.1-4 + getAndInstallFromCRAN zoo 1.8-12 + getAndInstallFromCRAN class 7.3-22 + getAndInstallFromCRAN prabclus 2.3-3 + getAndInstallFromCRAN plot3D 1.4.1 + getAndInstallFromCRAN libcoin 1.0-10 + getAndInstallFromCRAN flexmix 
2.3-19 + getAndInstallFromCRAN R.oo 1.26.0 + getAndInstallFromCRAN desc 1.4.3 + getAndInstallFromCRAN lgr 0.4.4 + getAndInstallFromCRAN processx 3.8.4 + getAndInstallFromCRAN RcppArmadillo 0.12.8.3.0 + getAndInstallFromCRAN RcppEigen 0.3.4.0.0 + getAndInstallFromCRAN interp 1.1-6 + getAndInstallFromCRAN cachem 1.1.0 + getAndInstallFromCRAN ellipsis 0.3.2 + getAndInstallFromCRAN htmltools 0.5.8.1 + getAndInstallFromCRAN later 1.3.2 + getAndInstallFromCRAN lifecycle 1.0.4 + getAndInstallFromCRAN xml2 1.3.6 + getAndInstallFromCRAN askpass 1.2.0 + getAndInstallFromCRAN highr 0.11 + getAndInstallFromCRAN tinytex 0.51 + getAndInstallFromCRAN gplots 3.1.3.1 + getAndInstallFromCRAN futile.logger 1.4.3 + getAndInstallFromCRAN irlba 2.3.5.1 + getAndInstallFromCRAN MatrixExtra 0.1.15 + getAndInstallFromCRAN mlapi 0.1.1 + getAndInstallFromCRAN survival 3.6-4 + getAndInstallFromCRAN xgboost 1.7.7.1 + getAndInstallFromCRAN mgcv 1.9-1 + getAndInstallFromCRAN ade4 1.7-22 + getAndInstallFromCRAN sandwich 3.1-0 + getAndInstallFromCRAN e1071 1.7-14 + getAndInstallFromCRAN flexclust 1.4-2 + getAndInstallFromCRAN fpc 2.2-12 + getAndInstallFromCRAN R.utils 2.12.3 + getAndInstallFromCRAN callr 3.7.6 + getAndInstallFromCRAN xopen 1.0.1 + getAndInstallFromCRAN latticeExtra 0.6-30 + getAndInstallFromCRAN memoise 2.0.1 + getAndInstallFromCRAN crosstalk 1.2.1 + getAndInstallFromCRAN fontawesome 0.5.2 + getAndInstallFromCRAN jquerylib 0.1.4 + getAndInstallFromCRAN sass 0.4.9 + getAndInstallFromCRAN promises 1.3.0 + getAndInstallFromCRAN gtable 0.3.5 + getAndInstallFromCRAN scales 1.3.0 + getAndInstallFromCRAN systemfonts 1.1.0 + getAndInstallFromCRAN vctrs 0.6.5 + getAndInstallFromCRAN rversions 2.1.2 + getAndInstallFromCRAN urlchecker 1.0.1 + getAndInstallFromCRAN openssl 2.2.0 + getAndInstallFromCRAN knitr 1.47 + getAndInstallFromCRAN ROCR 1.0-11 + getAndInstallFromCRAN rsparse 0.5.1 + getAndInstallFromCRAN gbm 2.1.9 + getAndInstallFromCRAN glmnet 4.1-8 + getAndInstallFromCRAN penalized 0.9-52 + getAndInstallFromCRAN TH.data 1.1-2 + getAndInstallFromCRAN pkgbuild 1.4.4 + getAndInstallFromCRAN webshot 0.5.5 + getAndInstallFromCRAN bslib 0.7.0 + getAndInstallFromCRAN httpuv 1.6.15 + getAndInstallFromCRAN gridExtra 2.3 + getAndInstallFromCRAN textshaping 0.4.0 + getAndInstallFromCRAN downlit 0.4.3 + getAndInstallFromCRAN pillar 1.9.0 + getAndInstallFromCRAN purrr 1.0.2 + getAndInstallFromCRAN stringr 1.5.1 + getAndInstallFromCRAN tidyselect 1.2.1 + getAndInstallFromCRAN credentials 2.0.1 + getAndInstallFromCRAN httr 1.4.7 + getAndInstallFromCRAN httr2 1.0.1 + getAndInstallFromCRAN rsconnect 1.3.0 + getAndInstallFromCRAN tables 0.9.25 + getAndInstallFromCRAN MLmetrics 1.1.3 + getAndInstallFromCRAN text2vec 0.6.4 + getAndInstallFromCRAN multcomp 1.4-25 + getAndInstallFromCRAN pkgload 1.3.4 + getAndInstallFromCRAN rcmdcheck 1.4.0 + getAndInstallFromCRAN rmarkdown 2.27 + getAndInstallFromCRAN shiny 1.8.1.1 + getAndInstallFromCRAN ragg 1.3.2 + getAndInstallFromCRAN repr 1.1.7 + getAndInstallFromCRAN tibble 3.2.1 + getAndInstallFromCRAN gert 2.0.1 + getAndInstallFromCRAN gh 1.4.1 + getAndInstallFromCRAN coin 1.4-3 + getAndInstallFromCRAN roxygen2 7.3.1 + getAndInstallFromCRAN htmlwidgets 1.6.4 + getAndInstallFromCRAN miniUI 0.1.1.1 + getAndInstallFromCRAN shinyjs 2.1.0 + getAndInstallFromCRAN IRdisplay 1.1 + getAndInstallFromCRAN dplyr 1.1.4 + getAndInstallFromCRAN ggplot2 3.5.1 + getAndInstallFromCRAN pkgdown 2.0.9 + getAndInstallFromCRAN rematch2 2.1.2 + getAndInstallFromCRAN usethis 2.2.3 + getAndInstallFromCRAN htmlTable 
2.4.2 + getAndInstallFromCRAN profvis 0.3.8 + getAndInstallFromCRAN rgl 1.3.1 + getAndInstallFromCRAN tidyr 1.3.1 + getAndInstallFromCRAN viridis 0.6.5 + getAndInstallFromCRAN waldo 0.5.2 + getAndInstallFromCRAN plot3Drgl 1.0.4 + getAndInstallFromCRAN plotly 4.10.4 + getAndInstallFromCRAN RItools 0.3-3 + getAndInstallFromCRAN Hmisc 5.1-3 + getAndInstallFromCRAN testthat 3.2.1.1 + getAndInstallFromCRAN devtools 2.4.5 + getAndInstallFromCRAN uplift 0.3.5 + getAndInstallFromCRAN DT 0.33 + +else + + getAndInstallFromCRAN abind 1.4-5 + getAndInstallFromCRAN acepack 1.4.1 + getAndInstallFromCRAN AUC 0.3.0 + getAndInstallFromCRAN backports 1.3.0 + getAndInstallFromCRAN base64enc 0.1-3 + getAndInstallFromCRAN bit 4.0.4 + getAndInstallFromCRAN bitops 1.0-7 + getAndInstallFromCRAN boot 1.3-18 + getAndInstallFromCRAN brew 1.0-6 + getAndInstallFromCRAN brio 1.1.2 + getAndInstallFromCRAN clipr 0.7.1 + getAndInstallFromCRAN cluster 2.0.5 + getAndInstallFromCRAN codetools 0.2-15 + getAndInstallFromCRAN colorspace 2.0-2 + getAndInstallFromCRAN commonmark 1.7 + getAndInstallFromCRAN cpp11 0.4.1 + getAndInstallFromCRAN crayon 1.4.2 + getAndInstallFromCRAN curl 4.3.2 + getAndInstallFromCRAN data.table 1.14.2 + getAndInstallFromCRAN DEoptimR 1.0-9 + getAndInstallFromCRAN digest 0.6.28 + getAndInstallFromCRAN diptest 0.76-0 + getAndInstallFromCRAN evaluate 0.14 + getAndInstallFromCRAN fansi 0.5.0 + getAndInstallFromCRAN farver 2.1.0 + getAndInstallFromCRAN fastmap 1.1.0 + getAndInstallFromCRAN foreign 0.8-67 + getAndInstallFromCRAN formatR 1.11 + getAndInstallFromCRAN Formula 1.2-4 + getAndInstallFromCRAN fs 1.5.0 + getAndInstallFromCRAN futile.options 1.0.1 + getAndInstallFromCRAN generics 0.1.1 + getAndInstallFromCRAN getopt 1.20.3 + getAndInstallFromCRAN gitcreds 0.1.1 + getAndInstallFromCRAN glue 1.5.0 + getAndInstallFromCRAN gtable 0.3.0 + getAndInstallFromCRAN gtools 3.9.2 + getAndInstallFromCRAN HDtweedie 1.1 + getAndInstallFromCRAN highlight 0.5.0 + getAndInstallFromCRAN ini 0.3.1 + getAndInstallFromCRAN isoband 0.2.5 + getAndInstallFromCRAN iterators 1.0.13 + getAndInstallFromCRAN jsonlite 1.7.2 + getAndInstallFromCRAN kernlab 0.9-29 + getAndInstallFromCRAN KernSmooth 2.23-15 + getAndInstallFromCRAN labeling 0.4.2 + getAndInstallFromCRAN lattice 0.20-34 + getAndInstallFromCRAN lazyeval 0.2.2 + getAndInstallFromCRAN LiblineaR 1.94-2 + getAndInstallFromCRAN magrittr 2.0.1 + getAndInstallFromCRAN MASS 7.3-45 + getAndInstallFromCRAN mclust 5.4.8 + getAndInstallFromCRAN mime 0.12 + getAndInstallFromCRAN misc3d 0.9-1 + getAndInstallFromCRAN mlbench 2.1-3 + getAndInstallFromCRAN MLmetrics 1.0.0 + getAndInstallFromCRAN modeltools 0.2-23 + getAndInstallFromCRAN mvtnorm 1.0-0 + getAndInstallFromCRAN nnet 7.3-12 + getAndInstallFromCRAN packrat 0.7.0 + getAndInstallFromCRAN pixmap 0.4-12 + getAndInstallFromCRAN pkgconfig 2.0.3 + getAndInstallFromCRAN praise 1.0.0 + getAndInstallFromCRAN prettyunits 1.1.1 + getAndInstallFromCRAN proxy 0.4-20 + getAndInstallFromCRAN ps 1.6.0 + getAndInstallFromCRAN R.methodsS3 1.8.1 + getAndInstallFromCRAN R6 2.5.1 + getAndInstallFromCRAN randomForest 4.6-14 + getAndInstallFromCRAN rappdirs 0.3.3 + getAndInstallFromCRAN RColorBrewer 1.1-2 + getAndInstallFromCRAN Rcpp 1.0.12 + getAndInstallFromCRAN RcppParallel 5.1.4 + getAndInstallFromCRAN remotes 2.4.1 + getAndInstallFromCRAN rlang 1.0.0 + getAndInstallFromCRAN rpart 4.1-10 + getAndInstallFromCRAN rprojroot 2.0.2 + getAndInstallFromCRAN rstudioapi 0.13 + getAndInstallFromCRAN RUnit 0.4.32 + getAndInstallFromCRAN slam 0.1-40 + 
getAndInstallFromCRAN sourcetools 0.1.7 + getAndInstallFromCRAN SparseM 1.81 + getAndInstallFromCRAN sparsepp 1.22 + getAndInstallFromCRAN spatial 7.3-11 + getAndInstallFromCRAN statmod 1.4.36 + getAndInstallFromCRAN stringi 1.7.5 + getAndInstallFromCRAN svd 0.5 + getAndInstallFromCRAN sys 3.4 + getAndInstallFromCRAN systemfonts 0.2.3 + getAndInstallFromCRAN tweedie 2.3.5 + getAndInstallFromCRAN utf8 1.2.2 + getAndInstallFromCRAN uuid 1.0-3 + getAndInstallFromCRAN versions 0.3 + getAndInstallFromCRAN viridisLite 0.4.0 + getAndInstallFromCRAN whisker 0.4 + getAndInstallFromCRAN withr 2.4.2 + getAndInstallFromCRAN xfun 0.28 + getAndInstallFromCRAN xml2 1.3.2 + getAndInstallFromCRAN xtable 1.8-4 + getAndInstallFromCRAN yaml 2.2.1 + getAndInstallFromCRAN zip 2.2.0 + getAndInstallFromCRAN checkmate 2.0.0 + getAndInstallFromCRAN bit64 4.0.5 + getAndInstallFromCRAN caTools 1.17.1.4 + getAndInstallFromCRAN RCurl 1.95-4.12 + getAndInstallFromCRAN munsell 0.5.0 + getAndInstallFromCRAN diffobj 0.3.5 + getAndInstallFromCRAN robustbase 0.93-7 + getAndInstallFromCRAN lambda.r 1.2.4 + getAndInstallFromCRAN optparse 1.7.1 + getAndInstallFromCRAN cli 3.1.0 + getAndInstallFromCRAN gridExtra 2.3 + getAndInstallFromCRAN foreach 1.5.1 + getAndInstallFromCRAN Matrix 1.2-8 + getAndInstallFromCRAN nlme 3.1-131 + getAndInstallFromCRAN sp 1.4-5 + getAndInstallFromCRAN class 7.3-14 + getAndInstallFromCRAN prabclus 2.3-2 + getAndInstallFromCRAN plot3D 1.4 + getAndInstallFromCRAN flexmix 2.3-17 + getAndInstallFromCRAN R.oo 1.24.0 + getAndInstallFromCRAN processx 3.5.2 + getAndInstallFromCRAN latticeExtra 0.6-28 + getAndInstallFromCRAN RcppArmadillo 0.10.7.3.0 + getAndInstallFromCRAN cachem 1.0.6 + getAndInstallFromCRAN ellipsis 0.3.2 + getAndInstallFromCRAN htmltools 0.5.2 + getAndInstallFromCRAN later 1.3.0 + getAndInstallFromCRAN lifecycle 1.0.1 + getAndInstallFromCRAN purrr 0.3.4 + getAndInstallFromCRAN desc 1.4.0 + getAndInstallFromCRAN stringr 1.4.0 + getAndInstallFromCRAN askpass 1.1 + getAndInstallFromCRAN highr 0.9 + getAndInstallFromCRAN tinytex 0.35 + getAndInstallFromCRAN rversions 2.1.1 + getAndInstallFromCRAN gplots 3.1.1 + getAndInstallFromCRAN futile.logger 1.4.3 + getAndInstallFromCRAN sessioninfo 1.2.1 + getAndInstallFromCRAN glmnet 2.0-2 + getAndInstallFromCRAN irlba 2.3.3 + getAndInstallFromCRAN survival 2.44-1.1 + getAndInstallFromCRAN xgboost 1.0.0.2 + getAndInstallFromCRAN mgcv 1.8-17 + getAndInstallFromCRAN ade4 1.7-18 + getAndInstallFromCRAN e1071 1.7-9 + getAndInstallFromCRAN flexclust 1.4-0 + getAndInstallFromCRAN fpc 2.2-9 + getAndInstallFromCRAN R.utils 2.11.0 + getAndInstallFromCRAN callr 3.7.0 + getAndInstallFromCRAN xopen 1.0.0 + getAndInstallFromCRAN memoise 2.0.0 + getAndInstallFromCRAN vctrs 0.3.8 + getAndInstallFromCRAN crosstalk 1.2.0 + getAndInstallFromCRAN fontawesome 0.2.2 + getAndInstallFromCRAN htmlwidgets 1.5.4 + getAndInstallFromCRAN jquerylib 0.1.4 + getAndInstallFromCRAN sass 0.4.0 + getAndInstallFromCRAN promises 1.2.0.1 + getAndInstallFromCRAN scales 1.1.1 + getAndInstallFromCRAN pkgload 1.2.3 + getAndInstallFromCRAN openssl 1.4.5 + getAndInstallFromCRAN knitr 1.36 + getAndInstallFromCRAN ROCR 1.0-7 + getAndInstallFromCRAN text2vec 0.5.0 + getAndInstallFromCRAN coin 1.0-0 + getAndInstallFromCRAN gbm 2.1.3 + getAndInstallFromCRAN penalized 0.9-51 + getAndInstallFromCRAN RItools 0.1-17 + getAndInstallFromCRAN pkgbuild 1.2.0 + getAndInstallFromCRAN webshot 0.5.2 + getAndInstallFromCRAN pillar 1.6.4 + getAndInstallFromCRAN tidyselect 1.1.1 + getAndInstallFromCRAN bslib 0.3.1 
+ getAndInstallFromCRAN httpuv 1.6.3 + getAndInstallFromCRAN credentials 1.3.0 + getAndInstallFromCRAN httr 1.4.2 + getAndInstallFromCRAN rsconnect 0.8.24 + getAndInstallFromCRAN htmlTable 2.3.0 + getAndInstallFromCRAN rmarkdown 2.11 + getAndInstallFromCRAN roxygen2 7.2.2 + getAndInstallFromCRAN tables 0.9.6 + getAndInstallFromCRAN rcmdcheck 1.4.0 + getAndInstallFromCRAN repr 1.1.3 + getAndInstallFromCRAN tibble 3.1.6 + getAndInstallFromCRAN shiny 1.7.1 + getAndInstallFromCRAN gert 1.3.0 + getAndInstallFromCRAN gh 1.3.0 + getAndInstallFromCRAN uplift 0.3.5 + getAndInstallFromCRAN IRdisplay 1.0 + getAndInstallFromCRAN dplyr 1.0.7 + getAndInstallFromCRAN ggplot2 3.3.5 + getAndInstallFromCRAN rematch2 2.1.2 + getAndInstallFromCRAN miniUI 0.1.1.1 + getAndInstallFromCRAN shinyjs 2.0.0 + getAndInstallFromCRAN usethis 2.0.1 + getAndInstallFromCRAN tidyr 1.1.4 + getAndInstallFromCRAN viridis 0.6.2 + getAndInstallFromCRAN pkgdown 1.5.1 + getAndInstallFromCRAN waldo 0.3.1 + getAndInstallFromCRAN manipulateWidget 0.11.1 + getAndInstallFromCRAN plotly 4.10.0 + getAndInstallFromCRAN Hmisc 4.3-0 + getAndInstallFromCRAN testthat 3.1.0 + getAndInstallFromCRAN rgl 0.100.54 + getAndInstallFromCRAN devtools 2.4.2 + getAndInstallFromCRAN plot3Drgl 1.0.2 +fi # dependecies from GitHub R_REMOTES_UPGRADE=never /usr/local/R/current/bin/R -e "chooseCRANmirror(graphics=FALSE, ind=1); library(devtools); \ diff --git a/gradle/s3sync.gradle b/gradle/s3sync.gradle index e69b87c2aa26..360263c445c7 100644 --- a/gradle/s3sync.gradle +++ b/gradle/s3sync.gradle @@ -103,9 +103,11 @@ def syncData(subdir) { println("Going to download ${downloadList.size} files...") downloadList.each { - def fpath = 'https://h2o-public-test-data.s3.amazonaws.com/'+it.Key + def key = it.Key.toString().replaceAll(" ", "+") // handle filenames with spaces + def fpath = 'https://h2o-public-test-data.s3.amazonaws.com/'+key def localDestPath = localDestDir + it.Key.text().substring(trimLength).replaceAll("/", Matcher.quoteReplacement(File.separator)) - try { + try { + println "Downloading " + fpath download { src fpath dest localDestPath diff --git a/h2o-algos/src/main/java/hex/api/Word2VecHandler.java b/h2o-algos/src/main/java/hex/api/Word2VecHandler.java index bc559e1159fb..37e563e2e9e1 100644 --- a/h2o-algos/src/main/java/hex/api/Word2VecHandler.java +++ b/h2o-algos/src/main/java/hex/api/Word2VecHandler.java @@ -7,6 +7,7 @@ import water.api.Handler; import water.api.schemas3.KeyV3; import water.fvec.Frame; +import water.util.Log; import java.util.*; @@ -28,11 +29,16 @@ public int compare(Map.Entry o1, Map.Entry o2) { }); args.synonyms = new String[result.size()]; args.scores = new double[result.size()]; - int i = 0; - for (Map.Entry entry : result) { - args.synonyms[i] = entry.getKey(); - args.scores[i] = entry.getValue(); - i++; + if(result.size() > 0) { + int i = 0; + for (Map.Entry entry : result) { + args.synonyms[i] = entry.getKey(); + args.scores[i] = entry.getValue(); + i++; + } + } + if (result.size() < args.count) { + Log.warn(String.format("The result number of synonyms (%d) is less than the 'count' parameter (%d).", args.synonyms.length, args.count)); } return args; } diff --git a/h2o-algos/src/main/java/hex/glm/ComputationState.java b/h2o-algos/src/main/java/hex/glm/ComputationState.java index 23c343202e0d..1aaaf697e3bc 100644 --- a/h2o-algos/src/main/java/hex/glm/ComputationState.java +++ b/h2o-algos/src/main/java/hex/glm/ComputationState.java @@ -91,7 +91,6 @@ public final class ComputationState { int _totalBetaLength; // actual 
coefficient length without taking into account active columns only int _betaLengthPerClass; public boolean _noReg; - public boolean _hasConstraints; public ConstrainedGLMUtils.ConstraintGLMStates _csGLMState; public ComputationState(Job job, GLMParameters parms, DataInfo dinfo, BetaConstraint bc, GLM.BetaInfo bi){ @@ -1414,7 +1413,7 @@ protected GramXY computeNewGram(DataInfo activeData, double [] beta, GLMParamete double obj_reg = _parms._obj_reg; if(_glmw == null) _glmw = new GLMModel.GLMWeightsFun(_parms); GLMTask.GLMIterationTask gt = new GLMTask.GLMIterationTask(_job._key, activeData, _glmw, beta, - _activeClass, _hasConstraints).doAll(activeData._adaptedFrame); + _activeClass).doAll(activeData._adaptedFrame); gt._gram.mul(obj_reg); if (_parms._glmType.equals(GLMParameters.GLMType.gam)) { // add contribution from GAM smoothness factor Integer[] activeCols=null; @@ -1463,7 +1462,7 @@ protected GramGrad computeGram(double [] beta, GLMGradientInfo gradientInfo){ double obj_reg = _parms._obj_reg; if(_glmw == null) _glmw = new GLMModel.GLMWeightsFun(_parms); GLMTask.GLMIterationTask gt = new GLMTask.GLMIterationTask(_job._key, activeData, _glmw, beta, - _activeClass, _hasConstraints).doAll(activeData._adaptedFrame); + _activeClass).doAll(activeData._adaptedFrame); double[][] fullGram = gt._gram.getXX(); // only extract gram matrix mult(fullGram, obj_reg); if (_gramEqual != null) diff --git a/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java b/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java index ac198226e727..476ba8392f92 100644 --- a/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java +++ b/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java @@ -84,7 +84,7 @@ public static class ConstraintGLMStates { double _ckCS; double _ckCSHalf; // = ck/2 double _epsilonkCS; - double _epsilonkCSSquare; + public double _epsilonkCSSquare; double _etakCS; double _etakCSSquare; double _epsilon0; @@ -137,30 +137,35 @@ public static int[] extractBetaConstraints(ComputationState state, String[] coef List equalityC = new ArrayList<>(); List lessThanEqualToC = new ArrayList<>(); List betaIndexOnOff = new ArrayList<>(); - if (betaC._betaLB != null) { - int numCons = betaC._betaLB.length-1; - for (int index=0; indexx).toArray(); + return betaIndexOnOff.size() == 0 ? 
null : betaIndexOnOff.stream().mapToInt(x -> x).toArray(); } /*** @@ -506,11 +511,10 @@ public static boolean extractCoeffNames(List coeffList, LinearConstraint public static List foundRedundantConstraints(ComputationState state, final double[][] initConstraintMatrix) { Matrix constMatrix = new Matrix(initConstraintMatrix); - Matrix constMatrixLessConstant = constMatrix.getMatrix(0, constMatrix.getRowDimension() -1, 1, constMatrix.getColumnDimension()-1); - Matrix constMatrixTConstMatrix = constMatrixLessConstant.times(constMatrixLessConstant.transpose()); - int rank = constMatrixLessConstant.rank(); + Matrix matrixSquare = constMatrix.times(constMatrix.transpose()); + int rank = matrixSquare.rank(); if (rank < constMatrix.getRowDimension()) { // redundant constraints are specified - double[][] rMatVal = constMatrixTConstMatrix.qr().getR().getArray(); + double[][] rMatVal = matrixSquare.qr().getR().getArray(); List diag = IntStream.range(0, rMatVal.length).mapToDouble(x->Math.abs(rMatVal[x][x])).boxed().collect(Collectors.toList()); int[] sortedIndices = IntStream.range(0, diag.size()).boxed().sorted((i, j) -> diag.get(i).compareTo(diag.get(j))).mapToInt(ele->ele).toArray(); List duplicatedEleIndice = IntStream.range(0, diag.size()-rank).map(x -> sortedIndices[x]).boxed().collect(Collectors.toList()); @@ -645,6 +649,16 @@ public static void genInitialLambda(Random randObj, LinearConstraints[] constrai } } + public static void adjustLambda(LinearConstraints[] constraints, double[] lambda) { + int numC = constraints.length; + LinearConstraints oneC; + for (int index=0; index coefNam if (equalityConstraints != null) // equality constraints Arrays.stream(equalityConstraints).forEach(constraint -> { evalOneConstraint(constraint, betaCnd, coefNames); - constraint._active = (Math.abs(constraint._constraintsVal) > EPS2); + // constraint._active = (Math.abs(constraint._constraintsVal) > EPS2); + constraint._active = true; }); if (lessThanEqualToConstraints != null) // less than or equal to constraints Arrays.stream(lessThanEqualToConstraints).forEach(constraint -> { evalOneConstraint(constraint, betaCnd, coefNames); - constraint._active = constraint._constraintsVal > 0; + constraint._active = constraint._constraintsVal >= 0; }); } diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java index 8e13acb10215..29b51f5db4ff 100644 --- a/h2o-algos/src/main/java/hex/glm/GLM.java +++ b/h2o-algos/src/main/java/hex/glm/GLM.java @@ -1403,7 +1403,8 @@ void checkInitLinearConstraints() { } // no regularization for constrainted GLM except during testing if ((notZeroLambdas(_parms._lambda) || _parms._lambda_search) && !_parms._testCSZeroGram) { - error("lambda or lambda_search", "Regularization is not allowed for constrained GLM."); + error("lambda or lambda_search", "Regularization is not allowed for constrained GLM. Set" + + " lambda to 0.0."); return; } if ("multinomial".equals(_parms._solver) || "ordinal".equals(_parms._solver)) { @@ -1439,16 +1440,14 @@ void checkAssignLinearConstraints() { String[] constraintCoefficientNames = constraintNames.toArray(new String[0]); if (countNumConst(_state) > coefNames.length) warn("number of constraints", " exceeds the number of coefficients. The system is" + - " over-constraints, and probably may not yield a valid solution due to possible conflicting " + - "constraints. Consider reducing the number of constraints."); + " over-constraints with duplicated constraints. 
Consider reducing the number of constraints."); List redundantConstraints = foundRedundantConstraints(_state, initConstraintMatrix); if (redundantConstraints != null) { int numRedundant = redundantConstraints.size(); for (int index = 0; index < numRedundant; index++) - error("redundant and possibly conflicting linear constraints", redundantConstraints.get(index)); + error("redundant linear constraints", redundantConstraints.get(index)); } else { _state._csGLMState = new ConstraintGLMStates(constraintCoefficientNames, initConstraintMatrix, _parms); - _state._hasConstraints = true; } } @@ -2352,9 +2351,9 @@ private void fitIRLSM(Solver s) { * This method fits the constraint GLM for IRLSM. We implemented the algorithm depicted in the document (H2O * Constrained GLM Implementation.pdf) attached to this issue: https://github.com/h2oai/h2o-3/issues/6722. We will * hereby use the word the doc to refere to this document. In particular, we following the algorithm described in - * Section VII (and table titled Algorithm 19.1) of the doc. + * Section VII (and table titled Algorithm 19.1) of the doc. Not as good as when considering magnitude of gradient. */ - private void fitIRLSMCS() { + private void fitIRLSMCS9() { double[] betaCnd = _checkPointFirstIter ? _model._betaCndCheckpoint : _state.beta(); double[] tempBeta = _parms._separate_linear_beta ? new double[betaCnd.length] : null; List coefNames = Arrays.stream(_state.activeData()._coefNames).collect(Collectors.toList()); @@ -2389,7 +2388,153 @@ private void fitIRLSMCS() { _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, _dinfo, 0, _state.activeBC(), _betaInfo); GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, - equalityConstraints, lessThanEqualToConstraints); + equalityConstraints, lessThanEqualToConstraints); // add dpenalty/dx to gradient from penalty term + _state.setConstraintInfo(gradientInfo, equalityConstraints, lessThanEqualToConstraints, lambdaEqual, lambdaLessThan); // update state ginfo with contributions from GLMGradientInfo + boolean predictorSizeChange; + boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn; + // short circuit check here: if gradient magnitude is small and all constraints are satisfied, quit right away + if (constraintsStop(gradientInfo, _state)) { + Log.info(LogMsg("GLM with constraints model building completed successfully!!")); + return; + } + double gradMagSquare = ArrayUtils.innerProduct(gradientInfo._gradient, gradientInfo._gradient); + boolean done; + boolean gradSmallEnough = (gradMagSquare <= _state._csGLMState._epsilonkCSSquare); + int origIter = iterCnt+1; + boolean lineSearchSuccess; + try { + while (true) { + do { // implement Algorithm 11.8 of the doc to find coefficients with epsilon k as the precision + iterCnt++; + long t1 = System.currentTimeMillis(); + ComputationState.GramGrad gram = _state.computeGram(betaCnd, gradientInfo); // calculate gram (hessian), xy, objective values + if (iterCnt == origIter) { + Matrix gramMatrix = new Matrix(gram._gram); + if (gramMatrix.cond() >= BAD_CONDITION_NUMBER) + if (_parms._init_optimal_glm) { + warn("init_optimal_glm", " should be disabled. This lead to gram matrix being close to" + + " singular. 
Please re-run with init_optimal_glm set to false."); + } + } + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[coefNames.size()]; + } + // solve for GLM coefficients + betaCnd = constraintGLM_solve(gram); // beta_k+1 = beta_k+dk where dk = beta_k+1-beta_k + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[betaCnd.length]; + } + // add exact line search for GLM coefficients. Refer to the doc, Algorithm 11.5 + if (ls == null) + ls = new ExactLineSearch(betaCnd, _state, coefNames); + else + ls.reset(betaCnd, _state, coefNames); + + // line search can fail when the gradient is close to zero. In this case, we need to update the + // constraint parameters. + lineSearchSuccess = ls.findAlpha(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo); + gradMagSquare = ArrayUtils.innerProduct(ls._ginfoOriginal._gradient, ls._ginfoOriginal._gradient); + gradSmallEnough = gradMagSquare <= _state._csGLMState._epsilonkCSSquare; + if (lineSearchSuccess) { + betaCnd = ls._newBeta; + gradientInfo = ls._ginfoOriginal; + } else { // ls failed, reset to + if (applyBetaConstraints) // separate beta and linear constraints + bc.applyAllBounds(_state.beta()); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, + ginfo, _state.beta()); + Log.info(LogMsg("Line search failed " + ls)); + return; + } + + if (applyBetaConstraints) { // if beta constraints are applied, may need to update constraints, derivatives, gradientInfo + System.arraycopy(betaCnd, 0, tempBeta, 0, betaCnd.length); + bc.applyAllBounds(betaCnd); + ArrayUtils.subtract(betaCnd, tempBeta, tempBeta); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo, betaCnd); + gradientInfo = ls._ginfoOriginal; + } + + // check for stopping conditions which also updates the variables in state. + // stopping condition is to stop us getting stuck in improvements that are too insignificant. + // However, we will only exit the while loop when the gradMagSquare is still too high. There is no hope + // for improvement here anymore since the beta values and gradient values are not changing much anymore. 
+ done = stop_requested() || (_state._iter >= _parms._max_iterations) || _earlyStop; // time to go + if (!progress(betaCnd, gradientInfo)) { + checkKKTConditions(betaCnd, gradientInfo, iterCnt); + return; + } + + Log.info(LogMsg("computed in " + (System.currentTimeMillis() - t1) + "ms, step = " + iterCnt + + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); + } while (!gradSmallEnough); + // update constraint parameters, ck, lambdas and others + updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, + lessThanEqualToConstraints, _parms); + // update gradient calculation with new value (lambda and/or ck). + gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); + _state.updateState(betaCnd, gradientInfo); // update computation state with new info + } + } catch (NonSPDMatrixException e) { + Log.warn(LogMsg("Got Non SPD matrix, stopped.")); + } + } + + // original algo, set lambda = 0 for inactive constraints, no good effect. + private void fitIRLSMCS8() { + double[] betaCnd = _checkPointFirstIter ? _model._betaCndCheckpoint : _state.beta(); + double[] tempBeta = _parms._separate_linear_beta ? new double[betaCnd.length] : null; + List coefNames = Arrays.stream(_state.activeData()._coefNames).collect(Collectors.toList()); + LinearConstraints[] equalityConstraints; + LinearConstraints[] lessThanEqualToConstraints; + final BetaConstraint bc = _state.activeBC(); + if (_parms._separate_linear_beta) { // keeping linear and beta constraints separate in this case + equalityConstraints = _state._equalityConstraintsLinear; + lessThanEqualToConstraints = _state._lessThanEqualToConstraintsLinear; + } else { // combine beta and linear constraints together + equalityConstraints = combineConstraints(_state._equalityConstraintsBeta, _state._equalityConstraintsLinear); + lessThanEqualToConstraints = combineConstraints(_state._lessThanEqualToConstraintsBeta, + _state._lessThanEqualToConstraintsLinear); + } + boolean hasEqualityConstraints = equalityConstraints != null; + boolean hasLessConstraints = lessThanEqualToConstraints != null; + double[] lambdaEqual = hasEqualityConstraints ? new double[equalityConstraints.length] : null; + double[] lambdaLessThan = hasLessConstraints ? new double[lessThanEqualToConstraints.length] : null; + Long startSeed = _parms._seed == -1 ? new Random().nextLong() : _parms._seed; + Random randObj = new Random(startSeed); + updateConstraintValues(betaCnd, coefNames, equalityConstraints, lessThanEqualToConstraints); + if (hasEqualityConstraints) // set lambda values for constraints + genInitialLambda(randObj, equalityConstraints, lambdaEqual); + if (hasLessConstraints) { + genInitialLambda(randObj, lessThanEqualToConstraints, lambdaLessThan); + adjustLambda(lessThanEqualToConstraints, lambdaLessThan); + } + + ExactLineSearch ls = null; + int iterCnt = (_checkPointFirstIter ? _state._iter : 0)+_initIter; + // contribution to gradient and hessian from constraints + _state.initConstraintDerivatives(equalityConstraints, lessThanEqualToConstraints, coefNames); + + GLMGradientSolver ginfo = gam.equals(_parms._glmType) ? 
new GLMGradientSolver(_job, _parms, _dinfo, 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _dinfo, 0, _state.activeBC(), _betaInfo); + GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); // add dpenalty/dx to gradient from penalty term _state.setConstraintInfo(gradientInfo, equalityConstraints, lessThanEqualToConstraints, lambdaEqual, lambdaLessThan); // update state ginfo with contributions from GLMGradientInfo boolean predictorSizeChange; boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn; @@ -2398,8 +2543,11 @@ private void fitIRLSMCS() { Log.info(LogMsg("GLM with constraints model building completed successfully!!")); return; } - double gradMagSquare; + double gradMagSquare = ArrayUtils.innerProduct(gradientInfo._gradient, gradientInfo._gradient); + boolean done; + boolean gradSmallEnough = (gradMagSquare <= _state._csGLMState._epsilonkCSSquare); int origIter = iterCnt+1; + boolean lineSearchSuccess; try { while (true) { do { // implement Algorithm 11.8 of the doc to find coefficients with epsilon k as the precision @@ -2440,11 +2588,16 @@ private void fitIRLSMCS() { else ls.reset(betaCnd, _state, coefNames); - if (ls.findAlpha(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, ginfo)) { - gradMagSquare = ArrayUtils.innerProduct(gradientInfo._gradient, gradientInfo._gradient); + // line search can fail when the gradient is close to zero. In this case, we need to update the + // constraint parameters. + lineSearchSuccess = ls.findAlpha(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo); + gradMagSquare = ArrayUtils.innerProduct(ls._ginfoOriginal._gradient, ls._ginfoOriginal._gradient); + gradSmallEnough = gradMagSquare <= _state._csGLMState._epsilonkCSSquare; + if (lineSearchSuccess) { betaCnd = ls._newBeta; gradientInfo = ls._ginfoOriginal; - } else { // ls failed, reset to + } else { // ls failed, reset to if (applyBetaConstraints) // separate beta and linear constraints bc.applyAllBounds(_state.beta()); ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, @@ -2457,19 +2610,329 @@ private void fitIRLSMCS() { System.arraycopy(betaCnd, 0, tempBeta, 0, betaCnd.length); bc.applyAllBounds(betaCnd); ArrayUtils.subtract(betaCnd, tempBeta, tempBeta); - ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, ginfo, betaCnd); gradientInfo = ls._ginfoOriginal; } - - // check for stopping conditions - if (checkIterationDone(betaCnd, gradientInfo, iterCnt)) // ratio of objective drops. + + // check for stopping conditions which also updates the variables in state. + // stopping condition is to stop us getting stuck in improvements that are too insignificant. + // However, we will only exit the while loop when the gradMagSquare is still too high. There is no hope + // for improvement here anymore since the beta values and gradient values are not changing much anymore. 
+ done = stop_requested() || (_state._iter >= _parms._max_iterations) || _earlyStop; // time to go + if (!progress(betaCnd, gradientInfo)) { + checkKKTConditions(betaCnd, gradientInfo, iterCnt); return; - Log.info(LogMsg("computed in " + (System.currentTimeMillis() - t1) + "ms, step = " + iterCnt + + } + + Log.info(LogMsg("computed in " + (System.currentTimeMillis() - t1) + "ms, step = " + iterCnt + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); - } while (gradMagSquare > _state._csGLMState._epsilonkCSSquare); + } while (!gradSmallEnough); // update constraint parameters, ck, lambdas and others - updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, lessThanEqualToConstraints, _parms); + updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, + lessThanEqualToConstraints, _parms); + if (hasLessConstraints) + adjustLambda(lessThanEqualToConstraints, lambdaLessThan); + // update gradient calculation with new value (lambda and/or ck). + gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); + _state.updateState(betaCnd, gradientInfo); // update computation state with new info + } + } catch (NonSPDMatrixException e) { + Log.warn(LogMsg("Got Non SPD matrix, stopped.")); + } + } + + // original implementation but will not quit when magnitude of gradient is small. If exit condition is triggered + // (either ls failed or no progress is made, if the magnitude of gradient is small, we will exit thw while loop + // but will arrive at the part to change the constrained parameters. This seems to help. + private void fitIRLSMCS() { + double[] betaCnd = _checkPointFirstIter ? _model._betaCndCheckpoint : _state.beta(); + double[] tempBeta = _parms._separate_linear_beta ? new double[betaCnd.length] : null; + List coefNames = Arrays.stream(_state.activeData()._coefNames).collect(Collectors.toList()); + LinearConstraints[] equalityConstraints; + LinearConstraints[] lessThanEqualToConstraints; + final BetaConstraint bc = _state.activeBC(); + if (_parms._separate_linear_beta) { // keeping linear and beta constraints separate in this case + equalityConstraints = _state._equalityConstraintsLinear; + lessThanEqualToConstraints = _state._lessThanEqualToConstraintsLinear; + } else { // combine beta and linear constraints together + equalityConstraints = combineConstraints(_state._equalityConstraintsBeta, _state._equalityConstraintsLinear); + lessThanEqualToConstraints = combineConstraints(_state._lessThanEqualToConstraintsBeta, + _state._lessThanEqualToConstraintsLinear); + } + boolean hasEqualityConstraints = equalityConstraints != null; + boolean hasLessConstraints = lessThanEqualToConstraints != null; + double[] lambdaEqual = hasEqualityConstraints ? new double[equalityConstraints.length] : null; + double[] lambdaLessThan = hasLessConstraints ? new double[lessThanEqualToConstraints.length] : null; + Long startSeed = _parms._seed == -1 ? new Random().nextLong() : _parms._seed; + Random randObj = new Random(startSeed); + updateConstraintValues(betaCnd, coefNames, equalityConstraints, lessThanEqualToConstraints); + if (hasEqualityConstraints) // set lambda values for constraints + genInitialLambda(randObj, equalityConstraints, lambdaEqual); + if (hasLessConstraints) + genInitialLambda(randObj, lessThanEqualToConstraints, lambdaLessThan); + ExactLineSearch ls = null; + int iterCnt = (_checkPointFirstIter ? 
_state._iter : 0)+_initIter; + // contribution to gradient and hessian from constraints + _state.initConstraintDerivatives(equalityConstraints, lessThanEqualToConstraints, coefNames); + + GLMGradientSolver ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _dinfo, 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _dinfo, 0, _state.activeBC(), _betaInfo); + GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); // add dpenalty/dx to gradient from penalty term + _state.setConstraintInfo(gradientInfo, equalityConstraints, lessThanEqualToConstraints, lambdaEqual, lambdaLessThan); // update state ginfo with contributions from GLMGradientInfo + boolean predictorSizeChange; + boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn; + // short circuit check here: if gradient magnitude is small and all constraints are satisfied, quit right away + if (constraintsStop(gradientInfo, _state)) { + Log.info(LogMsg("GLM with constraints model building completed successfully!!")); + return; + } + double gradMagSquare = ArrayUtils.innerProduct(gradientInfo._gradient, gradientInfo._gradient); + boolean done; + boolean gradSmallEnough = (gradMagSquare <= _state._csGLMState._epsilonkCSSquare); + int origIter = iterCnt+1; + boolean lineSearchSuccess; + try { + while (true) { + do { // implement Algorithm 11.8 of the doc to find coefficients with epsilon k as the precision + iterCnt++; + long t1 = System.currentTimeMillis(); + ComputationState.GramGrad gram = _state.computeGram(betaCnd, gradientInfo); // calculate gram (hessian), xy, objective values + if (iterCnt == origIter) { + Matrix gramMatrix = new Matrix(gram._gram); + if (gramMatrix.cond() >= BAD_CONDITION_NUMBER) + if (_parms._init_optimal_glm) { + warn("init_optimal_glm", " should be disabled. It leads to the gram matrix being close to" + + " singular. Please re-run with init_optimal_glm set to false."); + } + } + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[coefNames.size()]; + } + // solve for GLM coefficients + betaCnd = constraintGLM_solve(gram); // beta_k+1 = beta_k+dk where dk = beta_k+1-beta_k + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[betaCnd.length]; + } + // add exact line search for GLM coefficients.
Refer to the doc, Algorithm 11.5 + if (ls == null) + ls = new ExactLineSearch(betaCnd, _state, coefNames); + else + ls.reset(betaCnd, _state, coefNames); + + // line search can fail when the gradient is close to zero. In this case, we need to update the + // constraint parameters. + lineSearchSuccess = ls.findAlpha(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo); + gradMagSquare = ArrayUtils.innerProduct(ls._ginfoOriginal._gradient, ls._ginfoOriginal._gradient); + gradSmallEnough = gradMagSquare <= _state._csGLMState._epsilonkCSSquare; + if (lineSearchSuccess || gradSmallEnough) { + betaCnd = ls._newBeta; + gradientInfo = ls._ginfoOriginal; + } else { // ls failed, reset to the last accepted coefficients and quit + if (_betaConstraintsOn) // separate beta and linear constraints + bc.applyAllBounds(_state.beta()); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, + ginfo, _state.beta()); + Log.info(LogMsg("Line search failed " + ls)); + return; + } + + if (applyBetaConstraints) { // if beta constraints are applied separately, may need to update constraints, derivatives, gradientInfo + System.arraycopy(betaCnd, 0, tempBeta, 0, betaCnd.length); + bc.applyAllBounds(betaCnd); + ArrayUtils.subtract(betaCnd, tempBeta, tempBeta); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo, betaCnd); + gradientInfo = ls._ginfoOriginal; + } + + // check for stopping conditions, which also updates the variables in state. + // the stopping condition keeps us from getting stuck on improvements that are too insignificant. + // Note that we may exit even when gradMagSquare is still too high: there is no hope of further + // improvement once the beta values and gradient values have stopped changing. + done = stop_requested() || (_state._iter >= _parms._max_iterations) || _earlyStop; // time to go + if ((!progress(betaCnd, gradientInfo) && !gradSmallEnough) || done) { + checkKKTConditions(betaCnd, gradientInfo, iterCnt); + if (_betaConstraintsOn) + bc.applyAllBounds(_state.beta()); + return; + } + + Log.info(LogMsg("computed in " + (System.currentTimeMillis() - t1) + "ms, step = " + iterCnt + + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); + } while (!gradSmallEnough); + // update constraint parameters, ck, lambdas and others + updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, + lessThanEqualToConstraints, _parms); + // update gradient calculation with new value (lambda and/or ck). + gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); + _state.updateState(betaCnd, gradientInfo); // update computation state with new info + } + } catch (NonSPDMatrixException e) { + Log.warn(LogMsg("Got Non SPD matrix, stopped.")); + } + } + + // original algorithm: uses only the penalty term and no constraint multipliers + private void fitIRLSMCS2() { + double[] betaCnd = _checkPointFirstIter ? _model._betaCndCheckpoint : _state.beta(); + double[] tempBeta = _parms._separate_linear_beta ?
new double[betaCnd.length] : null; + List coefNames = Arrays.stream(_state.activeData()._coefNames).collect(Collectors.toList()); + LinearConstraints[] equalityConstraints; + LinearConstraints[] lessThanEqualToConstraints; + final BetaConstraint bc = _state.activeBC(); + if (_parms._separate_linear_beta) { // keeping linear and beta constraints separate in this case + equalityConstraints = _state._equalityConstraintsLinear; + lessThanEqualToConstraints = _state._lessThanEqualToConstraintsLinear; + } else { // combine beta and linear constraints together + equalityConstraints = combineConstraints(_state._equalityConstraintsBeta, _state._equalityConstraintsLinear); + lessThanEqualToConstraints = combineConstraints(_state._lessThanEqualToConstraintsBeta, + _state._lessThanEqualToConstraintsLinear); + } + boolean hasEqualityConstraints = equalityConstraints != null; + boolean hasLessConstraints = lessThanEqualToConstraints != null; + double[] lambdaEqual = hasEqualityConstraints ? new double[equalityConstraints.length] : null; + double[] lambdaLessThan = hasLessConstraints ? new double[lessThanEqualToConstraints.length] : null; + Long startSeed = _parms._seed == -1 ? new Random().nextLong() : _parms._seed; + Random randObj = new Random(startSeed); + updateConstraintValues(betaCnd, coefNames, equalityConstraints, lessThanEqualToConstraints); +/* + if (hasEqualityConstraints) // set lambda values for constraints + genInitialLambda(randObj, equalityConstraints, lambdaEqual); + if (hasLessConstraints) + genInitialLambda(randObj, lessThanEqualToConstraints, lambdaLessThan); +*/ + ExactLineSearch ls = null; + int iterCnt = (_checkPointFirstIter ? _state._iter : 0)+_initIter; + // contribution to gradient and hessian from constraints + _state.initConstraintDerivatives(equalityConstraints, lessThanEqualToConstraints, coefNames); + + GLMGradientSolver ginfo = gam.equals(_parms._glmType) ? 
new GLMGradientSolver(_job, _parms, _dinfo, 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _dinfo, 0, _state.activeBC(), _betaInfo); + GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); // add dpenalty/dx to gradient from penalty term + _state.setConstraintInfo(gradientInfo, equalityConstraints, lessThanEqualToConstraints, lambdaEqual, lambdaLessThan); // update state ginfo with contributions from GLMGradientInfo + boolean predictorSizeChange; + boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn; + // short circuit check here: if gradient magnitude is small and all constraints are satisfied, quit right away + if (constraintsStop(gradientInfo, _state)) { + Log.info(LogMsg("GLM with constraints model building completed successfully!!")); + return; + } + double gradMagSquare = ArrayUtils.innerProduct(gradientInfo._gradient, gradientInfo._gradient); + boolean done; + boolean gradSmallEnough = (gradMagSquare <= _state._csGLMState._epsilonkCSSquare); + int origIter = iterCnt+1; + boolean lineSearchSuccess; + try { + while (true) { + do { // implement Algorithm 11.8 of the doc to find coefficients with epsilon k as the precision + iterCnt++; + long t1 = System.currentTimeMillis(); + ComputationState.GramGrad gram = _state.computeGram(betaCnd, gradientInfo); // calculate gram (hessian), xy, objective values + if (iterCnt == origIter) { + Matrix gramMatrix = new Matrix(gram._gram); + if (gramMatrix.cond() >= BAD_CONDITION_NUMBER) + if (_parms._init_optimal_glm) { + warn("init_optimal_glm", " should be disabled. It leads to the gram matrix being close to" + + " singular. Please re-run with init_optimal_glm set to false."); + } + } + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[coefNames.size()]; + } + // solve for GLM coefficients + betaCnd = constraintGLM_solve(gram); // beta_k+1 = beta_k+dk where dk = beta_k+1-beta_k + predictorSizeChange = !coefNames.equals(Arrays.asList(_state.activeData().coefNames())); + if (predictorSizeChange) { // reset if predictors changed + coefNames = changeCoeffBetainfo(_state.activeData()._coefNames); + _state.resizeConstraintInfo(equalityConstraints, lessThanEqualToConstraints); + ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _state.activeData(), 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices) : new GLMGradientSolver(_job, _parms, + _state.activeData(), 0, _state.activeBC(), _betaInfo); + tempBeta = new double[betaCnd.length]; + } + // add exact line search for GLM coefficients. Refer to the doc, Algorithm 11.5 + if (ls == null) + ls = new ExactLineSearch(betaCnd, _state, coefNames); + else + ls.reset(betaCnd, _state, coefNames); + + // line search can fail when the gradient is close to zero. In this case, we need to update the + // constraint parameters.
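+ // findAlpha performs the exact line search of Algorithm 11.5 along the IRLSM direction: it adjusts the + // step size alpha_i until both Wolfe conditions (sufficient objective decrease and curvature) hold, and + // it reports failure once alpha_i can no longer be updated or alpha_r becomes too small.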
+ lineSearchSuccess = ls.findAlpha(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo); + if (lineSearchSuccess) { + betaCnd = ls._newBeta; + gradientInfo = ls._ginfoOriginal; + gradMagSquare = ArrayUtils.innerProduct(ls._ginfoOriginal._gradient, ls._ginfoOriginal._gradient); + gradSmallEnough = gradMagSquare <= _state._csGLMState._epsilonkCSSquare; + } else { // ls failed, reset to the last accepted coefficients and quit + if (applyBetaConstraints) // separate beta and linear constraints + bc.applyAllBounds(_state.beta()); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, lessThanEqualToConstraints, + ginfo, _state.beta()); + Log.info(LogMsg("Line search failed " + ls)); + return; + } + + if (applyBetaConstraints) { // if beta constraints are applied, may need to update constraints, derivatives, gradientInfo + System.arraycopy(betaCnd, 0, tempBeta, 0, betaCnd.length); + bc.applyAllBounds(betaCnd); + ArrayUtils.subtract(betaCnd, tempBeta, tempBeta); + ls.setBetaConstraintsDeriv(lambdaEqual, lambdaLessThan, _state, equalityConstraints, + lessThanEqualToConstraints, ginfo, betaCnd); + gradientInfo = ls._ginfoOriginal; + } + + // check for stopping conditions, which also updates the variables in state. + // the stopping condition keeps us from getting stuck on improvements that are too insignificant. + // Note that we may exit even when gradMagSquare is still too high: there is no hope of further + // improvement once the beta values and gradient values have stopped changing. + done = stop_requested() || (_state._iter >= _parms._max_iterations) || _earlyStop; // time to go + if (!progress(betaCnd, gradientInfo)) { + checkKKTConditions(betaCnd, gradientInfo, iterCnt); + return; + } + + Log.info(LogMsg("computed in " + (System.currentTimeMillis() - t1) + "ms, step = " + iterCnt + + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); + } while (!gradSmallEnough); + // update constraint parameters, ck, lambdas and others + updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, + lessThanEqualToConstraints, _parms); + // update gradient calculation with new value (lambda and/or ck). + // set lambda to all zeros + lambdaEqual = hasEqualityConstraints ? new double[lambdaEqual.length] : null; + lambdaLessThan = hasLessConstraints ? new double[lambdaLessThan.length] : null; + + gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan, + equalityConstraints, lessThanEqualToConstraints); + _state.updateState(betaCnd, gradientInfo); // update computation state with new info } } catch (NonSPDMatrixException e) { Log.warn(LogMsg("Got Non SPD matrix, stopped.")); @@ -2477,25 +2940,17 @@ private void fitIRLSMCS() { } /*** - * This method will first check if enough progress has been made with progress method. - * If no more progress is made, we will check it the constraint stopping conditions are met. - * The model building process will stop if no more progress is made regardless of whether the constraint stopping - * conditions are met or not. + * We check whether the constraint stopping conditions (the KKT conditions) are met.
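+ * Here, constraintsStop() decides whether the KKT conditions hold: the gradient magnitude must be small + * and all equality and less-than-equal-to constraints must be satisfied. The outcome is only logged, since + * no further progress can be made at this point; the current coefficients are stored as a checkpoint.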
*/ - public boolean checkIterationDone(double[] betaCnd, GLMGradientInfo gradientInfo, int iterCnt) { + public void checkKKTConditions(double[] betaCnd, GLMGradientInfo gradientInfo, int iterCnt) { // check for stopping conditions - boolean done = !progress(betaCnd, gradientInfo); // no good change in coeff, time-out or max_iteration reached - if (done) { - _model._betaCndCheckpoint = betaCnd; - boolean kktAchieved = constraintsStop(gradientInfo, _state); - if (kktAchieved) - Log.info("KKT Conditions achieved after " + iterCnt + " iterations "); - else - Log.warn("KKT Conditions not achieved but no further progress made due to time out or no changes" + - " to coefficients after " + iterCnt + " iterations"); - return true; - } - return false; + _model._betaCndCheckpoint = betaCnd; + boolean kktAchieved = constraintsStop(gradientInfo, _state); + if (kktAchieved) + Log.info("KKT Conditions achieved after " + iterCnt + " iterations "); + else + Log.warn("KKT Conditions not achieved but no further progress made due to time out or no changes" + + " to coefficients after " + iterCnt + " iterations"); } public List changeCoeffBetainfo(String[] coefNames) { @@ -4151,14 +4606,27 @@ private void checkCoeffsBounds() { BetaConstraint bc = _parms._beta_constraints != null ? new BetaConstraint(_parms._beta_constraints.get()) : new BetaConstraint(); // bounds for columns _dinfo.fullN()+1 only double[] coeffs = _parms._standardize ? _model._output.getNormBeta() :_model._output.beta(); - if (bc._betaLB == null || bc._betaUB == null || coeffs == null) + if (coeffs == null) + return; + if (bc._betaLB == null && bc._betaUB == null) return; - int coeffsLen = bc._betaLB.length; + int coeffsLen = bc._betaLB != null ? bc._betaLB.length : bc._betaUB.length; StringBuffer errorMessage = new StringBuffer(); + boolean lowerBoundNull = bc._betaLB == null; + boolean upperBoundNull = bc._betaUB == null; for (int index=0; index < coeffsLen; index++) { - if (!(coeffs[index] == 0 || (coeffs[index] >= bc._betaLB[index] && coeffs[index] <= bc._betaUB[index]))) - errorMessage.append("GLM model coefficient " + coeffs[index]+" exceeds beta constraint bounds. Lower: " - +bc._betaLB[index]+", upper: "+bc._betaUB[index]+"\n"); + if (coeffs[index] != 0) { + if (lowerBoundNull && !Double.isInfinite(bc._betaUB[index]) && (coeffs[index] > bc._betaUB[index])) { + errorMessage.append("GLM model coefficient " + coeffs[index]+" exceeds beta constraint upper bounds: " + + "upper: "+bc._betaUB[index]+"\n"); + } else if (upperBoundNull && !Double.isInfinite(bc._betaLB[index]) && (coeffs[index] < bc._betaLB[index])) { + errorMessage.append("GLM model coefficient " + coeffs[index]+" falls below beta constraint lower bounds: " + + "lower: "+bc._betaLB[index]+"\n"); + } else if (!lowerBoundNull && !upperBoundNull && (coeffs[index] < bc._betaLB[index] || coeffs[index] > bc._betaUB[index])) { + errorMessage.append("GLM model coefficient " + coeffs[index]+" exceeds beta constraint bounds.
Lower: " + +bc._betaLB[index]+", upper: "+bc._betaUB[index]+"\n"); + } + } } if (errorMessage.length() > 0) throw new H2OFailException("\n"+errorMessage.toString()); diff --git a/h2o-algos/src/main/java/hex/glm/GLMTask.java b/h2o-algos/src/main/java/hex/glm/GLMTask.java index 75fac2d5c4c2..4449014c186d 100644 --- a/h2o-algos/src/main/java/hex/glm/GLMTask.java +++ b/h2o-algos/src/main/java/hex/glm/GLMTask.java @@ -1547,15 +1547,6 @@ public GLMIterationTask(Key jobKey, DataInfo dinfo, GLMWeightsFun glmw, double _c = c; } - public GLMIterationTask(Key jobKey, DataInfo dinfo, GLMWeightsFun glmw, double [] beta, int c, boolean hasConst) { - super(null,dinfo,jobKey); - _beta = beta; - _ymu = null; - _glmf = glmw; - _c = c; - _hasConstraints = hasConst; - } - @Override public boolean handlesSparseData(){return true;} transient private double _sparseOffset; diff --git a/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java b/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java index c70b76f91ee7..ec60fde3ac8c 100644 --- a/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java +++ b/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java @@ -1063,10 +1063,10 @@ public static int findMinZValue(GLMModel model, List numPredNames, List< } // grab min z-values for numerical and categorical columns PredNameMinZVal numericalPred = findNumMinZVal(numPredNames, zValList, coeffNames); - PredNameMinZVal categoricalPred = findCatMinZVal(model, zValList); + PredNameMinZVal categoricalPred = findCatMinOfMaxZScore(model, zValList); // null if all predictors are inactive // choose the min z-value from numerical and categorical predictors and return its index in predNames - if (categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value + if (categoricalPred != null && categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value return predNames.indexOf(categoricalPred._predName); } else { // numerical pred has minimum z-value return predNames.indexOf(numericalPred._predName); @@ -1095,26 +1095,28 @@ public static PredNameMinZVal findNumMinZVal(List numPredNames, List zValList) { + public static PredNameMinZVal findCatMinOfMaxZScore(GLMModel model, List zValList) { String[] columnNames = model.names(); // column names of dinfo._adaptedFrame int[] catOffsets = model._output.getDinfo()._catOffsets; - double minCatVal = -1; - String catPredMinZ = null; + List bestZValues = new ArrayList<>(); + List catPredNames = new ArrayList<>(); if (catOffsets != null) { - minCatVal = Double.MAX_VALUE; int numCatCol = catOffsets.length - 1; - int numNaN = (int) zValList.stream().filter(x -> Double.isNaN(x)).count(); if (numNaN == zValList.size()) { // if all levels are NaN, this predictor is redundant - new PredNameMinZVal(catPredMinZ, Double.POSITIVE_INFINITY); + return null; } else { for (int catInd = 0; catInd < numCatCol; catInd++) { // go through each categorical column List catZValues = new ArrayList<>(); @@ -1130,15 +1132,17 @@ public static PredNameMinZVal findCatMinZVal(GLMModel model, List zValLi } if (catZValues.size() > 0) { double oneCatMinZ = catZValues.stream().max(Double::compare).get(); // choose the best z-value here - if (oneCatMinZ < minCatVal) { - minCatVal = oneCatMinZ; - catPredMinZ = columnNames[catInd]; - } + bestZValues.add(oneCatMinZ); + catPredNames.add(columnNames[catInd]); } } } } - return new 
PredNameMinZVal(catPredMinZ, minCatVal); + if (bestZValues.size() < 1) + return null; + double maxCatLevel = bestZValues.stream().min(Double::compare).get(); + String catPredBestZ = catPredNames.get(bestZValues.indexOf(maxCatLevel)); + return new PredNameMinZVal(catPredBestZ, maxCatLevel); } static class PredNameMinZVal { diff --git a/h2o-algos/src/main/java/hex/optimization/OptimizationUtils.java b/h2o-algos/src/main/java/hex/optimization/OptimizationUtils.java index 221afa9779cc..d75ebcaa5dc5 100644 --- a/h2o-algos/src/main/java/hex/optimization/OptimizationUtils.java +++ b/h2o-algos/src/main/java/hex/optimization/OptimizationUtils.java @@ -554,6 +554,7 @@ public boolean findAlpha(double[] lambdaEqual, double[] lambdaLessThan, Computat ConstrainedGLMUtils.LinearConstraints[] lessThanEqualToConstraints, GLM.GLMGradientSolver gradientSolver) { if (_currGradDirIP > 0) { + _newBeta = _originalBeta; return false; } GLM.GLMGradientInfo newGrad; @@ -563,6 +564,8 @@ public boolean findAlpha(double[] lambdaEqual, double[] lambdaLessThan, Computat boolean firstWolfe; boolean secondWolfe; boolean alphaiChange; + double gradMagSquare; + boolean gradSmallEnough; for (int index=0; index<_maxIteration; index++) { ArrayUtils.mult(_direction, tempDirection, _alphai); // tempCoef=alpha_i*direction newCoef = ArrayUtils.add(tempDirection, _originalBeta); // newCoef = coef + alpha_i*direction @@ -575,6 +578,8 @@ public boolean findAlpha(double[] lambdaEqual, double[] lambdaLessThan, Computat // calculate new gradient and objective function for new coefficients newCoef newGrad = calGradient(newCoef, state, gradientSolver, lambdaEqual, lambdaLessThan, equalityConstraints, lessThanEqualToConstraints); + gradMagSquare = ArrayUtils.innerProduct(newGrad._gradient, newGrad._gradient); + gradSmallEnough = gradMagSquare <= state._csGLMState._epsilonkCSSquare; // evaluate if first Wolfe condition is satisfied; firstWolfe = evaluateFirstWolfe(newGrad); // evaluate if second Wolfe condition is satisfied; @@ -589,7 +594,11 @@ public boolean findAlpha(double[] lambdaEqual, double[] lambdaLessThan, Computat // set alphai if first Wolfe condition is not satisfied, set alpha i if second Wolfe condition is not satisfied; alphaiChange = setAlphai(firstWolfe, secondWolfe); if (!alphaiChange || _alphar < EPS_CS_SQUARE) { // if alphai, alphar value are not changed and alphar is too small, quit - return false; + if (gradSmallEnough) { + _newBeta = newCoef; + _ginfoOriginal = newGrad; + } + return false; } } return false; diff --git a/h2o-algos/src/main/java/water/tools/MojoConvertTool.java b/h2o-algos/src/main/java/water/tools/MojoConvertTool.java index 9abd23123068..eed6537dc743 100644 --- a/h2o-algos/src/main/java/water/tools/MojoConvertTool.java +++ b/h2o-algos/src/main/java/water/tools/MojoConvertTool.java @@ -33,25 +33,28 @@ void convert() throws IOException { Files.write(pojoPath, pojo.getBytes(StandardCharsets.UTF_8)); } - private static void usage() { - System.err.println("java -cp h2o.jar " + MojoConvertTool.class.getName() + " source_mojo.zip target_pojo.java"); - } - public static void main(String[] args) throws IOException { - if (args.length < 2) { - usage(); + try { + mainInternal(args); + } + catch (IllegalArgumentException e) { + System.err.println(e.getMessage()); System.exit(1); } + } + + public static void mainInternal(String[] args) throws IOException { + if (args.length < 2 || args[0] == null || args[1] == null) { + throw new IllegalArgumentException("java -cp h2o.jar " + MojoConvertTool.class.getName() + " 
source_mojo.zip target_pojo.java"); + } File mojoFile = new File(args[0]); - if (!mojoFile.isFile()) { - System.err.println("Specified MOJO file (" + mojoFile.getAbsolutePath() + ") doesn't exist!"); - System.exit(2); + if (!mojoFile.exists() || !mojoFile.isFile()) { + throw new IllegalArgumentException("Specified MOJO file (" + mojoFile.getAbsolutePath() + ") doesn't exist!"); } File pojoFile = new File(args[1]); if (pojoFile.isDirectory() || (pojoFile.getParentFile() != null && !pojoFile.getParentFile().isDirectory())) { - System.err.println("Invalid target POJO file (" + pojoFile.getAbsolutePath() + ")! Please specify a file in an existing directory."); - System.exit(3); + throw new IllegalArgumentException("Invalid target POJO file (" + pojoFile.getAbsolutePath() + ")! Please specify a file in an existing directory."); } System.out.println(); diff --git a/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java b/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java index 77aa7031195f..a4ae87ea0b03 100644 --- a/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java +++ b/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java @@ -77,8 +77,6 @@ public class GLMConstrainedTest extends TestUtil { double[] _lessGradContr; double _ck = 10; double[] _beta; - double[] _equalGradPenalty; - double[] _lessGradPenalty; Random _obj = new Random(123); @@ -324,15 +322,16 @@ public void generateConstraint4FrameNAnswer() { _coeffNames1.get(19), _coeffNames1.get(20), _coeffNames1.get(21), _coeffNames1.get(22), "constant", _coeffNames1.get(4), _coeffNames1.get(5), _coeffNames1.get(6), "constant", _coeffNames1.get(6), _coeffNames1.get(33), _coeffNames1.get(7), _coeffNames1.get(24), _coeffNames1.get(25), "constant", - _coeffNames1.get(1), _coeffNames1.get(coefLen-3), "constant"}) + _coeffNames1.get(1), _coeffNames1.get(coefLen-3), "constant", _coeffNames1.get(0), _coeffNames1.get(1), "constant"}) .withDataForCol(1, new double [] {-0.3, 0.5, 1.0, -3.0, 3, -4, 0.5, 0.1, -0.2, 2.0, -0.1, -0.4, - 0.8, 0.1, -0.5, 0.7, -1.1, 2.0, 0.5, -0.3, 0.5, -1.5, -0.3, -1.0, 1.0, -9.0}) + 0.8, 0.1, -0.5, 0.7, -1.1, 2.0, 0.5, -0.3, 0.5, -1.5, -0.3, -1.0, 1.0, -9.0,-1, -1, 0}) .withDataForCol(2, new String[] {"lessthanequal", "lessthanequal", "lessthanequal", "lessthanequal", "lessthanequal", "lessthanequal", "lessthanequal", "equal", "equal", "lessthanequal", "lessthanequal", "lessthanequal", "lessthanequal", "equal", "equal", "equal", "equal", "equal", - "equal", "equal", "equal", "equal", "equal", "lessthanequal", "lessthanequal", "lessthanequal"}) + "equal", "equal", "equal", "equal", "equal", "lessthanequal", "lessthanequal", "lessthanequal", + "lessthanequal", "lessthanequal", "lessthanequal"}) .withDataForCol(3, new int[]{0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, - 6, 7, 7 ,7}).build(); + 6, 7, 7 ,7, 8, 8, 8}).build(); Scope.track(_linearConstraint4); } @@ -595,54 +594,6 @@ public void assertCorrectGramMaps(IcedHashMap= 2, beta1 >= 2 - Frame betaConstraint = - new TestFrameBuilder() - .withColNames("names", "lower_bounds", "upper_bounds") - .withVecTypes(T_STR, T_NUM, T_NUM) - .withDataForCol(0, new String[] {_coeffNames1.get(30), _coeffNames1.get(31)}) - .withDataForCol(1, new double [] {2, 2}) - .withDataForCol(2, new double[] {Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY}).build(); - Scope.track(betaConstraint); - - // linear constraint: beta0 + beta1 <= 2, contradicts with beta0 >= 2 and beta1 >= 2 - Frame linearConstraint = new TestFrameBuilder() - .withColNames("names", "values", 
"types", "constraint_numbers") - .withVecTypes(T_STR, T_NUM, T_STR, T_NUM) - .withDataForCol(0, new String[] {_coeffNames1.get(30), _coeffNames1.get(31), "constant"}) - .withDataForCol(1, new double [] {1,1,-2}) - .withDataForCol(2, new String[] {"lessthanequal", "lessthanequal", "lessthanequal"}) - .withDataForCol(3, new int[]{0,0,0}).build(); - Scope.track(linearConstraint); - - Frame train = parseAndTrackTestFile("smalldata/glm_test/gaussian_20cols_10000Rows.csv"); - transformFrameCreateCoefNames(train); - GLMModel.GLMParameters params = new GLMModel.GLMParameters(gaussian); - params._standardize = false; - params._response_column = "C21"; - params._solver = IRLSM; - params._train = train._key; - params._beta_constraints = betaConstraint._key; - params._max_iterations = 1; - params._expose_constraints = true; - params._linear_constraints = linearConstraint._key; - params._lambda = new double[]{0}; - GLMModel glm2 = new GLM(params).trainModel().get(); - Scope.track_generic(glm2); - assertTrue("Should have thrown an error due to duplicated constraints.", 1==2); - } catch(IllegalArgumentException ex) { - assert ex.getMessage().contains("redundant and possibly conflicting linear constraints") : "Wrong error message. Error should be about" + - " redundant linear constraints"; - } finally { - Scope.exit(); - } - } - // linear constraints with two duplicated constraints @Test public void testDuplicateLinearConstraints() { @@ -663,7 +614,7 @@ public void testDuplicateLinearConstraints() { Scope.track_generic(glm2); assert 1==2 : "Should have thrown an error due to duplicated constraints."; } catch(IllegalArgumentException ex) { - assert ex.getMessage().contains("redundant and possibly conflicting linear constraints") : "Wrong error message. Error should be about" + + assert ex.getMessage().contains("redundant linear constraints:") : "Wrong error message. Error should be about" + " redundant linear constraints"; } finally { Scope.exit(); @@ -691,7 +642,7 @@ public void testDuplicateBetaLinearConstraints() { Scope.track_generic(glm2); assert 1==2 : "Should have thrown an error due to duplicated constraints."; } catch(IllegalArgumentException ex) { - assert ex.getMessage().contains("redundant and possibly conflicting linear constraints") : "Wrong error message. Error should be about" + + assert ex.getMessage().contains("redundant linear constraints") : "Wrong error message. 
Error should be about" + " redundant linear constraints"; } finally { Scope.exit(); diff --git a/h2o-assemblies/main/build.gradle b/h2o-assemblies/main/build.gradle index 6231736b5867..17b240dc7418 100644 --- a/h2o-assemblies/main/build.gradle +++ b/h2o-assemblies/main/build.gradle @@ -52,11 +52,14 @@ dependencies { // Upgrade dependencies coming from Hadoop to address vulnerabilities api "org.apache.commons:commons-compress:1.26.0" - api "com.google.protobuf:protobuf-java:3.21.7" + api "com.google.protobuf:protobuf-java:3.25.5" constraints { - api('com.fasterxml.jackson.core:jackson-databind:2.13.4.2') { + api('com.fasterxml.jackson.core:jackson-databind:2.17.2') { because 'Fixes CVE-2022-42003' + because 'Fixes PRISMA-2023-0067' + because 'Fixes CVE-2023-35116' + because 'Fixes sonatype-2024-0171' } api('org.jetbrains.kotlin:kotlin-stdlib:1.6.21') { because 'Fixes CVE-2020-29582' @@ -86,6 +89,12 @@ dependencies { api('org.apache.commons:commons-configuration2:2.10.1') { because 'Fixes CVE-2024-29131' } + api('dnsjava:dnsjava:3.6.0') { + because 'Fixes SNYK-JAVA-DNSJAVA-7547403' + because 'Fixes SNYK-JAVA-DNSJAVA-7547404' + because 'Fixes SNYK-JAVA-DNSJAVA-7547405' + because 'Fixes CVE-2024-25638' + } } } @@ -93,10 +102,6 @@ shadowJar { zip64 true mergeServiceFiles() classifier = '' - // CDH 5.3.0 provides joda-time v1.6 which is too old, shadow the library instead - if (!project.hasProperty("jacocoCoverage")) { - relocate 'org.joda.time', 'ai.h2o.org.joda.time' - } exclude 'META-INF/*.DSA' exclude 'META-INF/*.SF' exclude 'synchronize.properties' @@ -104,6 +109,7 @@ shadowJar { exclude 'test.properties' exclude 'cockpitlite.properties' exclude 'devpay_products.properties' + exclude 'javax/servlet/jsp/resources/*.*' // Need to a newer org.apache.hadoop.hive.shims.ShimLoader to make older hive JDBC drivers work on Hadoop 3. // Excluding other classes of org.apache.hive.shims:hive-shims-common. 
diff --git a/h2o-assemblies/minimal/build.gradle b/h2o-assemblies/minimal/build.gradle index 2a0fe0b54359..e83faa9912bf 100644 --- a/h2o-assemblies/minimal/build.gradle +++ b/h2o-assemblies/minimal/build.gradle @@ -41,6 +41,7 @@ shadowJar { exclude 'test.properties' exclude 'cockpitlite.properties' exclude 'devpay_products.properties' + exclude 'javax/servlet/jsp/resources/*.*' manifest { attributes 'Main-Class': 'water.H2OApp' attributes 'Add-Opens': 'java.base/java.lang java.base/java.util java.base/java.lang.reflect' diff --git a/h2o-assemblies/steam/build.gradle b/h2o-assemblies/steam/build.gradle index 2f177ecd991b..d7612f7f9e1f 100644 --- a/h2o-assemblies/steam/build.gradle +++ b/h2o-assemblies/steam/build.gradle @@ -51,10 +51,11 @@ dependencies { api "com.google.oauth-client:google-oauth-client:1.33.3" constraints { - api('com.fasterxml.jackson.core:jackson-databind:2.16.1') { + api('com.fasterxml.jackson.core:jackson-databind:2.17.2') { because 'Fixes CVE-2022-42003' because 'Fixes PRISMA-2023-0067' because 'Fixes CVE-2023-35116' + because 'Fixes sonatype-2024-0171' } api('org.codehaus.jettison:jettison:1.5.4') { because 'Fixes CVE-2023-1436' @@ -84,6 +85,12 @@ dependencies { api('org.apache.commons:commons-configuration2:2.10.1') { because 'Fixes CVE-2024-29131' } + api('dnsjava:dnsjava:3.6.0') { + because 'Fixes SNYK-JAVA-DNSJAVA-7547403' + because 'Fixes SNYK-JAVA-DNSJAVA-7547404' + because 'Fixes SNYK-JAVA-DNSJAVA-7547405' + because 'Fixes CVE-2024-25638' + } } } @@ -97,6 +104,7 @@ shadowJar { exclude 'test.properties' exclude 'cockpitlite.properties' exclude 'devpay_products.properties' + exclude 'javax/servlet/jsp/resources/*.*' manifest { attributes 'Main-Class': 'water.H2OApp' attributes 'Add-Opens': 'java.base/java.lang java.base/java.util java.base/java.lang.reflect' diff --git a/h2o-bindings/bin/custom/R/gen_stackedensemble.py b/h2o-bindings/bin/custom/R/gen_stackedensemble.py index 397eb1e62b1a..66ddb1b764da 100644 --- a/h2o-bindings/bin/custom/R/gen_stackedensemble.py +++ b/h2o-bindings/bin/custom/R/gen_stackedensemble.py @@ -90,8 +90,12 @@ def update_param(name, param): h2o.init() # Import a sample binary outcome train/test set -train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") -test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") +train <- h2o.importFile( + "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv" + ) +test <- h2o.importFile( + "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv" + ) # Identify predictors and response y <- "response" diff --git a/h2o-bindings/bin/custom/python/gen_dt.py b/h2o-bindings/bin/custom/python/gen_dt.py index 1b781ad19880..71a0ab5a1ee8 100644 --- a/h2o-bindings/bin/custom/python/gen_dt.py +++ b/h2o-bindings/bin/custom/python/gen_dt.py @@ -6,3 +6,62 @@ Builds a Decision Tree (DT) on a preprocessed dataset. """ ) +examples = dict( + categorical_encoding=""" + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate["RACE"] = prostate["RACE"].asfactor() + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... 
categorical_encoding="binary") + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) + """, + ignore_const_cols=""" + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> prostate["const_1"] = 6 + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... ignore_const_cols=True) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) + """, + max_depth=""" + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) + """, + min_rows=""" + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... min_rows=20) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) + """ +) diff --git a/h2o-bindings/bin/custom/python/gen_rulefit.py b/h2o-bindings/bin/custom/python/gen_rulefit.py index 55a01c4057db..9ac14b842743 100644 --- a/h2o-bindings/bin/custom/python/gen_rulefit.py +++ b/h2o-bindings/bin/custom/python/gen_rulefit.py @@ -6,6 +6,21 @@ def rule_importance(self): Retrieve rule importances for a Rulefit model :return: H2OTwoDimTable + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> rule_importance = rfit.rule_importance() + >>> print(rfit.rule_importance()) """ if self._model_json["algo"] != "rulefit": raise H2OValueError("This function is available for Rulefit models only") @@ -18,11 +33,29 @@ def rule_importance(self): def predict_rules(self, frame, rule_ids): """ - Evaluates validity of the given rules on the given data. + Evaluates validity of the given rules on the given data. :param frame: H2OFrame on which rule validity is to be evaluated :param rule_ids: string array of rule ids to be evaluated against the frame :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not. 
+ + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv" + >>> df = h2o.import_file(path=f, col_types={'species': "enum"}) + >>> x = df.columns + >>> y = "species" + >>> x.remove(y) + >>> train, test = df.split_frame(ratios=[.8], seed=1234) + >>> rfit = H2ORuleFitEstimator(min_rule_length=4, + ... max_rule_length=5, + ... max_num_rules=3, + ... seed=1234, + ... model_type="rules") + >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test) + >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])) """ from h2o.frame import H2OFrame from h2o.utils.typechecks import assert_is_type @@ -52,3 +85,126 @@ def predict_rules(self, frame, rule_ids): """ ), ) + +examples = dict( + algorithm=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... algorithm="gbm", +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) + +""", + max_categorical_levels=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... max_categorical_levels=11, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + max_num_rules=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=3, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + min_rule_length=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... min_rule_length=4, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + max_rule_length=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... min_rule_length=3, +... 
seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + model_type=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... model_type="rules", +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + distribution=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... distribution="bernoulli", +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + rule_generation_ntrees=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... rule_generation_ntrees=60, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""" +) diff --git a/h2o-core/build.gradle b/h2o-core/build.gradle index 2720b7cfddad..bd32b554a2f6 100644 --- a/h2o-core/build.gradle +++ b/h2o-core/build.gradle @@ -17,6 +17,7 @@ dependencies { api 'org.javassist:javassist:3.28.0-GA' api 'org.apache.commons:commons-math3:3.6.1' api "commons-io:commons-io:2.11.0" + api 'com.github.luben:zstd-jni:1.5.6-2' compileOnly "javax.servlet:javax.servlet-api:3.1.0" api("com.github.wendykierp:JTransforms:3.1") { exclude module: "junit" } api project(":h2o-jaas-pam") diff --git a/h2o-core/src/main/java/water/api/CloudHandler.java b/h2o-core/src/main/java/water/api/CloudHandler.java index 684dc9202d77..4058c9931d02 100644 --- a/h2o-core/src/main/java/water/api/CloudHandler.java +++ b/h2o-core/src/main/java/water/api/CloudHandler.java @@ -40,7 +40,7 @@ public CloudV3 status(int version, CloudV3 cloud) { cloud.consensus = Paxos._commonKnowledge; cloud.locked = Paxos._cloudLocked; cloud.internal_security_enabled = H2OSecurityManager.instance().securityEnabled; - + cloud.web_ip = H2O.ARGS.web_ip; // set leader H2ONode leader = H2O.CLOUD.leaderOrNull(); // leader might be null in client mode if clouding didn't finish yet diff --git a/h2o-core/src/main/java/water/api/schemas3/CloudV3.java b/h2o-core/src/main/java/water/api/schemas3/CloudV3.java index 7bdccce6b43c..99b2956e0a5c 100644 --- a/h2o-core/src/main/java/water/api/schemas3/CloudV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/CloudV3.java @@ -110,6 +110,9 @@ public CloudV3() {} @API(help="leader_idx", direction=API.Direction.OUTPUT) public int leader_idx = -1; + @API(help="web_ip", direction=API.Direction.OUTPUT) + public String web_ip=null; + // Output fields one-per-JVM public static class NodeV3 extends SchemaV3 { public NodeV3() {} diff --git 
a/h2o-core/src/main/java/water/api/schemas3/FramesV3.java b/h2o-core/src/main/java/water/api/schemas3/FramesV3.java index b95c91458b12..610ee0adf1de 100644 --- a/h2o-core/src/main/java/water/api/schemas3/FramesV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/FramesV3.java @@ -47,7 +47,7 @@ public class FramesV3 extends RequestSchemaV3 { @API(help="Output file format. Defaults to 'csv'.", values = { "csv", "parquet"} , json=false) public ExportFileFormat format; - @API(help="Compression method (default none; gzip, bzip2 and snappy available depending on runtime environment)") + @API(help="Compression method (default none; gzip, bzip2, zstd and snappy available depending on runtime environment)") public String compression; @API(help="Specifies if checksum should be written next to data files on export (if supported by export format).") diff --git a/h2o-core/src/main/java/water/parser/ParseDataset.java b/h2o-core/src/main/java/water/parser/ParseDataset.java index c2f06a4398e5..c339f54bb926 100644 --- a/h2o-core/src/main/java/water/parser/ParseDataset.java +++ b/h2o-core/src/main/java/water/parser/ParseDataset.java @@ -1,5 +1,6 @@ package water.parser; +import com.github.luben.zstd.ZstdInputStream; import jsr166y.CountedCompleter; import jsr166y.ForkJoinTask; import jsr166y.RecursiveAction; @@ -906,6 +907,17 @@ private FVecParseWriter makeDout(ParseSetup localSetup, int chunkOff, int nchunk chunksAreLocal(vec,chunkStartIdx,key); break; } + case ZSTD: { + localSetup = ParserService.INSTANCE.getByInfo(localSetup._parse_type).setupLocal(vec, localSetup); + try (InputStream bvs = vec.openStream(_jobKey); + InputStream dec = decryptionTool.decryptInputStream(bvs); + ZstdInputStream zstdIs = new ZstdInputStream(dec)) { + _dout[_lo] = streamParse(zstdIs, localSetup, makeDout(localSetup, chunkStartIdx, vec.nChunks()), bvs); + } + _errors = _dout[_lo].removeErrors(); + chunksAreLocal(vec, chunkStartIdx, key); + break; + } } Log.trace("Finished a map stage of a file parse with start index "+chunkStartIdx+"."); } catch( IOException ioe ) { diff --git a/h2o-core/src/main/java/water/parser/ZipUtil.java b/h2o-core/src/main/java/water/parser/ZipUtil.java index aa2d333e24f7..61f3793008d6 100644 --- a/h2o-core/src/main/java/water/parser/ZipUtil.java +++ b/h2o-core/src/main/java/water/parser/ZipUtil.java @@ -1,5 +1,6 @@ package water.parser; +import com.github.luben.zstd.ZstdInputStream; import water.DKV; import water.Iced; import water.Key; @@ -23,7 +24,8 @@ public abstract class ZipUtil { - public enum Compression { NONE, ZIP, GZIP } + public enum Compression { NONE, ZIP, GZIP, ZSTD } + public static int ZSTD_MAGIC = 0xFD2FB528; /** * This method will attempt to read the few bytes off a file which will in turn be used @@ -147,6 +149,8 @@ static Compression guessCompressionMethod(byte [] bits) { return Compression.ZIP; if( bits.length > 2 && (UnsafeUtils.get2(bits,0)&0xffff) == GZIPInputStream.GZIP_MAGIC ) return Compression.GZIP; + if (bits.length >= 4 && UnsafeUtils.get4(bits, 0) == ZSTD_MAGIC) + return Compression.ZSTD; return Compression.NONE; } @@ -185,7 +189,7 @@ static byte[] unzipBytes( byte[] bs, Compression cmp, int chkSize ) { if( cmp == Compression.NONE ) return bs; // No compression // Wrap the bytes in a stream ByteArrayInputStream bais = new ByteArrayInputStream(bs); - InflaterInputStream is = null; + InputStream is = null; try { if (cmp == Compression.ZIP) { ZipInputStream zis = new ZipInputStream(bais); @@ -194,7 +198,10 @@ static byte[] unzipBytes( byte[] bs, Compression cmp, int 
chkSize ) { if (ze == null || ze.isDirectory()) zis.getNextEntry(); // read the next entry which should be a file is = zis; - } else { + } else if (cmp == Compression.ZSTD) { + is = new ZstdInputStream(bais); + } + else { assert cmp == Compression.GZIP; is = new GZIPInputStream(bais); } diff --git a/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java b/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java index 3fe4bf179866..533e9ce7f26d 100644 --- a/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java +++ b/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java @@ -33,10 +33,12 @@ public ValStr apply(Env env, Env.StackHelp stk, AstRoot[] asts) { try { // only allow to run approved tools (from our package), not just anything on classpath Class clazz = Class.forName(TOOLS_PACKAGE + toolClassName); - Method mainMethod = clazz.getDeclaredMethod("main", String[].class); + Method mainMethod = clazz.getDeclaredMethod("mainInternal", String[].class); mainMethod.invoke(null, new Object[]{args}); } catch (Exception e) { - throw new RuntimeException(e); + RuntimeException shorterException = new RuntimeException(e.getCause() != null ? e.getCause().getMessage() : e.getMessage()); // guard against exceptions raised without a cause + shorterException.setStackTrace(new StackTraceElement[0]); + throw shorterException; } return new ValStr("OK"); } diff --git a/h2o-core/src/main/java/water/tools/EncryptionTool.java b/h2o-core/src/main/java/water/tools/EncryptionTool.java index e3faabf7d12e..73f12f0c9a96 100644 --- a/h2o-core/src/main/java/water/tools/EncryptionTool.java +++ b/h2o-core/src/main/java/water/tools/EncryptionTool.java @@ -47,6 +47,9 @@ public void encrypt(File input, File output) throws IOException, GeneralSecurity } public static void main(String[] args) throws GeneralSecurityException, IOException { + mainInternal(args); + } + public static void mainInternal(String[] args) throws GeneralSecurityException, IOException { EncryptionTool et = new EncryptionTool(); et._keystore_file = new File(args[0]); et._keystore_type = args[1]; diff --git a/h2o-core/src/main/java/water/util/CompressionFactory.java b/h2o-core/src/main/java/water/util/CompressionFactory.java index c141e4f2a865..c1af171ae8f0 100644 --- a/h2o-core/src/main/java/water/util/CompressionFactory.java +++ b/h2o-core/src/main/java/water/util/CompressionFactory.java @@ -1,5 +1,6 @@ package water.util; +import com.github.luben.zstd.ZstdOutputStream; import water.Iced; import java.io.ByteArrayOutputStream; @@ -23,6 +24,8 @@ OutputStream wrapOutputStream(OutputStream os) throws IOException { return os; case "gzip": return new GZIPOutputStream(os); + case "zstd": + return new ZstdOutputStream(os); case "bzip2": return wrapDynamic("org.python.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream", os); case "snappy": diff --git a/h2o-core/src/main/java/water/util/DecompressionFactory.java b/h2o-core/src/main/java/water/util/DecompressionFactory.java index 1dd7553e0e7c..ac72c820198c 100644 --- a/h2o-core/src/main/java/water/util/DecompressionFactory.java +++ b/h2o-core/src/main/java/water/util/DecompressionFactory.java @@ -1,5 +1,6 @@ package water.util; +import com.github.luben.zstd.ZstdInputStream; import water.Iced; import java.io.IOException; @@ -26,6 +27,8 @@ InputStream wrapInputStream(InputStream is) throws IOException { return wrapDynamic("org.python.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream", is); case "snappy": return wrapDynamic("org.xerial.snappy.SnappyInputStream", is); + case "zstd": + return new
ZstdInputStream(is); default: return wrapDynamic(_name, is); } diff --git a/h2o-core/src/main/java/water/util/Log.java b/h2o-core/src/main/java/water/util/Log.java index dc1658f2ec8f..72f8ebf354ba 100644 --- a/h2o-core/src/main/java/water/util/Log.java +++ b/h2o-core/src/main/java/water/util/Log.java @@ -29,8 +29,8 @@ abstract public class Log { public static final byte INFO = 3; public static final byte DEBUG= 4; public static final byte TRACE= 5; - public static final String[] LVLS = { "FATAL", "ERRR", "WARN", "INFO", "DEBUG", "TRACE" }; + private static final String PROP_MAX_PID_LENGTH = H2O.OptArgs.SYSTEM_PROP_PREFIX + "log.max.pid.length"; private static int _level = INFO; private static boolean _quiet = false; @@ -262,7 +262,15 @@ public static String getLogFilePath(String level) { private static String getHostPortPid() { String host = H2O.SELF_ADDRESS.getHostAddress(); - return fixedLength(host + ":" + H2O.API_PORT + " ", 22) + fixedLength(H2O.PID + " ", 6); + return fixedLength(host + ":" + H2O.API_PORT + " ", 22) + fixedLength(H2O.PID + " ", maximumPidLength() + 2); + } + + // set sys.ai.h2o.log.max.pid.length to avoid h2o-3 trimming PID in the logs + private static int maximumPidLength() { + String maxPidPropertyValue = System.getProperty(PROP_MAX_PID_LENGTH); + return maxPidPropertyValue != null + ? Integer.parseInt(maxPidPropertyValue) + : 4; } private static synchronized Logger createLog4j() { diff --git a/h2o-dist/buildinfo.json b/h2o-dist/buildinfo.json index cef7ad96ff8d..8e258695c572 100644 --- a/h2o-dist/buildinfo.json +++ b/h2o-dist/buildinfo.json @@ -178,6 +178,11 @@ "zip_file_name" : "h2o-SUBST_PROJECT_VERSION-mapr6.2.zip", "zip_file_path" : "h2o-SUBST_PROJECT_VERSION-mapr6.2.zip" }, + { + "distribution" : "mapr7.0", + "zip_file_name" : "h2o-SUBST_PROJECT_VERSION-mapr7.0.zip", + "zip_file_path" : "h2o-SUBST_PROJECT_VERSION-mapr7.0.zip" + }, { "distribution" : "iop4.2", "zip_file_name" : "h2o-SUBST_PROJECT_VERSION-iop4.2.zip", diff --git a/h2o-docs/src/product/admissible.rst b/h2o-docs/src/product/admissible.rst index c673223ec491..361fe1db5eff 100644 --- a/h2o-docs/src/product/admissible.rst +++ b/h2o-docs/src/product/admissible.rst @@ -167,7 +167,7 @@ The code below generates an infogram, and we plot the infogram and view the data h2o.init() # Import credit dataset - f <- "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv" + f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv" col_types <- list(by.col.name = c("SEX", "MARRIAGE", "default_payment_next_month"), types = c("factor", "factor", "factor")) df <- h2o.importFile(path = f, col.types = col_types) @@ -201,7 +201,7 @@ The code below generates an infogram, and we plot the infogram and view the data h2o.init() # Import credit dataset - f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv" + f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv" col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"} df = h2o.import_file(path=f, col_types=col_types) @@ -259,7 +259,7 @@ The code below generates an infogram, and we plot the infogram and view the data h2o.init() # Import HMDA dataset - f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types <- list(by.col.name = 
c("high_priced"), types = c("factor")) df <- h2o.importFile(path = f, col.types = col_types) @@ -303,7 +303,7 @@ The code below generates an infogram, and we plot the infogram and view the data h2o.init() # Import HDMA dataset - f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types = {'high_priced': "enum"} df = h2o.import_file(path=f, col_types=col_types) @@ -548,7 +548,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the .. tabs:: .. code-tab:: r R - f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types <- list(by.col.name = c("high_priced"), types = c("factor")) df <- h2o.importFile(path = f, col.types = col_types) @@ -590,7 +590,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the .. code-tab:: python # Import HDMA dataset - f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types = {'high_priced': "enum"} df = h2o.import_file(path=f, col_types=col_types) @@ -638,7 +638,7 @@ Characteristics or Precision-Recall Curves. .. tabs:: .. code-tab:: r R - f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types <- list(by.col.name = c("high_priced"), types = c("factor")) df <- h2o.importFile(path = f, col.types = col_types) @@ -670,7 +670,7 @@ Characteristics or Precision-Recall Curves. .. code-tab:: python # Import HDMA dataset - f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types = {'high_priced': "enum"} df = h2o.import_file(path=f, col_types=col_types) @@ -749,7 +749,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair .. tabs:: .. code-tab:: r R - f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types <- list(by.col.name = c("high_priced"), types = c("factor")) df <- h2o.importFile(path = f, col.types = col_types) @@ -781,7 +781,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair .. code-tab:: python # Import HDMA dataset - f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv" + f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv" col_types = {'high_priced': "enum"} df = h2o.import_file(path=f, col_types=col_types) @@ -836,4 +836,4 @@ Subhadeep Mukhopadhyay. *InfoGram and Admissible Machine Learning*, August 2021. LUM, Kristian, ZHANG, Yunfeng and BOWER, Amanda. *De-biasing “bias” measurement*, June 2022. `arXiv Url `__. -HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url `__. \ No newline at end of file +HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url `__. 
diff --git a/h2o-docs/src/product/api-changes.rst b/h2o-docs/src/product/api-changes.rst index eb69a094c339..6ce3357ba0c1 100644 --- a/h2o-docs/src/product/api-changes.rst +++ b/h2o-docs/src/product/api-changes.rst @@ -1,67 +1,72 @@ -API-Related Changes ------------------- +API-related changes +=================== H2O-3 does its best to keep backwards compatibility between major versions, but sometimes breaking changes are needed in order to improve code quality and to address issues. This section provides a list of current breaking changes between specific releases. From 3.32.0.1 -~~~~~~~~~~~~~ +------------- Modules -''''''' +~~~~~~~ The deprecated ``h2o-scala`` module has been removed. Target Encoding -''''''''''''''' +~~~~~~~~~~~~~~~ -The Target Encoder API has been clarified and its consistency across clients has been improved: +The Target Encoder API has been clarified and its consistency across clients has been improved. The following parameters are now deprecated in all clients and officially replaced by their new alternative: -The following parameters are now deprecated in all clients and officially replaced by their new alternative: -- ``k`` -> ``inflection_point`` -- ``f`` -> ``smoothing`` -- ``noise_level`` -> ``noise`` -- ``use_blending`` (R only) -> ``blending`` +- ``k`` :math:`\to` ``inflection_point`` +- ``f`` :math:`\to` ``smoothing`` +- ``noise_level`` :math:`\to` ``noise`` +- ``use_blending`` (R only) :math:`\to` ``blending`` -Legacy client code using the deprecated parameters should expect a deprecation warning when using them, they are strongly encouraged to update their code to use the new naming. +Legacy client code using the deprecated parameters should expect a deprecation warning when using them. You are strongly encouraged to update your code to use the new naming. + +``transform`` parameter updates +''''''''''''''''''''''''''''''' + +With the objective of optimizing performance on the backend and simplifying the API, the ``transform`` method used to apply target encoding was modified as follows: -In an objective of performance optimization on the backend, and of simplification of the API, the ``transform`` method used to apply target encoding was modified as follow. - The R ``h2o.transform`` function (accepting a target encoder model as the first argument) and the Python ``H2OTargetEncoderEstimator.transform`` methods are now fully compatible: they accept the same parameters and work consistently. -- The parameters ``data_leakage_handling``, ``seed`` are now ignored on those methods: ``transform`` will use by default the corresponding values defined when building the TargetEncoder model. +- The parameters ``data_leakage_handling``, ``seed`` are now ignored on those methods: by default, ``transform`` will use the corresponding values defined when building the TargetEncoder model. - The other regularization parameters on these ``transform`` methods (e.g. ``noise``, ``blending``, ``inflection_point``, ``smoothing``), always default to the value defined on the TargetEncoder model. - A new ``as_training`` parameter has been introduced to simplify and enforce a correct usage of target encoding: - - When transforming a training dataset, user should use (R) ``h2o.transform(te_model, train_dataset, as_training=TRUE)`` or (Python) ``te_model.transform(train_dataset, as_training=True)``. 
- - When transforming any other dataset (validation, test, ...), user can just use (R) ``h2o.transform(te_model, train_dataset)`` or (Python) ``te_model.transform(train_dataset)``. + - When transforming a training dataset, you should use (R) ``h2o.transform(te_model, train_dataset, as_training=TRUE)`` or (Python) ``te_model.transform(train_dataset, as_training=True)``. + - When transforming any other dataset (validation, test, ...), you can just use (R) ``h2o.transform(te_model, train_dataset)`` or (Python) ``te_model.transform(train_dataset)``. - Legacy code using for example ``h2o.transform(te_model, train_dataset, data_leakage_handling="KFold")`` will now be translated internally to ``h2o.transform(te_model, train_dataset, as_training=TRUE)``. -Finally the following APIs, deprecated since 3.28, have been fully removed: +Finally the following APIs (deprecated since 3.28) have been fully removed: + - Python: ``h2o.targetencoder`` module. - R: ``h2o.target_encode_fit`` and ``h2o.target_encode_transform`` functions. Parameters -'''''''''' -The ``max_hit_ratio_k`` param has been removed. +~~~~~~~~~~ + +The ``max_hit_ratio_k`` parameter has been removed. From 3.30.1.2 -~~~~~~~~~~~~~ +------------- -The ``max_hit_ratio_k`` param is deprecated in version 3.30.1.2 and will be completely removed in the next major version, 3.32.0.1. +The ``max_hit_ratio_k`` parameter is deprecated in version 3.30.1.2 and will be completely removed in the next major version, 3.32.0.1. From 3.30.1.1 -~~~~~~~~~~~~~ +------------- The deprecated ``h2o-scala`` module has been removed. From 3.30.0.5 -~~~~~~~~~~~~~ +------------- The ``h2o-scala`` module is deprecated in version 3.30.0.5 and will be completely removed in the next major version, 3.30.1.1. From 3.30.0.4 -~~~~~~~~~~~~~ +------------- The following options are no longer supported by native `XGBoost `__ and have been removed. @@ -69,21 +74,20 @@ The following options are no longer supported by native `XGBoost `__ -- `Word2vec `__ +H2O-3 also has methods for feature engineering. `Target Encoding `__ is a categorical encoding technique which replaces a categorical value with the mean of the target variable (this is especially useful for high-cardinality features). `Word2vec `__ is a text processing method which converts a corpus of text into an output of word vectors. diff --git a/h2o-docs/src/product/data-munging/change-column-type.rst b/h2o-docs/src/product/data-munging/change-column-type.rst index 060e590644af..1bc27b79747a 100644 --- a/h2o-docs/src/product/data-munging/change-column-type.rst +++ b/h2o-docs/src/product/data-munging/change-column-type.rst @@ -1,46 +1,16 @@ .. _change-column-type: -Changing the Column Type ------------------------- - -H2O algorithms will treat a problem as a classification problem if the column type is ``factor`` and a regression problem if the column type is ``numeric``. You can force H2O to use either classification or regression by changing the column type. - -.. tabs:: - .. 
code-tab:: r R - - library(h2o) - h2o.init() - - # Import the cars dataset: - cars_df <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") - - # Check the column type for the "cylinders" column: - print(h2o.isnumeric(cars_df["cylinders"])) - #TRUE - - # Change the column type to a factor: - cars_df["cylinders"] <- as.factor(cars_df["cylinders"]) - - # Verify that the column is now a factor: - print(h2o.isfactor(cars_df["cylinders"])) - #TRUE - - # Change the column type back to numeric: - cars_df["cylinders"] <- as.numeric(cars_df["cylinders"]) - # Verify that the column is now numeric and not a factor: - print(h2o.isfactor(cars_df["cylinders"])) - #FALSE - print(h2o.isnumeric(cars_df["cylinders"])) - #TRUE +Change the Column Type +====================== - # Change multiple columns to factors: - cars_df[c("cylinders","economy_20mpg")] <- as.factor(cars_df[c("cylinders","economy_20mpg")]) +You can change the column type using H2O-3's capabilities. - # Verify that the columns are now factors: - print(h2o.isfactor(cars_df[c("cylinders","economy_20mpg")])) - # TRUE TRUE +``factor`` and ``numeric`` +-------------------------- +H2O-3 algorithms will treat a problem as a classification problem if the column type is ``factor`` and as a regression problem if the column type is ``numeric``. You can force H2O-3 to use either classification or regression by changing the column type. +.. tabs:: .. code-tab:: python import h2o @@ -78,51 +48,63 @@ H2O algorithms will treat a problem as a classification problem if the column ty print(cars_df[['cylinders','economy_20mpg']].isfactor()) # [True, True] + .. code-tab:: r R -If the column type is ``enum`` and you want to convert it to ``numeric``, you should first convert it to ``character`` then convert it to ``numeric``. Otherwise, the values may be converted to underlying factor values, not the expected mapped values. + library(h2o) + h2o.init() -.. tabs:: - .. code-tab:: r R + # Import the cars dataset: + cars_df <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") - # Using the data from the above example, convert the 'name' column to numeric: - cars_df["name"] <- as.character(cars_df["name"]) - cars_df["name"] <- as.numeric(cars_df["name"]) + # Check the column type for the "cylinders" column: + print(h2o.isnumeric(cars_df["cylinders"])) + #TRUE + # Change the column type to a factor: + cars_df["cylinders"] <- as.factor(cars_df["cylinders"]) - .. code-tab:: python + # Verify that the column is now a factor: + print(h2o.isfactor(cars_df["cylinders"])) + #TRUE - # Using the data from the above example, convert the 'name' column to numeric: - cars_df['name'] = cars_df['name'].ascharacter().asnumeric() + # Change the column type back to numeric: + cars_df["cylinders"] <- as.numeric(cars_df["cylinders"]) + # Verify that the column is now numeric and not a factor: + print(h2o.isfactor(cars_df["cylinders"])) + #FALSE + print(h2o.isnumeric(cars_df["cylinders"])) + #TRUE -Converting Dates Columns -~~~~~~~~~~~~~~~~~~~~~~~~ + # Change multiple columns to factors: + cars_df[c("cylinders","economy_20mpg")] <- as.factor(cars_df[c("cylinders","economy_20mpg")]) -H2O represents dates as (unix) timestamps. These are then raw input to the algorithm, however, this is not very useful in most cases. You are expected to do your own feature engineering and break the data into day, month, and year using the functions H2O provides. 
+ # Verify that the columns are now factors: + print(h2o.isfactor(cars_df[c("cylinders","economy_20mpg")])) + # TRUE TRUE + +``enum`` and ``numeric`` +------------------------ + +If the column type is ``enum`` and you want to convert it to ``numeric``, you should first convert it to ``character`` then convert it to ``numeric``. Otherwise, the values may be converted to underlying factor values, not the expected mapped values. .. tabs:: - .. code-tab:: r R + .. code-tab:: python - library(h2o) - h2o.init() + # Using the data from the above example, convert the 'name' column to numeric: + cars_df['name'] = cars_df['name'].ascharacter().asnumeric() - # Import the cars dataset: - cars_df <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") + .. code-tab:: r R - # Check the column type for the "cylinders" column: - print(h2o.isnumeric(cars_df["cylinders"])) - #TRUE + # Using the data from the above example, convert the 'name' column to numeric: + cars_df["name"] <- as.character(cars_df["name"]) + cars_df["name"] <- as.numeric(cars_df["name"]) - # Change the column type to a factor: - cars_df["cylinders"] <- as.factor(cars_df["cylinders"]) +Convert date columns +--------------------- - # Verify that the column is now a factor: - print(h2o.isfactor(cars_df["cylinders"])) - #TRUE +H2O-3 represents dates as (unix) timestamps. These are then raw input to the algorithm. However, this is not very useful in most cases. You are expected to do your own feature engineering and break the data into day, month, and year using the functions H2O-3 provides. - # Change the column type back to numeric: - cars_df["cylinders"] <- as.numeric(cars_df["cylinders"]) - # Verify that the column is now numeric and not a factor: - print(h2o.isfactor(cars_df["cylinders"])) - #FALSE - print(h2o.isnumeric(cars_df["cylinders"])) - #TRUE +.. tabs:: .. code-tab:: python import h2o @@ -147,4 +129,25 @@ H2O represents dates as (unix) timestamps. These are then raw input to the algor hdf["ds3"].minute() hdf["ds3"].second() + .. code-tab:: r R + + library(h2o) + h2o.init() + + # convert the frame (containing strings / categoricals) into the date format: + hdf <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/jira/v-11-eurodate.csv") + h2o.as_date(hdf["ds5"], c("%d.%m.%y %H:%M")) + + # You can also access the date/time information from the raw data. + # Access the day of week: + h2o.dayOfWeek(hdf["ds3"]) + + # Access the year, month, week, and day: + h2o.year(hdf["ds3"]) + h2o.month(hdf["ds3"]) + h2o.week(hdf["ds3"]) + h2o.day(hdf["ds3"]) + + # Access the hour: + h2o.hour(hdf["ds3"]) \ No newline at end of file diff --git a/h2o-docs/src/product/data-munging/combining-columns.rst b/h2o-docs/src/product/data-munging/combining-columns.rst index 58b3c947d51a..19507b034e19 100644 --- a/h2o-docs/src/product/data-munging/combining-columns.rst +++ b/h2o-docs/src/product/data-munging/combining-columns.rst @@ -1,65 +1,13 @@ -Combining Columns from Two Datasets ------------------------------------ +Combine columns from two datasets +================================= -The ``cbind`` function allows you to combine datasets by adding columns from one dataset into another. Note that when using ``cbind``, the two datasets must have the same number of rows. In addition, if the datasets contain common column names, H2O will append the joined column with ``0``. +The ``cbind`` function lets you combine datasets by adding columns from one dataset into another. If the datasets contain common column names, H2O will append the joined column with ``0``. -.. tabs:: - .. code-tab:: r R +.. 
note:: - library(h2o) - h2o.init() - - # Create two simple, two-column R data frames by inputting values, - # ensuring that both have a common column (in this case, "fruit"). - left <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'blueberry'), - color = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue')) - right <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'watermelon'), - citrus = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE)) - - # Create the H2O data frames from the inputted data. - left_frame <- as.h2o(left) - print(left_frame) - fruit color - 1 apple red - 2 orange orange - 3 banana yellow - 4 lemon yellow - 5 strawberry red - 6 blueberry blue - - [6 rows x 2 columns] - - right_frame <- as.h2o(right) - print(right_frame) - fruit citrus - 1 apple FALSE - 2 orange TRUE - 3 banana FALSE - 4 lemon TRUE - 5 strawberry FALSE - 6 watermelon FALSE - - [6 rows x 2 columns] - - # Combine the l.hex and r.hex datasets into a single dataset. - # The columns from r.hex will be appended to the right side of the final dataset. - # In addition, because both datasets include a "fruit" column, H2O will append the - # second "fruit" column name with "0". Note that this is different than ``merge``, - # which combines data from two commonly named columns in two datasets. - - columns <- h2o.cbind(left_frame, right_frame) - print(columns) - fruit color fruit0 citrus - 1 apple red apple FALSE - 2 orange orange orange TRUE - 3 banana yellow banana FALSE - 4 lemon yellow lemon TRUE - 5 strawberry red strawberry FALSE - 6 blueberry blue watermelon FALSE - - [6 rows x 4 columns] + The two datasets you are combining columns from must have the same number of rows. - +.. tabs:: .. code-tab:: python import h2o @@ -122,4 +70,59 @@ The ``cbind`` function allows you to combine datasets by adding columns from one -0.881062 -0.897391 0.980548 -0.266982 0.363517 0.465146 [10 rows x 6 columns] + + .. code-tab:: r R + library(h2o) + h2o.init() + + # Create two simple, two-column R data frames by inputting values, + # ensuring that both have a common column (in this case, "fruit"). + left <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'blueberry'), + color = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue')) + right <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'watermelon'), + citrus = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE)) + + # Create the H2O data frames from the inputted data. + left_frame <- as.h2o(left) + print(left_frame) + fruit color + 1 apple red + 2 orange orange + 3 banana yellow + 4 lemon yellow + 5 strawberry red + 6 blueberry blue + + [6 rows x 2 columns] + + right_frame <- as.h2o(right) + print(right_frame) + fruit citrus + 1 apple FALSE + 2 orange TRUE + 3 banana FALSE + 4 lemon TRUE + 5 strawberry FALSE + 6 watermelon FALSE + + [6 rows x 2 columns] + + # Combine the l.hex and r.hex datasets into a single dataset. + # The columns from r.hex will be appended to the right side of the final dataset. + # In addition, because both datasets include a "fruit" column, H2O will append the + # second "fruit" column name with "0". Note that this is different than ``merge``, + # which combines data from two commonly named columns in two datasets. 
+ + columns <- h2o.cbind(left_frame, right_frame) + print(columns) + fruit color fruit0 citrus + 1 apple red apple FALSE + 2 orange orange orange TRUE + 3 banana yellow banana FALSE + 4 lemon yellow lemon TRUE + 5 strawberry red strawberry FALSE + 6 blueberry blue watermelon FALSE + + [6 rows x 4 columns] + diff --git a/h2o-docs/src/product/data-munging/combining-rows.rst b/h2o-docs/src/product/data-munging/combining-rows.rst index 5d63d5e34d75..d673da03b8cc 100644 --- a/h2o-docs/src/product/data-munging/combining-rows.rst +++ b/h2o-docs/src/product/data-munging/combining-rows.rst @@ -1,34 +1,13 @@ -Combining Rows from Two Datasets -------------------------------- +Combine rows from two datasets +============================== You can use the ``rbind`` function to combine two similar datasets into a single large dataset. This can be used, for example, to create a larger dataset by combining data from a validation dataset with its training or testing dataset. -Note that when using ``rbind``, the two datasets must have the same set of columns. +.. note:: + + When using ``rbind``, the two datasets must have the same set of columns. .. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Import an existing training dataset - ecg1_path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv" - ecg1 <- h2o.importFile(path = ecg1_path) - print(dim(ecg1)) - [1] 20 210 - - # Import an existing testing dataset - ecg2_path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv" - ecg2 <- h2o.importFile(path = ecg2_path) - print(dim(ecg2)) - [1] 23 210 - - # Combine the two datasets into a single, larger dataset - ecg_combine <- h2o.rbind(ecg1, ecg2) - print(dim(ecgCombine)) - [1] 43 210 - - .. code-tab:: python import h2o @@ -89,3 +68,25 @@ Note that when using ``rbind``, the two datasets must have the same set of colum 0.752855 -0.168504 -0.750161 -2.46084 [200 rows x 4 columns] + + .. code-tab:: r R + + library(h2o) + h2o.init() + + # Import an existing training dataset + ecg1_path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv" + ecg1 <- h2o.importFile(path = ecg1_path) + print(dim(ecg1)) + [1] 20 210 + + # Import an existing testing dataset + ecg2_path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv" + ecg2 <- h2o.importFile(path = ecg2_path) + print(dim(ecg2)) + [1] 23 210 + + # Combine the two datasets into a single, larger dataset + ecg_combine <- h2o.rbind(ecg1, ecg2) + print(dim(ecg_combine)) + [1] 43 210 diff --git a/h2o-docs/src/product/data-munging/downloading-data.rst b/h2o-docs/src/product/data-munging/downloading-data.rst index 3895c79d4faf..84d7aca07f78 100644 --- a/h2o-docs/src/product/data-munging/downloading-data.rst +++ b/h2o-docs/src/product/data-munging/downloading-data.rst @@ -1,23 +1,18 @@ -Downloading data ---------------- +Download data +============= -Sometimes it is desirable to download data from the H2O cluster. For example, when computing model predictions, it might be desirable to save these predictions for later. +Sometimes you need to download data from the H2O-3 cluster. For example, when computing model predictions, you might want to save these predictions for later. 
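The paragraph above mentions saving model predictions, but the examples that follow download only a raw frame. A minimal, hypothetical Python sketch of that predictions workflow (the GBM model and the iris dataset are illustrative choices, not part of the documented example):

```python
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()
iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")

# Train any model; a small GBM is used here purely for illustration.
model = H2OGradientBoostingEstimator(ntrees=10)
model.train(x=iris.columns[:-1], y="class", training_frame=iris)

# Predictions live on the cluster as an H2OFrame; pull them into client memory...
preds = model.predict(iris)
preds_df = preds.as_data_frame(use_pandas=True)

# ...or write them out on the server side (see "Save to a file system" below).
h2o.export_file(preds, path="/tmp/iris_predictions.csv", force=True)
```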
Download to local memory -~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------ -H2O has functions like ``as_data_frame`` and ``get_frame_data`` in Python and ``as.data.frame`` in R that that allow you to download the data directly into the client program memory. +H2O-3 has functions like ``as_data_frame`` and ``get_frame_data`` in Python and ``as.data.frame`` in R that let you download the data directly into the client program memory. -**Note**: For very large data this might not be feasible since the whole frame is downloaded as CSV into the client program memory. +.. note:: + + For very large data this might not be feasible since the whole frame is downloaded as CSV into the client program memory. .. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - iris.hex <- h2o.importFile("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") - iris.df <- as.data.frame(iris.hex) - .. code-tab:: python import h2o @@ -26,23 +21,20 @@ H2O has functions like ``as_data_frame`` and ``get_frame_data`` in Python and `` iris_csv_string = iris_hex.get_frame_data() iris_pd = iris_hex.as_data_frame(use_pandas=True) -Save to a file system -~~~~~~~~~~~~~~~~~~~~~ - -The export file function can be used to save the data to an arbitrary location. The location has to be one that the server has access to, so either the server filesystem or a distributed filesystem like HDFS or S3. This function can save the data in either CSV format (default) or Parquet format. - -.. tabs:: .. code-tab:: r R - + library(h2o) h2o.init() iris.hex <- h2o.importFile("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") - h2o.exportFile(iris.hex, path = "hdfs://path/in/hdfs/iris.csv") - - # To save as a parquet: - path <- "file:///tmp/prostate.parquet" - h2o.exportFile(iris.hex, path, format="parquet") + iris.df <- as.data.frame(iris.hex) + +Save to a file system +--------------------- + +The export file function can be used to save the data to an arbitrary location. The location has to be one that the server has access to, so either the server filesystem or a distributed filesystem like HDFS or S3. This function can save the data in either CSV format (default) or Parquet format. + +.. tabs:: .. code-tab:: python import h2o @@ -54,19 +46,24 @@ The export file function can be used to save the data to an arbitrary location. path = "file:///tmp/iris.parquet" h2o.export_file(iris_hex, path, format="parquet") -Save as a Hive table -~~~~~~~~~~~~~~~~~~~~ - -When running on Hadoop, H2O can also export data into Hive tables. In order to do so, the user running the H2O cluster must have the privileges to create new Hive tables. The user can specify the table name and storage format (currently supported are ``csv`` and ``parquet``) as well as table location for external tables. - -.. tabs:: .. code-tab:: r R library(h2o) h2o.init() iris.hex <- h2o.importFile("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") - h2o.save_to_hive(iris.hex, jdbc_url = "jdbc:hive2://hive-server:10000/default", table_name = "airlines") + h2o.exportFile(iris.hex, path = "hdfs://path/in/hdfs/iris.csv") + # To save as a parquet: + path <- "file:///tmp/prostate.parquet" + h2o.exportFile(iris.hex, path, format="parquet") + + +Save as a Hive table +-------------------- + +When running on Hadoop, H2O-3 can also export data into Hive tables. In order to do so, you must have the privileges to create new Hive tables. 
You can specify the table name and storage format (currently supported are ``csv`` and ``parquet``) as well as table location for external tables. + +.. tabs:: .. code-tab:: python import h2o @@ -77,6 +74,16 @@ When running on Hadoop, H2O can also export data into Hive tables. In order to d table_name = "airlines", format = "parquet", table_path = "/user/bob/tables/iris" - ) + ) + + .. code-tab:: r R + + library(h2o) + h2o.init() + iris.hex <- h2o.importFile("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") + h2o.save_to_hive(iris.hex, jdbc_url = "jdbc:hive2://hive-server:10000/default", table_name = "airlines") + -**Note:** Provided JDBC URL must include necessary authentication details, for example when running on a kerberized Hadoop cluster some form of ``auth`` parameter must be used in the URL. +.. note:: + + The provided JDBC URL must include the necessary authentication details. For example, when running on a Kerberized Hadoop cluster, some form of ``auth`` parameter must be used in the URL. diff --git a/h2o-docs/src/product/data-munging/fillnas.rst b/h2o-docs/src/product/data-munging/fillnas.rst index 08fdb2d8866e..37bbeeb13aba 100644 --- a/h2o-docs/src/product/data-munging/fillnas.rst +++ b/h2o-docs/src/product/data-munging/fillnas.rst @@ -1,45 +1,13 @@ -Fill NAs --------- +Fill NA values +============== -Use this function to fill in NA values in a sequential manner up to a specified limit. When using this function, you will specify whether the method to fill the NAs should go forward (default) or backward, whether the NAs should be filled along rows (default) or columns, and the maximum number of consecutive NAs to fill (defaults to 1). +Use this function to fill in NA values in a sequential manner up to a specified limit. When using this function, you will specify the following: -.. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Create a random data frame with 6 rows and 2 columns. - # Specify that no more than 70% of the values are NAs. - fr_with_nas = h2o.createFrame(categorical_fraction = 0.0, - missing_fraction = 0.7, - rows = 6, - cols = 2, - seed = 123) - fr_with_nas - C1 C2 - 1 NaN NaN - 2 -77.10471 -93.64087 - 3 -13.65926 57.44389 - 4 NaN NaN - 5 39.10130 NaN - 6 NaN 55.43136 - - [6 rows x 2 columns] - - # Forward fill a row. In R, the values for axis are 1 (row-wise) and 2 (column-wise) - fr <- h2o.fillna(fr_with_nas, "forward", axis = 1, maxlen = 1L) - fr - C1 C2 - 1 NaN NaN - 2 -77.10471 -93.64087 - 3 -13.65926 57.44389 - 4 NaN NaN - 5 39.10130 39.10130 - 6 NaN 55.43136 - - [6 rows x 2 columns] +- Whether the method to fill the NA values should go forward (default) or backward. +- Whether the NA values should be filled along rows (default) or columns. +- The maximum number of consecutive NA values to fill (defaults to 1). +.. tabs:: .. code-tab:: python import h2o @@ -89,3 +57,38 @@ Use this function to fill in NA values in a sequential manner up to a specified [10 rows x 3 columns] + .. code-tab:: r R + + library(h2o) + h2o.init() + + # Create a random data frame with 6 rows and 2 columns. + # Specify that no more than 70% of the values are NAs. + fr_with_nas = h2o.createFrame(categorical_fraction = 0.0, + missing_fraction = 0.7, + rows = 6, + cols = 2, + seed = 123) + fr_with_nas + C1 C2 + 1 NaN NaN + 2 -77.10471 -93.64087 + 3 -13.65926 57.44389 + 4 NaN NaN + 5 39.10130 NaN + 6 NaN 55.43136 + + [6 rows x 2 columns] + + # Forward fill a row. 
In R, the values for axis are 1 (row-wise) and 2 (column-wise) + fr <- h2o.fillna(fr_with_nas, "forward", axis = 1, maxlen = 1L) + fr + C1 C2 + 1 NaN NaN + 2 -77.10471 -93.64087 + 3 -13.65926 57.44389 + 4 NaN NaN + 5 39.10130 39.10130 + 6 NaN 55.43136 + + [6 rows x 2 columns] diff --git a/h2o-docs/src/product/data-munging/groupby.rst b/h2o-docs/src/product/data-munging/groupby.rst index 8c5ad8b570c0..f87e278a685e 100644 --- a/h2o-docs/src/product/data-munging/groupby.rst +++ b/h2o-docs/src/product/data-munging/groupby.rst @@ -1,40 +1,52 @@ -Group By --------- +Group by +======== -The ``group_by`` function allows you to group one or more columns and apply a function to the result. Specifically, the ``group_by`` function performs the following actions on an H2O Frame: +The ``group_by`` function lets you group one or more columns and apply a function to the result. Specifically, the ``group_by`` function performs the following actions on an H2O Frame: -- splits the data into groups based on some criteria -- applies a function to each group independently -- combines the results into an H2OFrame +1. Splits the data into groups based on some criteria. +2. Applies a function to each group independently. +3. Combines the results into an H2OFrame. The result is a new H2OFrame with columns equivalent to the number of groups created. The returned groups are sorted by the natural group-by column sort. +Group by parameters +------------------- + The ``group_by`` function accepts the following parameters: -**Python and R** +Python and R +~~~~~~~~~~~~ - - H2O Frame: This specifies the H2OFrame that you want the group by operation to be performed on. - ``by``: The ``by`` option can take a list of columns if you want to group by more than one column to compute the summary. + - ``H2OFrame``: This specifies the H2OFrame that you want the group by operation to be performed on. -**Python Only** +Python only +~~~~~~~~~~~ -- ``na``, which controls treatment of NA values during the calculation. It can be one of: +- ``na``: This option controls the treatment of NA values during the calculation. It can be one of: - - ``all`` (default): any NAs are used in the calculation as-is; which usually results in the final result being NA too. + - ``all`` (default): Any NA values are used in the calculation as-is (which usually results in the final result being NA, too). - ``ignore``: NA entries are not included in calculations, but the total number of entries is taken as the total number of rows. For example, ``mean([1, 2, 3, nan], na="ignore")`` will produce ``1.5``. - - ``rm``: entries are skipped during the calculations, reducing the total effective count of entries. For example, ``mean([1, 2, 3, nan], na="rm")`` will produce ``2``. + - ``rm``: NA entries are skipped during the calculations, reducing the total effective count of entries. For example, ``mean([1, 2, 3, nan], na="rm")`` will produce ``2``. -**R Only** +R only +~~~~~~ - - ``gb.control``: In R, the ``gb.control`` option specifies how to handle NA values in the dataset as well as how to name output columns. Note that to specify a list of column names in the ``gb.control`` list, you must add the ``col.names`` argument. - - ``nrow``: Specify the name of the generated column. - - ``na.methods``, which controls treatment of NA values during the calculation. It can be one of: +- ``gb.control``: In R, the ``gb.control`` option specifies how to handle NA values in the dataset as well as how to name output columns. 
Note that to specify a list of column names in the ``gb.control`` list, you must add the ``col.names`` argument. +- ``na.methods``: This option controls the treatment of NA values during the calculation. It can be one of: - - ``all`` (default): any NAs are used in the calculation as-is; which usually results in the final result being NA too. + - ``all`` (default): Any NA values are used in the calculation as-is (which usually results in the final result being NA, too). - ``ignore``: NA entries are not included in calculations, but the total number of entries is taken as the total number of rows. For example, ``mean([1, 2, 3, nan], na="ignore")`` will produce ``1.5``. - - ``rm``: entries are skipped during the calculations, reducing the total effective count of entries. For example, ``mean([1, 2, 3, nan], na="rm")`` will produce ``2``. + - ``rm``: NA entries are skipped during the calculations, reducing the total effective count of entries. For example, ``mean([1, 2, 3, nan], na="rm")`` will produce ``2``. + +- ``nrow``: Specify the name of the generated column. - **Note**: If a list smaller than the number of columns groups is supplied, then the list will be padded by ``ignore``. +.. note:: + + If a list smaller than the number of column groups is supplied, then the list will be padded by ``ignore``. + +Aggregations +~~~~~~~~~~~~ In addition to the above parameters, any number of the following aggregations can be chained together in the ``group_by`` function: @@ -48,11 +60,80 @@ In addition to the above parameters, any number of the following aggregations ca - ``sum``: Calculate the sum of each column specified in ``col`` for each group of a GroupBy object. - ``var``: Calculate the variance of each column specified in ``col`` for each group of a GroupBy object. - If no arguments are given to the aggregation (e.g., ``max()`` in ``grouped.sum(col="X1", na="all").mean(col="X5", na="all").max()``), then it is assumed that the aggregation should apply to all columns except the GroupBy columns. +.. note:: + + If no arguments are given to the aggregation (e.g. ``max()`` in ``grouped.sum(col="X1", na="all").mean(col="X5", na="all").max()``), then it is assumed that the aggregation should apply to all columns except the GroupBy columns. -Note that once the aggregation operations are complete, calling the GroupBy object with a new set of aggregations will yield no effect. You must generate a new GroupBy object in order to apply a new aggregation on it. In addition, certain aggregations are only defined for numerical or categorical columns. An error will be thrown for calling aggregation on the wrong data types. +Once the aggregation operations are complete, calling the GroupBy object with a new set of aggregations will yield no effect. You must generate a new GroupBy object in order to apply a new aggregation on it. In addition, certain aggregations are only defined for numerical or categorical columns. An error will be thrown if an aggregation is called on the wrong data type. + +Examples +-------- + +The following examples in Python and R show how to find the months with the highest cancellation ratio using ``group_by``. .. tabs:: + .. 
code-tab:: python + + import h2o + h2o.init() + + # Upload the airlines dataset + air = h2o.import_file("https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv") + air.dim + [43978, 31] + + # Find number of flights by airport + origin_flights = air.group_by("Origin") + origin_flights.count() + origin_flights.get_frame() + Origin nrow + -------- ------ + ABE 59 + ABQ 876 + ACY 31 + ... + + # Find number of flights per month based on the origin + cols = ["Origin","Month"] + flights_by_origin_month = air.group_by(by=cols).count(na="all") + flights_by_origin_month.get_frame() + Origin Month nrow + -------- ------- ------ + ABE 1 59 + ABQ 1 846 + ABQ 10 30 + ... + + # Find months with the highest cancellation ratio + cancellation_by_month = air.group_by(by='Month').sum('Cancelled', na="all") + flights_by_month = air.group_by('Month').count(na="all") + cancelled = cancellation_by_month.get_frame()['sum_Cancelled'] + flights = flights_by_month.get_frame()['nrow'] + month_count = flights_by_month.get_frame()['Month'] + ratio = cancelled/flights + month_count.cbind(ratio) + Month sum_Cancelled + ------- --------------- + 1 0.0254175 + 10 0.00950475 + + [2 rows x 2 columns] + + # Use group_by with multiple columns. Summarize the destination, + # arrival delays, and departure delays for an origin + cols_1 = ['Origin', 'Dest', 'IsArrDelayed', 'IsDepDelayed'] + cols_2 = ["Dest", "IsArrDelayed", "IsDepDelayed"] + air[cols_1].group_by(by='Origin').sum(cols_2, na="ignore").get_frame() + Origin sum_Dest sum_IsDepDelayed sum_IsArrDelayed + -------- ---------- ------------------ ------------------ + ABE 5884 30 40 + ABQ 84505 370 545 + ACY 3131 7 9 + ALB 3646 50 49 + AMA 317 6 4 + ANC 100 1 0 + ... + .. code-tab:: r R library(h2o) @@ -134,64 +215,59 @@ Note that once the aggregation operations are complete, calling the GroupBy obje 5 AMA 317 4 6 6 ANC 100 0 1 - .. code-tab:: python +The following R code shows the by-variable options with ``gb.control``. - import h2o - h2o.init() +.. tabs:: - # Upload the airlines dataset - air = h2o.import_file("https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv") - air.dim - [43978, 31] + .. code-tab:: r R - # Find number of flights by airport - origin_flights = air.group_by("Origin") - origin_fights.count() - origin_fights.get_frame() - Origin nrow - -------- ------ - ABE 59 - ABQ 876 - ACY 31 - ... + # Import H2O-3: + library(h2o) + h2o.init() - # Find number of flights per month based on the origin - cols = ["Origin","Month"] - flights_by_origin_month = air.group_by(by=cols).count(na ="all") - flights_by_origin_month.get_frame() - Origin Month nrow - -------- ------- ------ - ABE 1 59 - ABQ 1 846 - ABQ 10 30 - ... 
+ # Import the airlines dataset: + airlines.hex <- h2o.importFile("https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv") - # Find months with the highest cancellation ratio - cancellation_by_month = air.group_by(by='Month').sum('Cancelled', na="all") - flights_by_month = air.group_by('Month').count(na="all") - cancelled = cancellation_by_month.get_frame()['sum_Cancelled'] - flights = flights_by_month.get_frame()['nrow'] - month_count = flights_by_month.get_frame()['Month'] - ratio = cancelled/flights - month_count.cbind(ratio) - Month sum_Cancelled - ------- --------------- - 1 0.0254175 - 10 0.00950475 + # View quantiles and histograms: + quantile(x = airlines.hex$ArrDelay, na.rm = TRUE) + h2o.hist(airlines.hex$ArrDelay) - [2 rows x 2 columns] + # Find the number of flights by airport: + originFlights <- h2o.group_by(data = airlines.hex, by = "Origin", nrow("Origin"), gb.control <- list(na.methods = "rm")) + originFlights.R <- as.data.frame(originFlights) - # Use group_by with multiple columns. Summarize the destination, - # arrival delays, and departure delays for an origin - cols_1 = ['Origin', 'Dest', 'IsArrDelayed', 'IsDepDelayed'] - cols_2 = ["Dest", "IsArrDelayed", "IsDepDelayed"] - air[cols_1].group_by(by='Origin').sum(cols_2, na="ignore").get_frame() - Origin sum_Dest sum_IsDepDelayed sum_IsArrDelayed - -------- ---------- ------------------ ------------------ - ABE 5884 30 40 - ABQ 84505 370 545 - ACY 3131 7 9 - ALB 3646 50 49 - AMA 317 6 4 - ANC 100 1 0 - ... + # Find the number of flights per month: + flightsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", nrow("Month"), gb.control <- list(na.methods = "rm")) + flightsByMonth.R <- as.data.frame(flightsByMonth) + + # Find months with the highest cancellation ratio: + which(colnames(airlines.hex)=="Cancelled") + cancellationsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", sum("Cancelled"), gb.control <- list(na.methods = "rm")) + cancellation_rate <- cancellationsByMonth$sum_Cancelled/flightsByMonth$nrow + rates_table <- h2o.cbind(flightsByMonth$Month, cancellation_rate) + rates_table.R <- as.data.frame(rates_table) + + # Construct test and train sets using sampling: + airlines.split <- h2o.splitFrame(data = airlines.hex, ratio = 0.85) + airlines.train <- airlines.split[[1]] + airlines.test <- airlines.split[[2]] + + # Display a summary using table-like functions: + h2o.table(airlines.train$Cancelled) + h2o.table(airlines.test$Cancelled) + + # Set the predictor and response variables: + Y <- "IsDepDelayed" + X <- c("Origin", "Dest", "DayofMonth", "Year", "UniqueCarrier", "DayOfWeek", "Month", "DepTime", "ArrTime", "Distance") + + # Define the data for the model and display the results: + airlines.glm <- h2o.glm(training_frame = airlines.train, x = X, y = Y, family = "binomial", alpha = 0.5) + + # View the model information (training statistics, performance, important variables): + summary(airlines.glm) + + # Predict using the GLM model: + pred <- h2o.predict(object = airlines.glm, newdata = airlines.test) + + # Look at the summary of predictions (probability of TRUE class p1): + summary(pred$p1) diff --git a/h2o-docs/src/product/data-munging/importing-data.rst b/h2o-docs/src/product/data-munging/importing-data.rst index 40809881d8e3..518ed01051b3 100644 --- a/h2o-docs/src/product/data-munging/importing-data.rst +++ b/h2o-docs/src/product/data-munging/importing-data.rst @@ -1,17 +1,34 @@ -Importing a File ----------------- +Import a file +============= -Unlike the `upload `__ function, which is a 
push from the client to the server, the import function is a parallelized reader and pulls information from the server from a location specified by the client. The path is a server-side path. This is a fast, scalable, highly optimized way to read data. H2O-3 pulls the data from a data store and initiates the data transfer as a read operation. -Refer to the `Supported File Formats `__ topic to ensure that you are using a supported file type. +`See more on supported file formats `__ to ensure that you are using a supported file type. -**Note**: When parsing a data file containing timestamps that do not include a timezone, the timestamps will be interpreted as UTC (GMT). You can override the parsing timezone using the following: +.. note:: + + When parsing a data file containing timestamps that do not include a timezone, the timestamps will be interpreted as UTC (GMT). You can override the parsing timezone using the following: - - R: ``h2o.setTimezone("America/Los Angeles")`` - - Python: ``h2o.cluster().timezone = "America/Los Angeles"`` + - **Python**: ``h2o.cluster().timezone = "America/Los Angeles"`` + - **R**: ``h2o.setTimezone("America/Los Angeles")`` .. tabs:: + .. code-tab:: python + + # Import a file from S3: + import h2o + h2o.init() + airlines = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip" + airlines_df = h2o.import_file(path=airlines) + + # To import a file from HDFS, you must include the node name: + import h2o + h2o.init() + airlines = "hdfs://node-1:/user/smalldata/airlines/allyears2k_headers.zip" + airlines_df = h2o.import_file(path=airlines) + .. code-tab:: r R + # To import the airlines file from H2O’s package: library(h2o) h2o.init() @@ -30,17 +47,5 @@ Refer to the `Supported File Formats - 2 apple red FALSE - 3 banana yellow FALSE - 4 lemon yellow TRUE - 5 orange orange TRUE - 6 strawberry red FALSE - - [6 rows x 3 columns] - .. code-tab:: python import h2o @@ -133,4 +85,55 @@ Note that in order for a merge to work in multinode clusters, one of the dataset 3 To [21 rows x 2 columns] + + .. code-tab:: r R + + # Currently, this function only supports `all.x = TRUE`. All other permutations will fail. + library(h2o) + h2o.init() + + # Create two simple, two-column R data frames by inputting values, ensuring that both have a common column (in this case, "fruit"). + left <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'blueberry'), + color = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue')) + right <- data.frame(fruit = c('apple', 'orange', 'banana', 'lemon', 'strawberry', 'watermelon'), + citrus = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE)) + + # Create the H2O data frames from the inputted data. + left_frame <- as.h2o(left) + print(left_frame) + fruit color + 1 apple red + 2 orange orange + 3 banana yellow + 4 lemon yellow + 5 strawberry red + 6 blueberry blue + + [6 rows x 2 columns] + + right_frame <- as.h2o(right) + print(right_frame) + fruit citrus + 1 apple FALSE + 2 orange TRUE + 3 banana FALSE + 4 lemon TRUE + 5 strawberry FALSE + 6 watermelon FALSE + + [6 rows x 2 columns] + + # Merge the data frames. 
The result is a single dataset with three columns. + new_frame <- h2o.merge(left_frame, right_frame, all.x = TRUE) + print(new_frame) + fruit color citrus + 1 blueberry blue + 2 apple red FALSE + 3 banana yellow FALSE + 4 lemon yellow TRUE + 5 orange orange TRUE + 6 strawberry red FALSE + + [6 rows x 3 columns] + diff --git a/h2o-docs/src/product/data-munging/pivot.rst b/h2o-docs/src/product/data-munging/pivot.rst index 3d0c22b6fa1f..2ef632c685c8 100644 --- a/h2o-docs/src/product/data-munging/pivot.rst +++ b/h2o-docs/src/product/data-munging/pivot.rst @@ -1,49 +1,20 @@ -Pivoting Tables ---------------- +Pivot tables +============ -Use this function to pivot tables. This is performed by designating three columns: index, column, and value. Index is the column where pivoted rows should be aligned on; column represents the column to pivot; and value specifies the values of the pivoted table. For cases with multiple indexes for a column label, the aggregation method is to pick the first occurrence in the data frame. +Use this function to pivot tables. This is performed by designating three columns: index, column, and value. -**Notes**: +- Index is the column where pivoted rows should be aligned on. +- Column represents the column to pivot. +- Value specifies the values of the pivoted table. - - All rows of a single index value must fit on one node. +For cases with multiple indexes for a column label, the aggregation method is to pick the first occurrence in the data frame. - - The maximum rows for a single index value and column label is ``Chunk size * Chunk size``. +.. note:: -.. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Create a simple data frame by inputting values - data <- data.frame(colorID = c('1', '2', '3', '3', '1', '4'), - value = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue'), - amount = c('4', '2', '4', '3', '6', '3')) - df <- as.h2o(data) - - # View the dataset - df - colorID value amount - 1 1 red 4 - 2 2 orange 2 - 3 3 yellow 4 - 4 3 yellow 3 - 5 1 red 6 - 6 4 blue 3 - - [6 rows x 3 columns] - - # Pivot the table on the colorID column and aligned on the amount column - df2 <- h2o.pivot(df, index = "amount", column = "colorID", value = "value") - df2 - amount 1 2 3 4 - 1 2 NaN 1 NaN NaN - 2 3 NaN NaN 3 0 - 3 4 2 NaN 3 NaN - 4 6 2 NaN NaN NaN - - [4 rows x 5 columns] + - All rows of a single index value must fit on one node. + - The maximum rows for a single index value and column label is ``Chunk size * Chunk size``. +.. tabs:: .. code-tab:: python import h2o @@ -78,3 +49,38 @@ Use this function to pivot tables. This is performed by designating three column 6 2 nan nan nan [4 rows x 5 columns] + + .. 
code-tab:: r R + + library(h2o) + h2o.init() + + # Create a simple data frame by inputting values + data <- data.frame(colorID = c('1', '2', '3', '3', '1', '4'), + value = c('red', 'orange', 'yellow', 'yellow', 'red', 'blue'), + amount = c('4', '2', '4', '3', '6', '3')) + df <- as.h2o(data) + + # View the dataset + df + colorID value amount + 1 1 red 4 + 2 2 orange 2 + 3 3 yellow 4 + 4 3 yellow 3 + 5 1 red 6 + 6 4 blue 3 + + [6 rows x 3 columns] + + # Pivot the table on the colorID column and aligned on the amount column + df2 <- h2o.pivot(df, index = "amount", column = "colorID", value = "value") + df2 + amount 1 2 3 4 + 1 2 NaN 1 NaN NaN + 2 3 NaN NaN 3 0 + 3 4 2 NaN 3 NaN + 4 6 2 NaN NaN NaN + + [4 rows x 5 columns] + diff --git a/h2o-docs/src/product/data-munging/replacing-values.rst b/h2o-docs/src/product/data-munging/replacing-values.rst index eddc9f820415..f1e878cadcb8 100644 --- a/h2o-docs/src/product/data-munging/replacing-values.rst +++ b/h2o-docs/src/product/data-munging/replacing-values.rst @@ -1,9 +1,39 @@ -Replacing Values in a Frame ---------------------------- +Replace values in a frame +========================= -This example shows how to replace numeric values in a frame of data. Note that it is currently not possible to replace categorical value in a column. +This example shows how to replace numeric values in a frame of data. + +.. note:: + + It is not possible to replace a categorical value in a column. .. tabs:: + .. code-tab:: python + + import h2o + h2o.init() + path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" + df = h2o.import_file(path=path) + + # Replace a single numerical datum. Note that columns and rows start at 0, + # so in the example below, the value in the 15th row and 3rd column will be set to 2.0. + df[14,2] = 2.0 + + # Replace a whole column. The example below multiplies all values in the first column by 3. + df[0] = 3*df[0] + + # Replace by row mask. The example below searches for value less than 4.6 in the + # sepal_len column and replaces those values with 4.6. + df[df["sepal_len"] < 4.6, "sepal_len"] = 4.6 + + # Replace using ifelse. Similar to the previous example, this replaces values less than 4.6 with 4.6. + df["sepal_len"] = (df["sepal_len"] < 4.6).ifelse(4.6, df["sepal_len"]) + + # Replace missing values with 0. + df[df["sepal_len"].isna(), "sepal_len"] = 0 + + # Alternative with ifelse. Note the parentheses. + .. code-tab:: r R library(h2o) @@ -34,29 +64,4 @@ This example shows how to replace numeric values in a frame of data. Note that i # Alternative with ifelse df[, "sepal_len"] <- h2o.ifelse(is.na(df[, "sepal_len"]), 0, df[, "sepal_len"]) - .. code-tab:: python - - import h2o - h2o.init() - path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" - df = h2o.import_file(path=path) - - # Replace a single numerical datum. Note that columns and rows start at 0. - # so in the example below, the value in the 15th row and 3rd column will be set to 2.0. - df[14,2] = 2.0 - - # Replace a whole column. The example below multiplies all values in the first column by 3. - df[0] = 3*df[0] - - # Replace by row mask. The example below searches for value less than 4.6 in the - # sepal_len column and replaces those values with 4.6. - df[df["sepal_len"] < 4.6, "sepal_len"] = 4.6 - - # Replace using ifelse. Similar to the previous example, this replaces values less than 4.6 with 4.6. - df["sepal_len"] = (df["sepal_len"] < 4.6).ifelse(4.6, df["sepal_len"]) - - # Replace missing values with 0. 
- df[df["sepal_len"].isna(), "sepal_len"] = 0 - - # Alternative with ifelse. Note the parantheses. df["sepal_len"] = (df["sepal_len"].isna()).ifelse(0, df["sepal_len"]) diff --git a/h2o-docs/src/product/data-munging/slicing-columns.rst b/h2o-docs/src/product/data-munging/slicing-columns.rst index 162309c29af9..af3c1a40f51a 100644 --- a/h2o-docs/src/product/data-munging/slicing-columns.rst +++ b/h2o-docs/src/product/data-munging/slicing-columns.rst @@ -1,80 +1,9 @@ -Slicing Columns ---------------- +Slice columns +============= -H2O lazily slices out columns of data and will only materialize a shared copy upon some type of triggering IO. This example shows how to slice columns from a frame of data. +H2O-3 lazily slices out columns of data and will only materialize a shared copy upon some type of triggering IO. This example shows how to slice columns from a frame of data. .. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Import the iris with headers dataset - path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" - df <- h2o.importFile(path) - print(df) - sepal_len sepal_wid petal_len petal_wid class - 1 5.1 3.5 1.4 0.2 Iris-setosa - 2 4.9 3.0 1.4 0.2 Iris-setosa - 3 4.7 3.2 1.3 0.2 Iris-setosa - 4 4.6 3.1 1.5 0.2 Iris-setosa - 5 5.0 3.6 1.4 0.2 Iris-setosa - 6 5.4 3.9 1.7 0.4 Iris-setosa - - [150 rows x 5 columns] - - # Slice 1 column by index - c1 <- df[, 1] - print(c1) - sepal_len - 1 5.1 - 2 4.9 - 3 4.7 - 4 4.6 - 5 5.0 - 6 5.4 - - [150 rows x 1 column] - - # Slice 1 column by name - c1_1 <- df[, "petal_len"] - print(c1_1) - petal_len - 1 1.4 - 2 1.4 - 3 1.3 - 4 1.5 - 5 1.4 - 6 1.7 - - [150 rows x 1 column] - - # Slice cols by vector of indexes - cols <- df[, 1:4] - print(cols) - sepal_len sepal_wid petal_len petal_wid - 1 5.1 3.5 1.4 0.2 - 2 4.9 3.0 1.4 0.2 - 3 4.7 3.2 1.3 0.2 - 4 4.6 3.1 1.5 0.2 - 5 5.0 3.6 1.4 0.2 - 6 5.4 3.9 1.7 0.4 - - [150 rows x 4 columns] - - # Slice cols by vector of names - cols_1 <- df[, c("sepal_len", "sepal_wid", "petal_len", "petal_wid")] - print(cols_1) - sepal_len sepal_wid petal_len petal_wid - 1 5.1 3.5 1.4 0.2 - 2 4.9 3.0 1.4 0.2 - 3 4.7 3.2 1.3 0.2 - 4 4.6 3.1 1.5 0.2 - 5 5.0 3.6 1.4 0.2 - 6 5.4 3.9 1.7 0.4 - - [150 rows x 4 columns] - .. code-tab:: python import h2o @@ -158,3 +87,75 @@ H2O lazily slices out columns of data and will only materialize a shared copy up 3.1 1.5 0.1 [150 rows x 3 columns] + + .. 
code-tab:: r R + + library(h2o) + h2o.init() + + # Import the iris with headers dataset + path <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" + df <- h2o.importFile(path) + print(df) + sepal_len sepal_wid petal_len petal_wid class + 1 5.1 3.5 1.4 0.2 Iris-setosa + 2 4.9 3.0 1.4 0.2 Iris-setosa + 3 4.7 3.2 1.3 0.2 Iris-setosa + 4 4.6 3.1 1.5 0.2 Iris-setosa + 5 5.0 3.6 1.4 0.2 Iris-setosa + 6 5.4 3.9 1.7 0.4 Iris-setosa + + [150 rows x 5 columns] + + # Slice 1 column by index + c1 <- df[, 1] + print(c1) + sepal_len + 1 5.1 + 2 4.9 + 3 4.7 + 4 4.6 + 5 5.0 + 6 5.4 + + [150 rows x 1 column] + + # Slice 1 column by name + c1_1 <- df[, "petal_len"] + print(c1_1) + petal_len + 1 1.4 + 2 1.4 + 3 1.3 + 4 1.5 + 5 1.4 + 6 1.7 + + [150 rows x 1 column] + + # Slice cols by vector of indexes + cols <- df[, 1:4] + print(cols) + sepal_len sepal_wid petal_len petal_wid + 1 5.1 3.5 1.4 0.2 + 2 4.9 3.0 1.4 0.2 + 3 4.7 3.2 1.3 0.2 + 4 4.6 3.1 1.5 0.2 + 5 5.0 3.6 1.4 0.2 + 6 5.4 3.9 1.7 0.4 + + [150 rows x 4 columns] + + # Slice cols by vector of names + cols_1 <- df[, c("sepal_len", "sepal_wid", "petal_len", "petal_wid")] + print(cols_1) + sepal_len sepal_wid petal_len petal_wid + 1 5.1 3.5 1.4 0.2 + 2 4.9 3.0 1.4 0.2 + 3 4.7 3.2 1.3 0.2 + 4 4.6 3.1 1.5 0.2 + 5 5.0 3.6 1.4 0.2 + 6 5.4 3.9 1.7 0.4 + + [150 rows x 4 columns] + diff --git a/h2o-docs/src/product/data-munging/slicing-rows.rst b/h2o-docs/src/product/data-munging/slicing-rows.rst index 76278e689a19..87338e0a507c 100644 --- a/h2o-docs/src/product/data-munging/slicing-rows.rst +++ b/h2o-docs/src/product/data-munging/slicing-rows.rst @@ -1,9 +1,54 @@ -Slicing Rows ------------- +Slice rows ========== -H2O lazily slices out rows of data and will only materialize a shared copy upon IO. This example shows how to slice rows from a frame of data. +H2O-3 lazily slices out rows of data and will only materialize a shared copy upon IO. This example shows how to slice rows from a frame of data. .. tabs:: + .. code-tab:: python + + import h2o + h2o.init() + + # Import the iris with headers dataset + path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" + df = h2o.import_file(path=path) + + # Slice 1 row by index + c1 = df[15,:] + c1.describe() + + # Slice a range of rows + c1_1 = df[range(25,50,1),:] + c1_1.describe() + + # Slice using a boolean mask. The output dataset will include rows with a sepal length + # less than 4.6. + mask = df["sepal_len"] < 4.6 + cols = df[mask,:] + cols.describe() + + # Filter out rows that contain missing values in a column. Note the use of '~' to + # perform a logical not. + mask = df["sepal_len"].isna() + cols = df[~mask,:] + cols.describe() + sepal_len sepal_wid petal_len petal_wid class + ---------- ---------- ---------- ----------- ----------- + 5.1 3.5 1.4 0.2 Iris-setosa + 4.9 3 1.4 0.2 Iris-setosa + 4.7 3.2 1.3 0.2 Iris-setosa + 4.6 3.1 1.5 0.2 Iris-setosa + 5 3.6 1.4 0.2 Iris-setosa + 5.4 3.9 1.7 0.4 Iris-setosa + 4.6 3.4 1.4 0.3 Iris-setosa + 5 3.4 1.5 0.2 Iris-setosa + 4.4 2.9 1.4 0.2 Iris-setosa + 4.9 3.1 1.5 0.1 Iris-setosa + + + + [150 rows x 5 columns] + .. code-tab:: r R library(h2o) @@ -59,49 +104,5 @@ H2O lazily slices out rows of data and will only materialize a shared copy upon [150 rows x 5 columns] - .. 
code-tab:: python - - import h2o - h2o.init() - - # Import the iris with headers dataset - path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv" - df = h2o.import_file(path=path) - - # Slice 1 row by index - c1 = df[15,:] - c1.describe - - # Slice a range of rows - c1_1 = df[range(25,50,1),:] - c1_1.describe - - # Slice using a boolean mask. The output dataset will include rows with a sepal length - # less than 4.6. - mask = df["sepal_len"] < 4.6 - cols = df[mask,:] - cols.describe - - # Filter out rows that contain missing values in a column. Note the use of '~' to - # perform a logical not. - mask = df["sepal_len"].isna() - cols = df[~mask,:] - cols.describe - sepal_len sepal_wid petal_len petal_wid clas - ---------- ---------- ---------- ----------- ----------- - 5.1 3.5 1.4 0.2 Iris-setosa - 4.9 3 1.4 0.2 Iris-setosa - 4.7 3.2 1.3 0.2 Iris-setosa - 4.6 3.1 1.5 0.2 Iris-setosa - 5 3.6 1.4 0.2 Iris-setosa - 5.4 3.9 1.7 0.4 Iris-setosa - 4.6 3.4 1.4 0.3 Iris-setosa - 5 3.4 1.5 0.2 Iris-setosa - 4.4 2.9 1.4 0.2 Iris-setosa - 4.9 3.1 1.5 0.1 Iris-setosa - - - - [150 rows x 3 columns] diff --git a/h2o-docs/src/product/data-munging/sortcolumn.rst b/h2o-docs/src/product/data-munging/sortcolumn.rst index be9d4fbfb7c5..5c62c38fae5c 100644 --- a/h2o-docs/src/product/data-munging/sortcolumn.rst +++ b/h2o-docs/src/product/data-munging/sortcolumn.rst @@ -1,74 +1,16 @@ -Sorting Columns ---------------- +Sort columns +============ -Use the ``sort`` function in Python or the ``arrange`` function in R to create a new frame that is sorted by column(s) in ascending (default) or descending order. Note that when using ``sort``, the original frame cannot contain any string columns. +Use the ``sort`` function in Python or the ``arrange`` function in R to create a new frame that is sorted by column(s) in ascending (default) or descending order. -If only one column is specified in the sort, then the final results are sorted according to that one single column either in ascending (default) or in descending order. However, if you specify more than one column in the sort, then H2O performs as described below: +If only one column is specified in the sort, then the final results are sorted according to that one single column either in ascending (default) or in descending order. However, if you specify more than one column in the sort, then H2O-3 performs as described below: -Assuming two columns, X (first column) and Y (second column): - - - H2O will sort on the first specified column, so in the case of [0,1], the X column will be sorted first. Similarly, in the case of [1,0], the Y column will be sorted first. - - H2O will sort on subsequent columns in the order they are specified, but only on those rows that have the same values as the first sorted column. No sorting will be done on subsequent columns if the values are not also duplicated in the first sorted column. + Assuming two columns, X (first column) and Y (second column): + + - H2O-3 will sort on the first specified column, so in the case of [0,1], the X column will be sorted first. Similarly, in the case of [1,0], the Y column will be sorted first. + - H2O-3 will sort on subsequent columns in the order they are specified, but only on those rows that have the same values as the first sorted column. No sorting will be done on subsequent columns if the values are not also duplicated in the first sorted column. .. tabs:: - .. code-tab:: r R - - # Currently, this function only supports `all.x = TRUE`. 
All other permutations will fail. - library(h2o) - h2o.init() - - # Import the smallIntFloats dataset - X <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/synthetic/smallIntFloats.csv.zip") - X - C1 C10 - 1 68379 -1.618668e+07 - 2 67108864 3.276800e+04 - 3 32768 -8.709456e+08 - 4 32 1.310720e+05 - 5 268435456 -2.910033e+01 - 6 105383117 -2.397206e+08 - - [180000 rows x 2 columns] - - # Sort on the first column only in ascending order (default) - X_sorted1 <- h2o.arrange(X, C1) - X_sorted1 - C1 C10 - 1 -1073593184 7.474380e+05 - 2 -1073563127 -2.097152e+06 - 3 -1073521109 5.110769e+06 - 4 -1073416724 2.220942e+06 - 5 -1073361973 -5.707598e+00 - 6 -1073357712 -4.650334e+03 - - [180000 rows x 2 columns] - - # Sort on both columns in descending order, specifying to sort on C1 first - X_sorted2 <- h2o.arrange(X, desc(C1), desc(C10)) - X_sorted2 - C1 C10 - 1 1073593184 256.000000 - 2 1073521109 -128.000000 - 3 1073257966 15.616867 - 4 1073072648 1.884208 - 5 1072757094 441.816579 - 6 1072669626 -512.000000 - - [180000 rows x 2 columns] - - # Sort on the second column in descending order - X_sorted3 <- h2o.arrange(X, desc(C10)) - X_sorted3 - C1 C10 - 1 321417689 1073662860 - 2 448 1073574390 - 3 85 1073288384 - 4 -4096 1072908385 - 5 28 1072890306 - 6 -4194304 1072750253 - - [180000 rows x 2 columns] - .. code-tab:: python import h2o @@ -146,3 +88,62 @@ Assuming two columns, X (first column) and Y (second column): 55 1.07175e+09 [180000 rows x 2 columns] + + .. code-tab:: r R + + # Currently, this function only supports `all.x = TRUE`. All other permutations will fail. + library(h2o) + h2o.init() + + # Import the smallIntFloats dataset + X <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/synthetic/smallIntFloats.csv.zip") + X + C1 C10 + 1 68379 -1.618668e+07 + 2 67108864 3.276800e+04 + 3 32768 -8.709456e+08 + 4 32 1.310720e+05 + 5 268435456 -2.910033e+01 + 6 105383117 -2.397206e+08 + + [180000 rows x 2 columns] + + # Sort on the first column only in ascending order (default) + X_sorted1 <- h2o.arrange(X, C1) + X_sorted1 + C1 C10 + 1 -1073593184 7.474380e+05 + 2 -1073563127 -2.097152e+06 + 3 -1073521109 5.110769e+06 + 4 -1073416724 2.220942e+06 + 5 -1073361973 -5.707598e+00 + 6 -1073357712 -4.650334e+03 + + [180000 rows x 2 columns] + + # Sort on both columns in descending order, specifying to sort on C1 first + X_sorted2 <- h2o.arrange(X, desc(C1), desc(C10)) + X_sorted2 + C1 C10 + 1 1073593184 256.000000 + 2 1073521109 -128.000000 + 3 1073257966 15.616867 + 4 1073072648 1.884208 + 5 1072757094 441.816579 + 6 1072669626 -512.000000 + + [180000 rows x 2 columns] + + # Sort on the second column in descending order + X_sorted3 <- h2o.arrange(X, desc(C10)) + X_sorted3 + C1 C10 + 1 321417689 1073662860 + 2 448 1073574390 + 3 85 1073288384 + 4 -4096 1072908385 + 5 28 1072890306 + 6 -4194304 1072750253 + + [180000 rows x 2 columns] + diff --git a/h2o-docs/src/product/data-munging/splitting-datasets.rst b/h2o-docs/src/product/data-munging/splitting-datasets.rst index 2c1b8dfc9f03..6fb165cb6b58 100644 --- a/h2o-docs/src/product/data-munging/splitting-datasets.rst +++ b/h2o-docs/src/product/data-munging/splitting-datasets.rst @@ -1,11 +1,52 @@ -Splitting Datasets into Training/Testing/Validating ---------------------------------------------------- +Split datasets into training/testing/validating +=============================================== This example shows how to split a single dataset into two datasets, one used for training and the other used for 
testing. -Note that when splitting frames, H2O does not give an exact split. It's designed to be efficient on big data using a probabilistic splitting method rather than an exact split. For example, when specifying a 0.75/0.25 split, H2O will produce a test/train split with an expected value of 0.75/0.25 rather than exactly 0.75/0.25. On small datasets, the sizes of the resulting splits will deviate from the expected value more than on big data, where they will be very close to exact. +.. note:: + + When splitting frames, H2O-3 does not give an exact split. It's designed to be efficient on big data using a probabilistic splitting method rather than an exact split. + + For example, when specifying a 0.75/0.25 split, H2O-3 will produce a test/train split with an expected value of 0.75/0.25 rather than exactly 0.75/0.25. On small datasets, the sizes of the resulting splits will deviate from the expected value more than on big data, where they will be very close to exact. .. tabs:: + .. code-tab:: python + + import h2o + from h2o.estimators.glm import H2OGeneralizedLinearEstimator + h2o.init() + + # Import the prostate dataset + prostate = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv" + prostate_df = h2o.import_file(path=prostate) + + # Split the data into Train/Test/Validation with Train having 70% and test and validation 15% each + train,test,valid = prostate_df.split_frame(ratios=[.7, .15]) + + # Generate a GLM model using the training dataset + glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5) + glm_classifier.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "DCAPS"], training_frame=train) + + # Predict using the GLM model and the testing dataset + predict = glm_classifier.predict(test) + + # View a summary of the prediction + predict.head() + predict p0 p1 + --------- -------- -------- + 1 0.366189 0.633811 + 1 0.351269 0.648731 + 1 0.69012 0.30988 + 0 0.762335 0.237665 + 1 0.680127 0.319873 + 1 0.687736 0.312264 + 1 0.676753 0.323247 + 1 0.685876 0.314124 + 1 0.707027 0.292973 + 0 0.74706 0.25294 + + [10 rows x 3 columns] + .. code-tab:: r R library(h2o) @@ -50,39 +91,3 @@ Note that when splitting frames, H2O does not give an exact split. It's designed 3rd Qu.:0.4369 Max. :0.9989 - .. 
code-tab:: python
-
-      import h2o
-      from h2o.estimators.glm import H2OGeneralizedLinearEstimator
-      h2o.init()
-
-      # Import the prostate dataset
-      prostate = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv"
-      prostate_df = h2o.import_file(path=prostate)
-
-      # Split the data into Train/Test/Validation with Train having 70% and test and validation 15% each
-      train,test,valid = prostate_df.split_frame(ratios=[.7, .15])
-
-      # Generate a GLM model using the training dataset
-      glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5)
-      glm_classifier.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "DCAPS"], training_frame=train)
-
-      # Predict using the GLM model and the testing dataset
-      predict = glm_classifier.predict(test)
-
-      # View a summary of the prediction
-      predict.head()
-        predict        p0        p1
-      ---------  --------  --------
-              1  0.366189  0.633811
-              1  0.351269  0.648731
-              1  0.69012   0.30988
-              0  0.762335  0.237665
-              1  0.680127  0.319873
-              1  0.687736  0.312264
-              1  0.676753  0.323247
-              1  0.685876  0.314124
-              1  0.707027  0.292973
-              0  0.74706   0.25294
-
-      [10 rows x 3 columns]
diff --git a/h2o-docs/src/product/data-munging/tokenize.rst b/h2o-docs/src/product/data-munging/tokenize.rst
index 86fa3101d1d6..c94ba5e9868d 100644
--- a/h2o-docs/src/product/data-munging/tokenize.rst
+++ b/h2o-docs/src/product/data-munging/tokenize.rst
@@ -1,53 +1,19 @@
 .. _tokenize:
 
-Tokenize Strings
-~~~~~~~~~~~~~~~~
+Tokenize strings
+================
 
-A ``tokenize`` function is available in H2O-3, which converts strings into tokens, then stores the tokenized text into a single column, making it easier for additional processing.
+The ``tokenize`` function is available in H2O-3. This function converts strings into tokens, then stores the tokenized text in a single column, making it easier for additional processing.
 
-Simple Tokenize Example
-'''''''''''''''''''''''
+Tokenize example
+----------------
 
-Below is a simple example showing strings from frames tokenized into a single column. Refer to the following demos for a more extensive demo using tokenized text in Word2Vec:
+The following short example shows strings from frames tokenized into a single column. Refer to the following demos for a more extensive demo using tokenized text in Word2Vec:
 
-- Python: https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/word2vec_craigslistjobtitles.ipynb
-- R: https://github.com/h2oai/h2o-3/blob/master/h2o-r/demos/rdemo.word2vec.craigslistjobtitles.R
+- `Python tokenizing demo <https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/word2vec_craigslistjobtitles.ipynb>`__
+- `R tokenizing demo <https://github.com/h2oai/h2o-3/blob/master/h2o-r/demos/rdemo.word2vec.craigslistjobtitles.R>`__
 
 .. tabs::
-   .. code-tab:: r R
-
-      library(h2o)
-      h2o.init()
-
-      # Create four simple, single-column R data frames by inputting values.
-      s1 <- as.character(as.h2o(" this is a string "))
-      s2 <- as.character(as.h2o("this is another string"))
-      s3 <- as.character(as.h2o("this is a longer string"))
-      s4 <- as.character(as.h2o("this is tall, this is taller"))
-
-      # Combine the datasets into a single dataset.
-      ds <- h2o.rbind(s1, s2, s3, s4)
-      ds
-                                  C1
-      1             this is a string
-      2       this is another string
-      3      this is a longer string
-      4 this is tall, this is taller
-
-      # Tokenize the dataset.
-      # Notice that tokenized sentences are separated by <NA>.
-      tokenized <- h2o.tokenize(ds, " ")
-      tokenized
-            C1
-      1   <NA>
-      2   this
-      3     is
-      4      a
-      5 string
-      6   <NA>
-
-      [24 rows x 1 column]
    .. code-tab:: python
 
      import h2o
@@ -91,3 +57,38 @@ Below is a simple example showing strings from frames tokenized into a single co
      string
 
      [24 rows x 1 column]
+
+   .. code-tab:: r R
+
+      library(h2o)
+      h2o.init()
+
+      # Create four simple, single-column R data frames by inputting values.
+      s1 <- as.character(as.h2o(" this is a string "))
+      s2 <- as.character(as.h2o("this is another string"))
+      s3 <- as.character(as.h2o("this is a longer string"))
+      s4 <- as.character(as.h2o("this is tall, this is taller"))
+
+      # Combine the datasets into a single dataset.
+      ds <- h2o.rbind(s1, s2, s3, s4)
+      ds
+                                  C1
+      1             this is a string
+      2       this is another string
+      3      this is a longer string
+      4 this is tall, this is taller
+
+      # Tokenize the dataset.
+      # Notice that tokenized sentences are separated by <NA>.
+      tokenized <- h2o.tokenize(ds, " ")
+      tokenized
+            C1
+      1   <NA>
+      2   this
+      3     is
+      4      a
+      5 string
+      6   <NA>
+
+      [24 rows x 1 column]
+
diff --git a/h2o-docs/src/product/data-munging/uploading-data.rst b/h2o-docs/src/product/data-munging/uploading-data.rst
index ceaedc255b36..6e1f123ab379 100644
--- a/h2o-docs/src/product/data-munging/uploading-data.rst
+++ b/h2o-docs/src/product/data-munging/uploading-data.rst
@@ -1,27 +1,31 @@
-Uploading a File
-----------------
+Upload a file
+=============
 
-Unlike the import function, which is a parallelized reader, the upload function is a push from the client to the server. The specified path must be a client-side path. This is not scalable and is only intended for smaller data sizes. The client pushes the data from a local filesystem (for example, on your machine where R or Python is running) to H2O. For big-data operations, you don't want the data stored on or flowing through the client.
+Unlike the import function, which is a parallelized reader, the upload function is a push from the client to the server. The specified path must be a client-side path. This is not scalable and is only intended for smaller data sizes. The client pushes the data from a local filesystem (for example, on your machine where Python or R is running) to H2O-3. For big-data operations, you don't want the data stored on or flowing through the client.
 
-Refer to the `Supported File Formats `__ topic to ensure that you are using a supported file type.
+`See more on supported file formats `__ to ensure that you are using a supported file type.
 
-**Note**: When parsing a data file containing timestamps that do not include a timezone, the timestamps will be interpreted as UTC (GMT). You can override the parsing timezone using the following:
+.. note::
+
+   When parsing a data file containing timestamps that do not include a timezone, the timestamps will be interpreted as UTC (GMT). You can override the parsing timezone using the following:
+
+   - **Python**: ``h2o.cluster().timezone = "America/Los_Angeles"``
+   - **R**: ``h2o.setTimezone("America/Los_Angeles")``
 
-   - R: ``h2o.setTimezone("America/Los Angeles")``
-   - Python: ``h2o.cluster().timezone = "America/Los Angeles"``
 
-Run the following command to load data that resides on the same machine that is running H2O.
+Run the following command to load data that resides on the same machine that is running H2O-3.
 
 .. tabs::
+   .. code-tab:: python
+
+      import h2o
+      h2o.init()
+      iris_df = h2o.upload_file("../smalldata/iris/iris_wheader.csv")
+
    .. code-tab:: r R
 
      library(h2o)
      h2o.init()
      iris_path <- "../smalldata/iris/iris_wheader.csv"
      iris <- h2o.uploadFile(path = iris_path)
-
-   ..
code-tab:: python - - import h2o - h2o.init() - iris_df = h2o.upload_file("../smalldata/iris/iris_wheader.csv") + diff --git a/h2o-docs/src/product/data-science.rst b/h2o-docs/src/product/data-science.rst index 34c56a602f5b..bf6fef75163f 100644 --- a/h2o-docs/src/product/data-science.rst +++ b/h2o-docs/src/product/data-science.rst @@ -4,9 +4,9 @@ Algorithms ========== -This section provides an overview of each algorithm available in H2O. For detailed information about the parameters that can be used for building models, refer to `Appendix A - Parameters `__. +This section provides an overview of each algorithm available in H2O-3. For detailed information about the parameters that can be used for building models, refer to `Appendix A - Parameters `__. -Data Types +Data types ---------- .. toctree:: @@ -26,9 +26,9 @@ Common Supervised ---------- -In supervised learning, the dataset is labeled with the answer that algorithm should come up with. Supervised learning takes input variables (x) along with an output variable (y). The output variable represents the column that you want to predict on. The algorithm then uses these variables to learn and approximate the mapping function from the input to the output. Supervised learning algorithms support classification and regression problems. +In supervised learning, the dataset is labeled with the answer that the chosen algorithm should come up with. Supervised learning takes input variables (x) along with an output variable (y). The output variable represents the column that you want to predict on. The algorithm then uses these variables to learn and approximate the mapping function from the input to the output. Supervised learning algorithms support classification and regression problems. -H2O supports the following supervised algorithms: +H2O-3 supports the following supervised algorithms: .. toctree:: :maxdepth: 1 @@ -55,9 +55,9 @@ H2O supports the following supervised algorithms: Unsupervised ------------ -In unsupervised learning, the model is provided with a dataset that isn't labeled - i.e., without an explicit outcome that the algorithm should return. In this case, the algorithm attempts to find patterns and structure in the data by extracting useful features. The model organizes the data in different ways, depending on the algorithm (clustering, anomaly detection, autoencoders, etc). +In unsupervised learning, the model is provided with a dataset that isn't labeled (i.e. without an explicit outcome that the algorithm should return). In this case, the algorithm attempts to find patterns and structure in the data by extracting useful features. The model organizes the data in different ways, depending on the algorithm (clustering, anomaly detection, autoencoders, etc). -H2O supports the following unsupervised algorithms: +H2O-3 supports the following unsupervised algorithms: .. 
toctree:: :maxdepth: 1 diff --git a/h2o-docs/src/product/data-science/algo-params/auuc_nbins.rst b/h2o-docs/src/product/data-science/algo-params/auuc_nbins.rst index 26d10366fd6a..a5756b911e54 100644 --- a/h2o-docs/src/product/data-science/algo-params/auuc_nbins.rst +++ b/h2o-docs/src/product/data-science/algo-params/auuc_nbins.rst @@ -20,7 +20,7 @@ Related Parameters ~~~~~~~~~~~~~~~~~~ - `treatment_column `__ -- `response_column `__ +- `response_column `__ - `uplift_metric `__ - `auuc_type `__ diff --git a/h2o-docs/src/product/data-science/algo-params/auuc_type.rst b/h2o-docs/src/product/data-science/algo-params/auuc_type.rst index 0250ff90ee2c..27e5eea357e1 100644 --- a/h2o-docs/src/product/data-science/algo-params/auuc_type.rst +++ b/h2o-docs/src/product/data-science/algo-params/auuc_type.rst @@ -31,7 +31,7 @@ Related Parameters ~~~~~~~~~~~~~~~~~~ - `treatment_column `__ -- `response_column `__ +- `response_column `__ - `uplift_metric `__ - `auuc_nbins `__ diff --git a/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst b/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst index f141937e10fd..211cb9174563 100644 --- a/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst +++ b/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst @@ -34,7 +34,7 @@ Example h2o.init() # Import a sample binary outcome training set into H2O - train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") + train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") # Identify predictors and response x <- setdiff(names(train), y) @@ -77,7 +77,7 @@ Example h2o.init() # Import a sample binary outcome training set into H2O - train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") + train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") # Identify predictors and response x = train.columns diff --git a/h2o-docs/src/product/data-science/algo-params/include_algos.rst b/h2o-docs/src/product/data-science/algo-params/include_algos.rst index db85e19ec01e..7585addf6b5a 100644 --- a/h2o-docs/src/product/data-science/algo-params/include_algos.rst +++ b/h2o-docs/src/product/data-science/algo-params/include_algos.rst @@ -34,7 +34,7 @@ Example h2o.init() # Import a sample binary outcome training set into H2O - train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") + train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") # Identify predictors and response x <- setdiff(names(train), y) @@ -77,7 +77,7 @@ Example h2o.init() # Import a sample binary outcome training set into H2O - train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") + train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") # Identify predictors and response x = train.columns diff --git a/h2o-docs/src/product/data-science/algo-params/sort_metric.rst b/h2o-docs/src/product/data-science/algo-params/sort_metric.rst index 538e930c0871..92a6709751fa 100644 --- a/h2o-docs/src/product/data-science/algo-params/sort_metric.rst +++ b/h2o-docs/src/product/data-science/algo-params/sort_metric.rst @@ -38,7 +38,7 @@ Example h2o.init() # Import a sample binary outcome training set into H2O - train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") + train <- 
h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
     # Identify predictors and response
     x <- setdiff(names(train), y)
@@ -81,7 +81,7 @@ Example
     h2o.init()
 
     # Import a sample binary outcome training set into H2O
-    train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+    train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
     # Identify predictors and response
     x = train.columns
diff --git a/h2o-docs/src/product/data-science/early_stopping.rst b/h2o-docs/src/product/data-science/early_stopping.rst
index bd8c36d30f18..69eb7613c4c9 100644
--- a/h2o-docs/src/product/data-science/early_stopping.rst
+++ b/h2o-docs/src/product/data-science/early_stopping.rst
@@ -1,39 +1,59 @@
-Early Stopping
---------------
+Early stopping
+==============
 
-All of the H2O supervised learning algorithms allow for early stopping during model building and scoring.
+All of the H2O-3 supervised learning algorithms allow for early stopping during model building and scoring.
 
-Early Stopping in All Supervised Algorithms
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Early stopping in all supervised algorithms
+-------------------------------------------
 
-- :ref:`max_runtime_secs` (Defaults to 0/disabled.)
+The following early stopping parameter is available to all supervised algorithms:
+
+- :ref:`max_runtime_secs` (Defaults to ``0``/disabled.)
 
 The ``max_runtime_secs`` option specifies the maximum runtime in seconds that you want to allot in order to complete the model. If this maximum runtime is exceeded before the model build is completed, then the model will fail.
 
 When performing a grid search, this option specifies the maximum runtime in seconds for the entire grid. This option can also be combined with ``max_runtime_secs`` in the model parameters. If ``max_runtime_secs`` is not set in the model parameters, then each model build is launched with a limit equal to the remainder of the grid time. On the other hand, if ``max_runtime_secs`` is set in the model parameters, then each build is launched with a limit equal to the minimum of the model time limit and the remaining time for the grid.
 
-Early Stopping in AutoML, Grid Search, Deep Learning, DRF, GBM, and XGBoost
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Early stopping in AutoML, grid search, Deep Learning, DRF, GBM, and XGBoost
+---------------------------------------------------------------------------
+
+In AutoML, grid search, Deep Learning, DRF, GBM, and XGBoost, the following additional parameters are used for early stopping:
+
+- :ref:`stopping_rounds` - Defaults to:
+
+  - AutoML: ``3``
+  - Deep Learning: ``5``
+  - DRF, GBM, XGBoost: ``0``/disabled
 
-In AutoML, Grid Search, Deep Learning, DRF, GBM, and XGBoost, the following additional parameters are used for early stopping:
+- :ref:`stopping_tolerance` - Defaults to:
+
+  - ``0.001``
+  - In AutoML, for datasets with more than 1 million rows, this defaults to a larger value determined by the size of the dataset and the non-NA-rate.
 
-- :ref:`stopping_rounds` (Defaults to 3 in AutoML; defaults to 5 in Deep Learning; defaults to 0/disabled in DRF, GBM, XGBoost.)
-- :ref:`stopping_tolerance` (Defaults to 0.001. In AutoML for datasets with more than 1 million rows, this defaults to a larger valued determined by the size of the dataset and the non-NA-rate.)
-- :ref:`stopping_metric` (Defaults to "logloss" for classification and "deviance" for regression.)
+- :ref:`stopping_metric` - Defaults to:
+
+  - classification models: ``"logloss"``
+  - regression models: ``"deviance"``
 
 The simplest way to turn on early stopping in these algorithms is to use a number >=1 in ``stopping_rounds``. The default values for the other two parameters will work fairly well, but a ``stopping_tolerance`` of 0 is a common alternative to the default.
 
-Additionally, take :ref:`score_tree_interval` and/or :ref:`score_each_iteration` into account when using these early stopping methods. The stopping rounds applies to the number of scoring iterations H2O has performed, so regular scoring iterations of small size can help control early stopping the most (though there is a speed tradeoff to scoring more often). The default is to use H2O’s assessment of a reasonable ratio of training time to scoring time, which often results in inconsistent scoring gaps.
+Additionally, you can take :ref:`score_tree_interval` and/or :ref:`score_each_iteration` into account when using these early stopping methods. The ``stopping_rounds`` value applies to the number of scoring iterations H2O-3 has performed, so regular scoring iterations of a small size can help control early stopping the most (though there is a speed tradeoff to scoring more often). The default is to use H2O-3’s assessment of a reasonable ratio of training time to scoring time, which often results in inconsistent scoring gaps.
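+
+For example, the following minimal sketch (illustrative only; it reuses the Higgs sample dataset from the AutoML examples in this guide) turns on all three parameters for a GBM model:
+
+.. code-block:: python
+
+   import h2o
+   from h2o.estimators.gbm import H2OGradientBoostingEstimator
+   h2o.init()
+
+   train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+   train["response"] = train["response"].asfactor()
+
+   # Score every 10 trees so that early stopping is checked at regular intervals,
+   # and stop once the moving average of the last 3 logloss values stops
+   # improving by at least stopping_tolerance.
+   gbm = H2OGradientBoostingEstimator(ntrees=500,
+                                      stopping_rounds=3,
+                                      stopping_metric="logloss",
+                                      stopping_tolerance=0.001,
+                                      score_tree_interval=10)
+   predictors = [col for col in train.columns if col != "response"]
+   gbm.train(y="response", x=predictors, training_frame=train)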
 
-Early Stopping in GLM and GAM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Early stopping in GLM and GAM
+-----------------------------
 
 In GLM and GAM, the following additional parameters are used for early stopping:
 
-- :ref:`early_stopping` (Default is enabled.)
-- :ref:`max_active_predictors` (Default can vary based on the solver.)
-- :ref:`stopping_rounds` (Defaults to 0 in GLM and GAM.)
-- :ref:`stopping_tolerance` (Defaults to 0.001 in GLM and GAM.)
-- :ref:`stopping_metric` (Defaults to "logloss" for classification and "deviance" for regression.)
+- :ref:`early_stopping` (Defaults to ``enabled``.)
+- :ref:`max_active_predictors` (Default varies based on the ``solver``.)
+- :ref:`stopping_rounds` (Defaults to ``0`` in GLM and GAM.)
+- :ref:`stopping_tolerance` (Defaults to ``0.001`` in GLM and GAM.)
+- :ref:`stopping_metric` (Defaults to ``"logloss"`` for classification and ``"deviance"`` for regression.)
 
 When ``early_stopping`` is enabled, GLM and GAM will automatically stop building a model when there is no more relative improvement on the training or validation (if provided) set. This option prevents expensive model building with many predictors when no more improvements are occurring.
 
-The ``max_active_predictors`` option limits the number of active predictors. (Note that the actual number of non-zero predictors in the model is going to be slightly lower). This is useful when obtaining a sparse solution to avoid costly computation of models with too many predictors. When using the :math:`\lambda_1` penalty with lambda search, this option will stop the search before it completes. Models built at the beginning of the lambda search have higher lambda values, consider fewer predictors, and take less time to calculate the model. Models built at the end of the lambda search have lower lambda values, incorporate more predictors, and take a longer time to calculate the model. Set the ``nlambdas`` parameter for a lambda search to specify the number of models attempted across the search.
\ No newline at end of file
+The ``max_active_predictors`` option limits the number of active predictors.
+
+.. note::
+
+   The actual number of non-zero predictors in the model is going to be slightly lower.
+
+This is useful when obtaining a sparse solution to avoid costly computation of models with too many predictors. When using the :math:`\lambda_1` penalty with lambda search, this option will stop the search before it completes. Models built at the beginning of the lambda search have higher lambda values, consider fewer predictors, and take less time to calculate the model. Models built at the end of the lambda search have lower lambda values, incorporate more predictors, and take a longer time to calculate the model. Set the ``nlambdas`` parameter for a lambda search to specify the number of models attempted across the search.
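+
+For example, the following minimal sketch (illustrative only; it reuses the prostate dataset from other examples in this guide) runs a lambda search that is allowed to stop early:
+
+.. code-block:: python
+
+   import h2o
+   from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+   h2o.init()
+
+   prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+   prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
+
+   # Attempt up to 30 lambdas, but let early_stopping end the search as soon as
+   # smaller lambda values no longer improve the model.
+   glm = H2OGeneralizedLinearEstimator(family="binomial",
+                                       lambda_search=True,
+                                       nlambdas=30,
+                                       early_stopping=True)
+   glm.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "DCAPS"], training_frame=prostate)
\ No newline at end of file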
diff --git a/h2o-docs/src/product/data-science/gbm.rst b/h2o-docs/src/product/data-science/gbm.rst
index 99e65aad103b..468c9303fb87 100644
--- a/h2o-docs/src/product/data-science/gbm.rst
+++ b/h2o-docs/src/product/data-science/gbm.rst
@@ -442,7 +442,7 @@ Below is a simple example showing how to build a Gradient Boosting Machine model
       feature_interactions = pros_gbm.feature_interaction()
 
       # Get Friedman and Popescu's H statistics
-      h = pros_gbm.h(prostate_train, ['DPROS','DCAPS'])
+      h = pros_gbm.h(prostate, ['DPROS','DCAPS'])
       print(h)
 
diff --git a/h2o-docs/src/product/data-science/glm.rst b/h2o-docs/src/product/data-science/glm.rst
index d78ac2fb2da0..161d57698fdc 100644
--- a/h2o-docs/src/product/data-science/glm.rst
+++ b/h2o-docs/src/product/data-science/glm.rst
@@ -63,6 +63,8 @@ Algorithm-specific parameters
 
 - `interaction_pairs `__: When defining interactions, use this option to specify a list of pairwise column interactions (interactions between two variables). Note that this is different than ``interactions``, which will compute all pairwise combinations of specified columns.
 
+- **max_iterations**: For GLM, must be :math:`\geq` 1 to obtain a proper model (or ``-1`` for unlimited, which is the default setting). Setting it to 0 will only return the correct coefficient names and an empty model.
+
 - **max_iterations_dispersion**: Control the maximum number of iterations in the dispersion parameter estimation loop using maximum likelihood. This option defaults to ``1000000``.
 
 - `rand_family `__: The Random Component Family specified as an array. You must include one family for each random component. Currently only ``rand_family=["gaussisan"]`` is supported.
@@ -239,7 +241,7 @@ Common parameters
 
 - `max_iterations `__: Specify the number of training iterations. This option defaults to ``-1``.
 
-- `max_runtime_secs `__: Maximum allowed runtime in seconds for model training. Use ``0`` (default) to disable.
+- `max_runtime_secs `__: Maximum allowed runtime in seconds for model training. Use ``0`` (default) to disable.
 
 - `missing_values_handling `__: Specify how to handle missing values. One of: ``Skip``, ``MeanImputation`` (default), or ``PlugValues``.
 
@@ -1623,6 +1625,219 @@ Variable Inflation Factor Example
 
     vif_glm.get_variable_inflation_factors()
     {'Intercept': nan, 'abs.C1.': 1.0003341467438167, 'abs.C2.': 1.0001734204183244, 'abs.C3.': 1.0007846189027745, 'abs.C4.': 1.0005388379729434, 'abs.C5.': 1.0005349427184604}
+Constrained GLM
+~~~~~~~~~~~~~~~
+
+We've implemented the algorithm from Bierlaire's *Optimization: Principles and Algorithms, Chapter 19* [:ref:`8 <ref8>`] where we're basically trying to solve the following optimization problem:
+
+.. math::
+
+   \min_{X\in R^n} f(x), \text{subject to } h(x) = 0, g(x) \leq 0 \quad \text{ equation 1}
+
+where:
+
+ - :math:`f: R^n \to R,h: R^n \to R^m,g: R^n \to R^p`
+ - the constraints :math:`h,g` are linear.
+
+However, the actual problem we are solving is:
+
+.. math::
+
+   \min_{X\in R^n} f(x) \text{ subject to } h(x)=0 \quad \text{ equation 2}
+
+The inequality constraints can be easily converted to equality constraints through simple reasoning and using active constraints. We solve the constrained optimization problem by solving the augmented Lagrangian function using the quadratic penalty:
+
+.. math::
+
+   L_c(x,\lambda) = f(x) + \lambda^T h(x) + \frac{c}{2} \| h(x) \|^2 \quad \text{ equation 3}
+
+The basic ideas used to solve the constrained GLM consist of:
+
+a. transforming a constrained problem into a sequence of unconstrained problems;
+b. penalizing more and more the possible violation of the constraints during the sequence by continuously increasing the value of :math:`c` at each iteration.
+
+Converting to standard form
+'''''''''''''''''''''''''''
+
+A standard form of :math:`g(x) \leq 0` is the only acceptable form of inequality constraints. For example, if you have a constraint of :math:`2x_1 - 4x_2 \geq 10` where :math:`x_1 \text{ and } x_2` are coefficient names, then you must convert it to :math:`10-2x_1 + 4x_2 \leq 0`.
+
+Example of constrained GLM
+''''''''''''''''''''''''''
+
+.. tabs::
+   .. code-tab:: r R
+
+      library(h2o)
+      h2o.init()
+
+      # Import the Gaussian 10,000 rows dataset:
+      h2o_data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/gaussian_20cols_10000Rows.csv")
+
+      # Set the predictors, response, and enum columns:
+      enum_columns = c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10")
+      for (cname in enum_columns) {
+        h2o_data[cname] <- h2o.asfactor(h2o_data[cname])
+      }
+      myY = "C21"
+      col_names <- names(h2o_data)
+      myX <- col_names[1:20]
+
+      # Set the constraints:
+      constraints <- data.frame(names = c("C1.2", "C11", "constant", "C5.2", "C12", "C15", "constant"),
+                                values = c(1, 1, 13.56, 1, 1, 1, -5),
+                                types = c("Equal", "Equal", "Equal", "LessThanEqual", "LessThanEqual", "LessThanEqual", "LessThanEqual"),
+                                constraint_numbers = c(0, 0, 0, 1, 1, 1, 1))
+      constraints_h2o <- as.h2o(constraints)
+
+      # Set the beta constraints:
+      bc <- data.frame(names = c("C1.1", "C5.2", "C11", "C15"),
+                       lower_bounds = c(-36, -14, 25, 14),
+                       upper_bounds = c(-35, -13, 26, 15))
+      bc_h2o <- as.h2o(bc)
+
+      # Build and train your model:
+      m_sep <- h2o.glm(x=myX,
+                       y=myY,
+                       training_frame=h2o_data,
+                       family='gaussian',
+                       linear_constraints=constraints_h2o,
+                       solver="irlsm",
+                       lambda=0.0,
+                       beta_constraints=bc_h2o,
+                       constraint_eta0=0.1,
+                       constraint_tau=10,
+                       constraint_alpha=0.01,
+                       constraint_beta=0.9,
+                       constraint_c0=100)
+
+      # Find your coefficients:
+      h2o.coef(m_sep)
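+
+      # Optional sanity check (illustrative): the first equality constraint
+      # above requires C1.2 + C11 + 13.56 = 0, so the fitted coefficients
+      # should satisfy it approximately:
+      coefs <- h2o.coef(m_sep)
+      print(coefs["C1.2"] + coefs["C11"] + 13.56)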
+
+   .. code-tab:: python
+
+      import h2o
+      from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+      h2o.init()
+
+      # Import the Gaussian 10,000 rows dataset:
+      h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/gaussian_20cols_10000Rows.csv")
+
+      # Set the predictors, response, and enum columns:
+      enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
+      for cname in enum_columns:
+          h2o_data[cname] = h2o_data[cname].asfactor()
+      myY = "C21"
+      myX = [name for name in h2o_data.names if name != myY]
+
+      # Set the linear constraints:
+      linear_constraints = [] # this constraint is satisfied by default coefficient initialization
+      name = "C1.2"
+      values = 1
+      types = "Equal"
+      constraint_numbers = 0
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "C11"
+      values = 1
+      types = "Equal"
+      constraint_numbers = 0
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "constant"
+      values = 13.56
+      types = "Equal"
+      constraint_numbers = 0
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "C5.2"
+      values = 1
+      types = "LessThanEqual"
+      constraint_numbers = 1
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "C12"
+      values = 1
+      types = "LessThanEqual"
+      constraint_numbers = 1
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "C15"
+      values = 1
+      types = "LessThanEqual"
+      constraint_numbers = 1
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      name = "constant"
+      values = -5
+      types = "LessThanEqual"
+      constraint_numbers = 1
+      linear_constraints.append([name, values, types, constraint_numbers])
+
+      linear_constraints2 = h2o.H2OFrame(linear_constraints)
+      linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
+
+      # Set the beta constraints:
+      bc = []
+      name = "C1.1"
+      c1p1LowerBound = -36
+      c1p1UpperBound = -35
+      bc.append([name, c1p1LowerBound, c1p1UpperBound])
+
+      name = "C5.2"
+      c5p2LowerBound = -14
+      c5p2UpperBound = -13
+      bc.append([name, c5p2LowerBound, c5p2UpperBound])
+
+      name = "C11"
+      c11LowerBound = 25
+      c11UpperBound = 26
+      bc.append([name, c11LowerBound, c11UpperBound])
+
+      name = "C15"
+      c15LowerBound = 14
+      c15UpperBound = 15
+      bc.append([name, c15LowerBound, c15UpperBound])
+
+      beta_constraints = h2o.H2OFrame(bc)
+      beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"])
+
+      # Build and train your model:
+      m_sep = H2OGeneralizedLinearEstimator(family='gaussian',
+                                            linear_constraints=linear_constraints2,
+                                            solver="irlsm",
+                                            lambda_=0.0,
+                                            beta_constraints=beta_constraints,
+                                            constraint_eta0=0.1,
+                                            constraint_tau=10,
+                                            constraint_alpha=0.01,
+                                            constraint_beta=0.9,
+                                            constraint_c0=100)
+      m_sep.train(training_frame=h2o_data, x=myX, y=myY)
+
+      # Find your coefficients:
+      coef_sep = m_sep.coef()
+
+
+Treatment of strict inequalities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To convert a strict inequality, just add a small number to it. For example, :math:`2x_1 - 4x_2 < 0` can be converted to :math:`2x_1 - 4x_2 + 10^{-12} \leq 0`.
+
+Transforming inequality constraints to equality constraints
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+This transformation uses slack variables, which are introduced to replace an inequality constraint with an equality constraint. The slack variable should be non-negative. To transform inequality constraints to equality constraints, we proceed as follows:
+
+a. For each inequality constraint of :math:`g(x)`, a slack variable is added to it such that you will have: :math:`g_i(x) - s_i^2 = 0`;
+b. Let :math:`s = \begin{bmatrix} s_1^2 \\ \vdots \\ s_p^2 \\\end{bmatrix}` and :math:`g_{aug}(x) = g(x) - s`;
+c. When :math:`g_i(x) \leq 0`, the constraint is satisfied and can therefore be ignored and declared inactive;
+d. The inequality constraints are violated only when :math:`g_i(x) - s_i^2 \geq 0`. This is because it implies that :math:`g_i(x) \geq s_i^2 \geq 0`, which isn't allowed. Therefore, :math:`geq(x)` only includes the :math:`g_i(x)` when you have :math:`g_i(x) \geq 0`;
+e. Therefore, you have :math:`h_a(x) = \begin{bmatrix} h(x) \\ geq(x) \\\end{bmatrix}`, where :math:`h(x)` is the original equality constraint and :math:`geq(x)` contains the inequality constraints that satisfied the condition :math:`g_i(x) \geq 0`;
+f. The optimization problem in *equation 1* can now be rewritten as:
+
+.. math::
+
+   \min_{X\in R^n} f(x), \text{ subject to } h_a(x) = 0 \quad \text{ equation 4}
+
+g. The augmented Lagrangian function you will solve from *equation 4* becomes:
+
+.. math::
+
+   L_c(x, \lambda) = f(x) + \lambda^T h_a(x) + \frac{c}{2} \|h_a(x)\|^2 \quad \text{ equation 5}
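+
+To make the sequence-of-unconstrained-problems idea concrete, the following toy sketch (illustrative only; it is not H2O-3's internal solver, and it assumes NumPy and SciPy are available) applies the augmented Lagrangian iteration from *equation 5* to a small problem:
+
+.. code-block:: python
+
+   import numpy as np
+   from scipy.optimize import minimize
+
+   # Toy problem: minimize f(x) = x1^2 + x2^2 subject to h(x) = x1 + x2 - 1 = 0.
+   f = lambda x: x[0]**2 + x[1]**2
+   h = lambda x: x[0] + x[1] - 1.0
+
+   x, lam, c = np.zeros(2), 0.0, 10.0
+   for _ in range(10):
+       # Solve the unconstrained subproblem L_c(x, lambda) for the current lambda and c.
+       L = lambda x: f(x) + lam * h(x) + 0.5 * c * h(x)**2
+       x = minimize(L, x).x
+       lam += c * h(x)  # first-order multiplier update
+       c *= 2.0         # penalize constraint violations more each iteration
+
+   print(x)  # approaches the constrained optimum [0.5, 0.5]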
+
 Modifying or Creating a Custom GLM Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2006,3 +2221,7 @@ Technometrics 19.4 (1977): 415-428.
 `Ronnegard, Lars. HGLM course at the Roslin Institute, http://users.du.se/~lrn/DUweb/Roslin/RoslinCourse_hglmAlgorithm_Nov13.pdf. `__
 
 `Balzer, Laura B, and van der Laan, Mark J. "Estimating Effects on Rare Outcomes: Knowledge is Power." U.C. Berkeley Division of Biostatistics Working Paper Series (2013) `__.
+
+.. _ref8:
+
+Michel Bierlaire, Optimization: Principles and Algorithms, Chapter 19, EPFL Press, second edition, 2018
diff --git a/h2o-docs/src/product/data-science/quantiles.rst b/h2o-docs/src/product/data-science/quantiles.rst
index 98f0edd61ffe..a24f279830d0 100644
--- a/h2o-docs/src/product/data-science/quantiles.rst
+++ b/h2o-docs/src/product/data-science/quantiles.rst
@@ -1,50 +1,59 @@
 Quantiles
----------
+=========
 
-This function retrieves and displays quantiles for H2O parsed data.
+This function retrieves and displays quantiles for H2O-3 parsed data.
 
-**Note**: The quantile results in Flow are computed lazily on-demand and cached. It is a fast approximation (max - min / 1024) that is very accurate for most use cases. If the distribution is skewed, the quantile results may not be as accurate as the results obtained using ``h2o.quantile`` in R or ``H2OFrame.quantile`` in Python.
+.. note::
+
+   The quantile results in Flow are computed lazily on-demand and cached. It's a fast approximation ((max - min) / 1024) that's very accurate for most use cases. If the distribution is skewed, the quantile results may not be as accurate as the results obtained using ``h2o.quantile`` in R or ``H2OFrame.quantile`` in Python.
 
-Quantile Parameters
-~~~~~~~~~~~~~~~~~~~
+Quantile parameters
+-------------------
 
-- h2oFrame: Specify the an H2OFrame
-- `weights_column `__: (Optional) A string name identifying the obsevation weights column in this frame or a single-column, separate H2OFrame of observation weights. If this option isn't specified, then all rows are assumed to have equal importance.
-- **prob**: Specify a list of probabilities with values in the range [0,1]. By default, the following probabilities are returned:
+- **combine_method**: Specify the method for combining quantiles for even sample sizes. Abbreviations for average, low, and high are acceptable (avg, lo, hi). The default is to do linear interpolation (e.g. if the method is ``low``, then it will take the low value of the quantile). Available methods include:
 
-  - R: 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.667, 0.75, 0.9, 0.99, 0.999
-  - Python: 0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99
+  - ``average``
+  - ``high``
+  - ``interpolate``
+  - ``low``
+
+- **h2oFrame**: Specify the H2OFrame.
+- **prob**: Specify a list of probabilities with values in the range [0,1]. By default, the following probabilities are returned:
+
+  - Python: 0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99
+  - R: 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.667, 0.75, 0.9, 0.99, 0.999
 
-- **combine_method**: Specify the method for combining quantiles for even sample sizes. Available methods include ``interpolate``, ``average``, ``low``, and ``high``. Abbreviations for average, low, and high are acceptable (avg, lo, hi). The default is to do linear interpolation (e.g. if method is "lo", then it will take the lo value of the quantile).
+- `weights_column `__: A string name identifying the observation weights column in this frame or a single-column, separate H2OFrame of observation weights. If this option isn't specified, then all rows are assumed to have equal importance.
 
 Examples
-~~~~~~~~
+--------
 
 .. tabs::
-   .. code-tab:: r R
+   .. code-tab:: python
 
      # Import the prostate dataset:
-     prostate <- h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+     prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
 
     # Request quantiles for the parsed dataset
-    quantile(prostate)
+    prostate.quantile()
 
     # Request quantiles for the AGE column:
-    quantile(prostate[, 3])
+    prostate["AGE"].quantile()
 
    # Request quantiles for probabilities 0.001 and 0.01 for the AGE column
-   quantile(prostate[, 3], prob=c(0.001, 0.01))
+   prostate["AGE"].quantile(prob=[0.001, 0.01])
 
-   .. code-tab:: python
+   .. code-tab:: r R
 
     # Import the prostate dataset:
-    prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+    prostate <- h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
 
    # Request quantiles for the parsed dataset
-   prostate.quantile()
+   quantile(prostate)
 
   # Request quantiles for the AGE column:
-  prostate["AGE"].quantile()
+  quantile(prostate[, 3])
 
  # Request quantiles for probabilities 0.001 and 0.01 for the AGE column
- prostate["AGE"].quantile(prob=[0.001, 0.01])
+ quantile(prostate[, 3], prob=c(0.001, 0.01))
+
diff --git a/h2o-docs/src/product/data-science/stacked-ensembles.rst b/h2o-docs/src/product/data-science/stacked-ensembles.rst
index 4caa64f74061..25a42c2e9b2d 100644
--- a/h2o-docs/src/product/data-science/stacked-ensembles.rst
+++ b/h2o-docs/src/product/data-science/stacked-ensembles.rst
@@ -157,8 +157,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
       h2o.init()
 
       # Import a sample binary outcome train/test set into H2O
-      train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-      test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+      train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+      test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
       # Identify predictors and response
       y <- "response"
@@ -287,8 +287,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
h2o.init() # Import a sample binary outcome train/test set into H2O - train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") + test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv") # Identify predictors and response x = train.columns diff --git a/h2o-docs/src/product/data-science/supported-data-types.rst b/h2o-docs/src/product/data-science/supported-data-types.rst index b2748b898e34..1eb28403d504 100644 --- a/h2o-docs/src/product/data-science/supported-data-types.rst +++ b/h2o-docs/src/product/data-science/supported-data-types.rst @@ -1,9 +1,11 @@ -Supported Data Types +Supported data types ==================== -When building models, the supported data types varies per algorithm. +When building models, the supported data types vary per algorithm. -- All H2O-3 algos accept data as numerical and categorical. -- Word2Vec accepts data as text. +.. note:: + + - All H2O-3 algorithms accept data as numerical and categorical. + - Word2Vec accepts data as text. If your data includes timestamps, we recommend that you either convert the data to numeric (if you plan to use the data) or ignore timestamp columns. \ No newline at end of file diff --git a/h2o-docs/src/product/downloading.rst b/h2o-docs/src/product/downloading.rst index 6b6c0ef086be..5f77a1d75607 100644 --- a/h2o-docs/src/product/downloading.rst +++ b/h2o-docs/src/product/downloading.rst @@ -1,25 +1,30 @@ -Downloading & Installing H2O -============================ +Downloading and installing H2O-3 +================================ -This section describes how to download and install the latest stable version of H2O. These instructions are also available on the `H2O Download page `__. Please first make sure you meet the requirements listed `here `__. Java is a prerequisite for H2O, even if using it from the R or Python packages. +This section describes how to download and install the latest stable version of H2O-3. These instructions are also available on the `H2O-3 Download page `__. Please first make sure you meet `the requirements to download and use H2O-3 `__. Java is a prerequisite for H2O-3, even if using it from the R or Python packages. -**Note**: To download the nightly bleeding edge release, go to `h2o-release.s3.amazonaws.com/h2o/master/latest.html `__. Choose the type of installation you want to perform (for example, "Install in Python") by clicking on the tab. +.. note:: + + Check our `Downloads page to download the nightly bleeding edge release `__. -Choose your desired method of use below. Most users will want to use H2O from either `R `__ or `Python `__; however we also include instructions for using H2O's web GUI Flow and Hadoop below. + Scroll down to the H2O-3 section and select Nightly Bleeding Edge. Then, choose the type of installation you want to perform (for example, "Install in Python") by clicking on the tab. +Choose your desired method of use below. Most users will want to use H2O-3 from either `R `__ or `Python `__. However, we also include instructions for using H2O-3's web GUI Flow and Hadoop below. -Download and Run from the Command Line --------------------------------------- -If you plan to exclusively use H2O's web GUI, `Flow `__, this is the method you should use. 
If you plan to use H2O from R or Python, skip to the appropriate sections below. +Download and run Flow from the command line +------------------------------------------- -1. Click the ``Download H2O`` button on the `http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html `__ page. This downloads a zip file that contains everything you need to get started. +If you plan to exclusively use H2O-3's web GUI, `Flow `__, this is the method you should use. + +1. Go to our `latest stable release page `__. +2. Click the ``Download H2O`` button from the Download and Run tab. This downloads a ZIP file that contains everything you need to get started. .. note:: By default, this setup is open. Follow `security guidelines `__ if you want to secure your installation. -2. From your terminal, unzip and start H2O as in the example below. +3. From your terminal, unzip and start H2O-3 as in the example below. .. substitution-code-block:: bash @@ -28,25 +33,25 @@ If you plan to exclusively use H2O's web GUI, `Flow `__ if you want to secure your installation. -1. The following two commands remove any previously installed H2O packages for R. +1. The following two commands remove any previously installed H2O-3 packages for R. .. code-block:: r if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) } if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") } -2. Next, download packages that H2O depends on. +2. Next, `download packages that H2O-3 depends on `__. .. code-block:: r @@ -55,13 +60,13 @@ Perform the following steps in R to install H2O. Copy and paste these commands o if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) } } -3. Download and install the H2O package for R. +3. Download and install the H2O-3 package for R. .. code-block:: r install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R"))) -4. Optionally initialize H2O and run a demo to see H2O at work. +4. Optionally initialize H2O-3 and run a demo to see H2O-3 at work. .. code-block:: r @@ -69,10 +74,14 @@ Perform the following steps in R to install H2O. Copy and paste these commands o localH2O = h2o.init() demo(h2o.kmeans) -Installing H2O's R Package from CRAN -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Install H2O-3's R package from CRAN +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Alternatively you can install H2O-3’s R package from `CRAN `__ or by typing ``install.packages("h2o")`` in R. -Alternatively you can install H2O’s R package from `CRAN `__ or by typing ``install.packages("h2o")`` in R. Sometimes there can be a delay in publishing the latest stable release to CRAN, so to guarantee you have the latest stable version, use the instructions above to install directly from the H2O website. +.. note:: + + Sometimes there can be a delay in publishing the latest stable release to CRAN. To guarantee you have the latest stable version, use the instructions above to install directly from the H2O.ai website. Install in Python ----------------- @@ -81,7 +90,7 @@ Install in Python By default, this setup is open. Follow `security guidelines `__ if you want to secure your installation. -Run the following commands in a Terminal window to install H2O for Python. +Run the following commands in a Terminal window to install H2O-3 for Python. 1. Install dependencies (prepending with ``sudo`` if needed): @@ -94,27 +103,23 @@ Run the following commands in a Terminal window to install H2O for Python. 
# Required for plotting: pip install matplotlib - **Note**: These are the dependencies required to run H2O. ``matplotlib`` is optional and only required to plot in H2O. A complete list of dependencies is maintained in the following file: `https://github.com/h2oai/h2o-3/blob/master/h2o-py/conda/h2o-main/meta.yaml `__. +.. note:: + + These are the dependencies required to run H2O-3. ``matplotlib`` is optional and only required to plot in H2O-3. See our `complete list of dependencies `__. -2. Run the following command to remove any existing H2O module for Python. +2. Run the following command to remove any existing H2O-3 module for Python. .. code-block:: bash pip uninstall h2o -3. Use ``pip`` to install this version of the H2O Python module. +3. Use ``pip`` to install `the H2O-3 Python module `__. .. code-block:: bash pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o - **Note**: When installing H2O from ``pip`` in OS X El Capitan, users must include the ``--user`` flag. For example: - - .. code-block:: bash - - pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o --user - -4. Optionally initialize H2O in Python and run a demo to see H2O at work. +4. Optionally initialize H2O-3 in Python and run a demo to see H2O-3 at work. .. code-block:: python @@ -125,9 +130,9 @@ Run the following commands in a Terminal window to install H2O for Python. Install on Anaconda Cloud ~~~~~~~~~~~~~~~~~~~~~~~~~ -This section describes how to set up and run H2O in an Anaconda Cloud environment. Conda 2.7, 3.5, and 3.6 repos are supported as are a number of H2O versions. Refer to `https://anaconda.org/h2oai/h2o/files `__ to view a list of available H2O versions. +This section describes how to set up and run H2O-3 in an Anaconda Cloud environment. Conda 2.7, 3.5, and 3.6 repos are supported as are a number of H2O-3 versions. See `which H2O-3 versions are available on Anaconda `__. -Open a terminal window and run the following command to install H2O on the Anaconda Cloud. The H2O version in this command should match the version that you want to download. If you leave the h2o version blank and specify just ``h2o``, then the latest version will be installed. For example: +Open a terminal window and run the following command to install H2O-3 on the Anaconda Cloud. The H2O-3 version in this command should match the version that you want to download. If you leave the H2O-3 version blank and specify just ``h2o``, then the latest version will be installed. For example: .. substitution-code-block:: bash @@ -139,23 +144,25 @@ or: user$ conda install -c h2oai h2o -**Note**: For Python 3.6 users, H2O has ``tabulate>=0.75`` as a dependency; however, there is no ``tabulate`` available in the default channels for Python 3.6. This is available in the conda-forge channel. As a result, Python 3.6 users must add the ``conda-forge`` channel in order to load the latest version of H2O. This can be done by performing the following steps: +.. note:: + + For Python 3.6 users, H2O-3 has ``tabulate>=0.75`` as a dependency; however, there is no ``tabulate`` available in the default channels for Python 3.6. This is available in the conda-forge channel. As a result, Python 3.6 users must add the ``conda-forge`` channel in order to load the latest version of H2O-3. This can be done by performing the following steps: - .. code-block:: bash + .. 
code-block:: bash - conda create -n py36 python=3.6 anaconda - source activate py36 - conda config --append channels conda-forge - conda install -c h2oai h2o + conda create -n py36 python=3.6 anaconda + source activate py36 + conda config --append channels conda-forge + conda install -c h2oai h2o -After H2O is installed, refer to the `Starting H2O from Anaconda `__ section for information on how to start H2O and to view a GBM example run in Jupyter Notebook. +After H2O-3 is installed, see the `Starting H2O-3 from Anaconda `__ section for information on how to start H2O-3 and to view a GBM example run in Jupyter Notebook. Install on Hadoop ----------------- -1. Go to `http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html `__. Click on the **Install on Hadoop** tab, and download H2O for your version of Hadoop. This is a zip file that contains everything you need to get started. +1. Go to the `Downloads page `__. Click on the **Install on Hadoop** tab, and download H2O-3 for your version of Hadoop. This is a ZIP file that contains everything you need to get started. -2. Unpack the zip file and launch a 6g instance of H2O. For example: +2. Unpack the ZIP file and launch a 6g instance of H2O-3. For example: .. substitution-code-block:: bash @@ -163,7 +170,7 @@ Install on Hadoop cd h2o-|version|-* hadoop jar h2odriver.jar -nodes 1 -mapperXmx 6g -3. Point your browser to H2O. (See "Open H2O Flow in your web browser" in the output below.) +3. Point your browser to H2O-3. (See "Open H2O Flow in your web browser" in the output below.) .. code-block:: bash diff --git a/h2o-docs/src/product/getting-data-into-h2o.rst b/h2o-docs/src/product/getting-data-into-h2o.rst index 5b5f0cb6e630..703cb5b3a1f1 100644 --- a/h2o-docs/src/product/getting-data-into-h2o.rst +++ b/h2o-docs/src/product/getting-data-into-h2o.rst @@ -1,14 +1,14 @@ -Getting Data into Your H2O Cluster -================================== +Getting data into your H2O-3 cluster +==================================== -The first step toward building and scoring your models is getting your data into the H2O cluster/Java process that’s running on your local or remote machine. Whether you're importing data, uploading data, or retrieving data from HDFS or S3, be sure that your data is compatible with H2O. +The first step toward building and scoring your models is getting your data into the H2O-3 cluster/Java process that’s running on your local or remote machine. Whether you're importing data, uploading data, or retrieving data from HDFS or S3, be sure that your data is compatible with H2O-3. .. _supported_file_formats: -Supported File Formats +Supported file formats ---------------------- -H2O supports the following file types: +H2O-3 supports the following file types: - CSV (delimited, UTF-8 only) files (including GZipped CSV) - ORC @@ -20,27 +20,30 @@ H2O supports the following file types: - Parquet - Google Storage (gs://) -**Notes**: +.. note:: - - H2O supports UTF-8 encodings for CSV files. Please convert UTF-16 encodings to UTF-8 encoding before parsing CSV files into H2O. - - ORC is available only if H2O is running as a Hadoop job. - - Users can also import Hive files that are saved in ORC format (experimental). - - If you encounter issues importing XLS or XLSX files, you may be using an unsupported version. In this case, re-save the file in BIFF 8 format. Also note that XLS and XLSX support will eventually be deprecated. - - When doing a parallel data import into a cluster: + - H2O supports UTF-8 encodings for CSV files. 
Please convert UTF-16 encodings to UTF-8 encoding before parsing CSV files into H2O-3. + - ORC is available only if H2O-3 is running as a Hadoop job. + - Users can also import Hive files that are saved in ORC format (experimental). + - When doing a parallel data import into a cluster: - - If the data is an unzipped csv file, H2O can do offset reads, so each node in your cluster can be directly reading its part of the csv file in parallel. - - If the data is zipped, H2O will have to read the whole file and unzip it before doing the parallel read. + - If the data is an unzipped CSV file, H2O-3 can do offset reads, so each node in your cluster can be directly reading its part of the CSV file in parallel. + - If the data is zipped, H2O-3 will have to read the whole file and unzip it before doing the parallel read. So, if you have very large data files reading from HDFS, it is best to use unzipped csv. But if the data is further away than the LAN, then it is best to use zipped csv. + So, if you are reading very large data files from HDFS, it's best to use unzipped CSV. But, if the data has to travel beyond the local network, it's best to use zipped CSV. + +.. caution:: + + - If you encounter issues importing XLS or XLSX files, you may be using an unsupported version. In this case, re-save the file in BIFF 8 format. Also note that XLS and XLSX support will eventually be deprecated. .. _data_sources: -Data Sources +Data sources ------------ -H2O supports data ingest from various data sources. Natively, a local file system, remote file systems, HDFS, S3, and some relational databases are supported. Additional data sources can be accessed through a generic HDFS API, such as Alluxio or OpenStack Swift. +H2O-3 supports data ingest from various data sources. Natively, a local file system, remote file systems, HDFS, S3, and some relational databases are supported. Additional data sources can be accessed through a generic HDFS API, such as Alluxio or OpenStack Swift. -Default Data Sources +Default data sources ~~~~~~~~~~~~~~~~~~~~ - Local File System @@ -50,194 +53,226 @@ Default Data Sources - JDBC - Hive -Local File System -~~~~~~~~~~~~~~~~~ +Local file system +''''''''''''''''' -Data from a local machine can be uploaded to H2O via a push from the client. For more information, refer to `Uploading a File `__. +Data from a local machine can be uploaded to H2O-3 through a push from the client. See more information on `uploading from your local file system `__. -Remote File -~~~~~~~~~~~ +Remote file +''''''''''' -Data that is hosted on the Internet can be imported into H2O by specifying the URL. For more information, refer to `Importing a File `__. +Data that is hosted on the Internet can be imported into H2O-3 by specifying the URL. See more information on `importing data from the internet `__. -HDFS-like Data Sources -~~~~~~~~~~~~~~~~~~~~~~ +HDFS-like data sources +'''''''''''''''''''''' -Various data sources can be accessed through an HDFS API. In this case, a library providing access to a data source needs to be passed on a command line when H2O is launched. (Reminder: Each node in the cluster must be launched in the same way.) The library must be compatible with the HDFS API in order to be registered as a correct HDFS ``FileSystem``. +Various data sources can be accessed through an HDFS API. In this case, a library providing access to a data source needs to be passed on a command line when H2O-3 is launched. The library must be compatible with the HDFS API in order to be registered as a correct HDFS ``FileSystem``.
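Once the cluster is launched with the appropriate client library on its classpath, files on these filesystems can be imported by URI just like local or HDFS paths. The following is a minimal, illustrative Python sketch (the Alluxio host, port, and file path are placeholders rather than part of the official examples below):

.. code-block:: python

    import h2o

    # Assumes the cluster was started with the data source's client
    # library on the classpath (see the launch commands below).
    h2o.init()

    # Import by URI, the same way as any other supported path.
    iris = h2o.import_file("alluxio://localhost:19998/iris.csv")
    iris.describe()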
-Alluxio FS -'''''''''' +.. tip:: + + Each node in the cluster must be launched in the same way. -**Required Library** +Example HDFS-like data sources +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To access Alluxio data source, an Alluxio client library that is part of Alluxio distribution is required. For example, ``alluxio-1.3.0/core/client/target/alluxio-core-client-1.3.0-jar-with-dependencies.jar``. +.. tabs:: + .. tab:: Alluxio -**H2O Command Line** + **Required library** + + To access the Alluxio data source, an Alluxio client library that is part of the Alluxio distribution is required. For example, ``alluxio-1.3.0/core/client/target/alluxio-core-client-1.3.0-jar-with-dependencies.jar``. -.. code-block:: bash + **H2O-3 command line** - java -cp alluxio-core-client-1.3.0-jar-with-dependencies.jar:build/h2o.jar water.H2OApp + .. code-block:: bash -**URI Scheme** + java -cp alluxio-core-client-1.3.0-jar-with-dependencies.jar:build/h2o.jar water.H2OApp -An Alluxio data source is referenced using ``alluxio://`` schema and location of Alluxio master. For example, + **URI scheme** -.. code-block:: bash + An Alluxio data source is referenced using the ``alluxio://`` schema and the location of the Alluxio master. For example, - alluxio://localhost:19998/iris.csv + .. code-block:: bash -**core-site.xml Configuration** + alluxio://localhost:19998/iris.csv -Not supported. + ``core-site.xml`` **configuration** -IBM Swift Object Storage -'''''''''''''''''''''''' + Not supported. -**Required Library** + .. tab:: IBM Swift Object Storage -To access IBM Object Store (which can be exposed via Bluemix or Softlayer), IBM's HDFS driver ``hadoop-openstack.jar`` is required. The driver can be obtained, for example, by running BigInsight instances at location ``/usr/iop/4.2.0.0/hadoop-mapreduce/hadoop-openstack.jar``. + **Required library** -Note: The jar available at Maven central is not compatible with IBM Swift Object Storage. + To access IBM Object Store (which can be exposed via Bluemix or Softlayer), IBM's HDFS driver ``hadoop-openstack.jar`` is required. The driver can be obtained, for example, by running BigInsight instances at the following location: ``/usr/iop/4.2.0.0/hadoop-mapreduce/hadoop-openstack.jar``. -**H2O Command Line** + .. caution:: -.. code-block:: bash + The JAR file available at Maven central is not compatible with IBM Swift Object Storage. - java -cp hadoop-openstack.jar:h2o.jar water.H2OApp + **H2O-3 command line** + + .. code-block:: bash -**URI Scheme** + java -cp hadoop-openstack.jar:h2o.jar water.H2OApp -Data source is available under the regular Swift URI structure: ``swift://./path/to/file`` For example, + **URI scheme** -.. code-block:: bash + The data source is available under the regular Swift URI structure: ``swift://<container>.<service>/path/to/file``. For example: - swift://smalldata.h2o/iris.csv - -**core-site.xml Configuration** - -The core-site.xml needs to be configured with Swift Object Store parameters. These are available in the Bluemix/Softlayer management console. - -.. code:: xml - <configuration> - <property> - <name>fs.swift.service.SERVICE.auth.url</name> - <value>https://identity.open.softlayer.com/v3/auth/tokens</value> - </property> - <property> - <name>fs.swift.service.SERVICE.project.id</name> - <value>...</value> - </property> - <property> - <name>fs.swift.service.SERVICE.user.id</name> - <value>...</value> - </property> - <property> - <name>fs.swift.service.SERVICE.password</name> - <value>...</value> - </property> - <property> - <name>fs.swift.service.SERVICE.region</name> - <value>dallas</value> - </property> - <property> - <name>fs.swift.service.SERVICE.public</name> - <value>false</value> - </property> - </configuration> -Google Cloud Storage Connector for Hadoop & Spark -''''''''''''''''''''''''''''''''''''''''''''''''' - -**Required Library** - -To access the Google Cloud Store Object Store, Google's cloud storage connector, ``gcs-connector-latest-hadoop2.jar`` is required. The official documentation and driver can be found `here `__. -**H2O Command Line** -.. code-block:: bash - H2O on Hadoop: - hadoop jar h2o-driver.jar -libjars /path/to/gcs-connector-latest-hadoop2.jar - Sparkling Water - export SPARK_CLASSPATH=/home/nick/spark-2.0.2-bin-hadoop2.6/lib_managed/jar/gcs-connector-latest-hadoop2.jar - sparkling-water-2.0.5/bin/sparkling-shell --conf "spark.executor.memory=10g" + .. code-block:: bash -**URI Scheme** + swift://smalldata.h2o/iris.csv -Data source is available under the regular Google Storage URI structure: ``gs:///path/to/file`` For example, + ``core-site.xml`` **configuration** + The ``core-site.xml`` needs to be configured with Swift Object Store parameters. These are available in the Bluemix/Softlayer management console. + .. code:: xml + <configuration> + <property> + <name>fs.swift.service.SERVICE.auth.url</name> + <value>https://identity.open.softlayer.com/v3/auth/tokens</value> + </property> + <property> + <name>fs.swift.service.SERVICE.project.id</name> + <value>...</value> + </property> + <property> + <name>fs.swift.service.SERVICE.user.id</name> + <value>...</value> + </property> + <property> + <name>fs.swift.service.SERVICE.password</name> + <value>...</value> + </property> + <property> + <name>fs.swift.service.SERVICE.region</name> + <value>dallas</value> + </property> + <property> + <name>fs.swift.service.SERVICE.public</name> + <value>false</value> + </property> + </configuration> .. tab:: Google Cloud Storage Connector + + For Hadoop and Spark. + + **Required library** + + To access Google Cloud Storage, Google's cloud storage connector, ``gcs-connector-latest-hadoop2.jar``, is required. See `the official documentation and driver `__. + + **H2O-3 command line** + + .. code-block:: bash + + # H2O-3 on Hadoop: + hadoop jar h2odriver.jar -libjars /path/to/gcs-connector-latest-hadoop2.jar + + # Sparkling Water: + export SPARK_CLASSPATH=/home/nick/spark-2.0.2-bin-hadoop2.6/lib_managed/jar/gcs-connector-latest-hadoop2.jar + sparkling-water-2.0.5/bin/sparkling-shell --conf "spark.executor.memory=10g" + + **URI scheme** + + The data source is available under the regular Google Storage URI structure: ``gs://<bucket>/path/to/file``. For example: + + .. code-block:: bash - gs://mybucket/iris.csv + gs://mybucket/iris.csv -**core-site.xml Configuration** + ``core-site.xml`` **configuration** -core-site.xml must be configured for at least the following properties (class, project-id, bucketname) as shown in the example below. A full list of configuration options is found `here `__. + The ``core-site.xml`` must be configured for at least the following properties (as shown in the following example): -.. code:: xml + - class + - project-id + - bucketname - <configuration> - <property> - <name>fs.gs.impl</name> - <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value> - </property> - <property> - <name>fs.gs.project.id</name> - <value>my-google-project-id</value> - </property> - <property> - <name>fs.gs.system.bucket</name> - <value>mybucket</value> - </property> - </configuration> + See the `full list of configuration options `__. + + .. code:: xml + + <configuration> + <property> + <name>fs.gs.impl</name> + <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value> + </property> + <property> + <name>fs.gs.project.id</name> + <value>my-google-project-id</value> + </property> + <property> + <name>fs.gs.system.bucket</name> + <value>mybucket</value> + </property> + </configuration> .. _direct_hive_import: -Direct Hive Import +Direct Hive import ~~~~~~~~~~~~~~~~~~ -H2O supports direct ingestion of data managed by Hive in Hadoop. This feature is available only when H2O is running as a Hadoop job. Internally H2O uses metadata in Hive Metastore database to determine the location and format of given Hive table. H2O then imports data directly from HDFS so limitations of supported formats mentioned above apply. Data from hive can pulled into H2O using ``import_hive_table`` function. H2O can read Hive table metadata two ways - either via direct Metastore access or via JDBC.
+H2O-3 supports direct ingestion of data managed by Hive in Hadoop. This feature is available only when H2O-3 is running as a Hadoop job. Internally, H2O-3 uses metadata in the Hive Metastore database to determine the location and format of a given Hive table. H2O-3 then imports data directly from HDFS, so the limitations of the supported formats mentioned above apply. Data from Hive can be pulled into H2O-3 using the ``import_hive_table`` function. H2O-3 can read Hive table metadata two ways: -**Note**: When ingesting data from Hive in Hadoop, direct Hive import is preferred over :ref:`hive2`. +- Direct Metastore access +- JDBC + +.. tip:: + + When ingesting data from Hive in Hadoop, direct Hive import is preferred over :ref:`hive2`. Requirements '''''''''''' -- The user running H2O must have read access to Hive and the files it manages. - -- For Direct Metastore access, the Hive jars and configuration must be present on H2O job classpath - either by adding it to yarn.application.classpath (or similar property for your resource manger of choice) or by adding Hive jars and configuration to libjars. - -- For JDBC metadata access, the Hive JDBC Driver must be on H2O job classpath. +- You must have read access to Hive and the files it manages. +- For direct metastore access, the Hive JARs and configuration must be present on the H2O-3 job classpath. You can achieve this either by adding it to the ``yarn.application.classpath`` (or similar property for your resource manager of choice) or by adding Hive JARs and configuration to ``-libjars``. +- For JDBC metadata access, the Hive JDBC Driver must be on the H2O-3 job classpath. Limitations ''''''''''' -- The imported table must be stored in a :ref:`format supported by H2O`. -- CSV: The Hive table property ``skip.header.line.count`` is currently not supported. CSV files with header rows will be imported with the header row as data. -- Partitioned tables with different storage formats. H2O supports importing partitioned tables that use different storage formats for different partitions; however in some cases (for example large number of small partitions), H2O may run out of memory while importing, even though the final data would easily fit into the memory allocated to the H2O cluster. +- The imported table must be stored in a :ref:`format supported by H2O-3`. +- (CSV) The Hive table property ``skip.header.line.count`` is not supported. CSV files with header rows will be imported with the header row as data. +- (Partitioned tables with different storage formats) H2O-3 supports importing partitioned tables that use different storage formats for different partitions; however, in some cases (for example, a large number of small partitions), H2O-3 may run out of memory while importing, even though the final data would easily fit into the memory allocated to the H2O-3 cluster. -Importing Examples -'''''''''''''''''' +Examples of importing ''''''''''''''''''''' -Example 1: Access Metadata via Metastore -######################################## +Example 1: Access metadata through metastore +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This example shows how to access metadata via the metastore. +This example shows how to access metadata through the metastore. -1. 
Start the H2O jar in the terminal with your downloaded Hive JDBC driver in the classpath +1. Start the H2O JAR in the terminal with your downloaded Hive JDBC driver in the classpath: .. code-block:: bash - # start the h2o.jar + # start the h2o.jar: hadoop jar h2odriver.jar -libjars hive-jdbc-standalone.jar -nodes 3 -mapperXmx 6g -2. Import data in R or Python. +2. Import data in Python or R. .. tabs:: + .. code-tab:: python - .. code-tab:: r R + # basic import + basic_import = h2o.import_hive_table("default", "table_name") + + # multi-format import + multi_format_enabled = h2o.import_hive_table("default", + "table_name", + allow_multi_format=True) + + # import with partition filter + with_partition_filter = h2o.import_hive_table("default", + "table_name", + [["2017", "02"]]) + + .. code-tab:: r R # basic import basic_import <- h2o.import_hive_table("default", "table_name") @@ -252,94 +287,73 @@ This example shows how to access metadata via the metastore. "table_name", [["2017", "02"]]) - .. code-tab:: python - # basic import - basic_import = h2o.import_hive_table("default", "table_name") +Example 2: Access metadata through JDBC +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - # multi-format import - multi_format_enabled = h2o.import_hive_table("default", - "table_name", - allow_multi_format=True) +This example shows how to access metadata through JDBC. - # import with partition filter - with_partition_filter = h2o.import_hive_table("default", - "table_name", - [["2017", "02"]]) - - -Example 2: Access Metadata via JDBC -################################### - -This example shows how to access metadata via JDBC. - -1. Start the H2O jar in the terminal with your downloaded Hive JDBC driver in the classpath +1. Start the H2O JAR in the terminal with your downloaded Hive JDBC driver in the classpath: .. code-block:: bash - # start the h2o.jar + # start the h2o.jar: hadoop jar h2odriver.jar -libjars hive-jdbc-standalone.jar -nodes 3 -mapperXmx 6g -2. Import data in R or Python. +2. Import data in Python or R. .. tabs:: - - .. code-tab:: r R + .. code-tab:: python # basic import of metadata via JDBC - basic_import <- h2o.import_hive_table("jdbc:hive2://hive-server:10000/default", "table_name") - + basic_import = h2o.import_hive_table("jdbc:hive2://hive-server:10000/default", "table_name") - .. code-tab:: python + .. code-tab:: r R # basic import of metadata via JDBC - basic_import = h2o.import_hive_table("jdbc:hive2://hive-server:10000/default", "table_name") + basic_import <- h2o.import_hive_table("jdbc:hive2://hive-server:10000/default", "table_name") -JDBC Databases +JDBC databases ~~~~~~~~~~~~~~ -Relational databases that include a JDBC (Java database connectivity) driver can be used as the source of data for machine learning in H2O. Currently supported SQL databases are MySQL, PostgreSQL, MariaDB, Netezza, Amazon Redshift, Teradata, and Hive. (Refer to :ref:`hive2` for more information.) Data from these SQL databases can be pulled into H2O using the ``import_sql_table`` and ``import_sql_select`` functions. +Relational databases that include a JDBC (Java database connectivity) driver can be used as the source of data for machine learning in H2O-3. The supported SQL databases are MySQL, PostgreSQL, MariaDB, Netezza, Amazon Redshift, Teradata, and Hive. (See :ref:`hive2` for more information.) Data from these SQL databases can be pulled into H2O-3 using the ``import_sql_table`` and ``import_sql_select`` functions. -Refer to the following articles for examples about using JDBC data sources with H2O. 
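For orientation before the detailed sections below, a minimal end-to-end JDBC ingest in Python might look like the following sketch. The connection URL, credentials, table, and column name are illustrative placeholders; the ``import_sql_table`` parameters are documented below.

.. code-block:: python

    import h2o

    # The cluster must be started with the JDBC driver on its classpath,
    # for example: java -cp /path/to/jdbc-driver.jar:h2o.jar water.H2OApp
    h2o.init()

    # All connection details here are illustrative.
    frame = h2o.import_sql_table(
        connection_url="jdbc:mysql://localhost:3306/mydb?&useSSL=false",
        table="my_table",
        username="me",
        password="mypassword"
    )

    # JDBC ingest treats categorical values as strings (see the note
    # below), so convert such columns explicitly where needed.
    frame["my_column"] = frame["my_column"].asfactor()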
+See the following articles for examples about using JDBC data sources with H2O-3. - `Setup postgresql database on OSX `__ - `Restoring DVD rental database into postgresql `__ -- `Building H2O GLM model using Postgresql database and JDBC driver `__ +- `Building H2O-3 GLM model using Postgresql database and JDBC driver `__ + +.. note:: + + The handling of categorical values is different between file ingest and JDBC ingest. JDBC ingest treats categorical values as strings. Strings are not compressed in any way in H2O-3 memory, and using the JDBC interface might require more memory and additional data post-processing (converting to categoricals explicitly). -**Note**: The handling of categorical values is different between file ingest and JDBC ingests. Te JDBC treats categorical values as Strings. Strings are not compressed in any way in H2O memory, and using the JDBC interface might need more memory and additional data post-processing (converting to categoricals explicitly). +``import_sql_table`` function +''''''''''''''''''''''''''''' -``import_sql_table`` -'''''''''''''''''''' -This function imports a SQL table to H2OFrame in memory. This function assumes that the SQL table is not being updated and is stable. Users can run multiple SELECT SQL queries concurrently for parallel ingestion. +This function imports a SQL table to an H2OFrame in memory. This function assumes that the SQL table is not being updated and is stable. You can run multiple SELECT SQL queries concurrently for parallel ingestion. .. tip:: -**Note**: Be sure to start the h2o.jar in the terminal with your downloaded JDBC driver in the classpath: + Be sure to start the ``h2o.jar`` in the terminal with your downloaded JDBC driver in the classpath: -:: - - java -cp : water.H2OApp + :: + + java -cp <path_to_h2o_jar>:<path_to_jdbc_driver_jar> water.H2OApp The ``import_sql_table`` function accepts the following parameters: -- ``connection_url``: The URL of the SQL database connection as specified by the Java Database Connectivity (JDBC) Driver. For example, "jdbc:mysql://localhost:3306/menagerie?&useSSL=false" -- ``table``: The name of the SQL table -- ``columns``: A list of column names to import from SQL table. Default is to import all columns. -- ``username``: The username for SQL server -- ``password``: The password for SQL server -- ``optimize``: Specifies to optimize the import of SQL table for faster imports. Note that this option is experimental. -- ``fetch_mode``: Set to DISTRIBUTED to enable distributed import. Set to SINGLE to force a sequential read by a single node from the database. +- ``connection_url``: The URL of the SQL database connection as specified by the Java Database Connectivity (JDBC) Driver. For example, ``jdbc:mysql://localhost:3306/menagerie?&useSSL=false``. +- ``table``: The name of the SQL table. +- ``columns``: A list of column names to import from SQL table. Defaults to importing all columns. +- ``username``: The username for the SQL server. +- ``password``: The password for the SQL server. +- ``optimize``: Specifies to optimize the import of the SQL table for faster imports. Note that this option is experimental. +- ``fetch_mode``: Set to ``DISTRIBUTED`` to enable distributed import. Set to ``SINGLE`` to force a sequential read by a single node from the database. - ``num_chunks_hint``: Optionally specify the number of chunks for the target frame. .. tabs:: - .. code-tab:: r R - - connection_url <- "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" - table <- "citibike20k" - username <- "root" - password <- "abc123" - my_citibike_data <- h2o.import_sql_table(connection_url, table, username, password) - .. code-tab:: python connection_url = "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" @@ -348,39 +362,39 @@ The ``import_sql_table`` function accepts the following parameters: password = "abc123" my_citibike_data = h2o.import_sql_table(connection_url, table, username, password) + .. code-tab:: r R + + connection_url <- "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" + table <- "citibike20k" + username <- "root" + password <- "abc123" + my_citibike_data <- h2o.import_sql_table(connection_url, table, username, password) -``import_sql_select`` -''''''''''''''''''''' +``import_sql_select`` function +'''''''''''''''''''''''''''''' -This function imports the SQL table that is the result of the specified SQL query to H2OFrame in memory. It creates a temporary SQL table from the specified sql_query. Users can run multiple SELECT SQL queries on the temporary table concurrently for parallel ingestion, and then drop the table. +This function imports the SQL table that is the result of the specified SQL query to an H2OFrame in memory. It creates a temporary SQL table from the specified ``sql_query``. You can run multiple SELECT SQL queries on the temporary table concurrently for parallel ingestion and then drop the table. -**Note**: Be sure to start the h2o.jar in the terminal with your downloaded JDBC driver in the classpath: +.. tip:: + + Be sure to start the ``h2o.jar`` in the terminal with your downloaded JDBC driver in the classpath: -:: - - java -cp : water.H2OApp + :: + + java -cp <path_to_h2o_jar>:<path_to_jdbc_driver_jar> water.H2OApp The ``import_sql_select`` function accepts the following parameters: -- ``connection_url``: URL of the SQL database connection as specified by the Java Database Connectivity (JDBC) Driver. For example, "jdbc:mysql://localhost:3306/menagerie?&useSSL=false" -- ``select_query``: SQL query starting with `SELECT` that returns rows from one or more database tables. -- ``username``: The username for the SQL server -- ``password``: The password for the SQL server -- ``optimize``: Specifies to optimize import of SQL table for faster imports. Note that this option is experimental. +- ``connection_url``: URL of the SQL database connection as specified by the Java Database Connectivity (JDBC) Driver. For example, ``jdbc:mysql://localhost:3306/menagerie?&useSSL=false``. +- ``select_query``: SQL query starting with ``SELECT`` that returns rows from one or more database tables. +- ``username``: The username for the SQL server. +- ``password``: The password for the SQL server. +- ``optimize``: Specifies to optimize import of the SQL table for faster imports. Note that this option is experimental. - ``use_temp_table``: Specifies whether a temporary table should be created by ``select_query``. - ``temp_table_name``: The name of the temporary table to be created by ``select_query``. -- ``fetch_mode``: Set to DISTRIBUTED to enable distributed import. Set to SINGLE to force a sequential read by a single node from the database. +- ``fetch_mode``: Set to ``DISTRIBUTED`` to enable distributed import. Set to ``SINGLE`` to force a sequential read by a single node from the database.
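The temporary-table parameters can be combined with a distributed read, as in this illustrative sketch (the connection details and table names are placeholders; only the parameters listed above are assumed):

.. code-block:: python

    import h2o
    h2o.init()

    # Stage the query result in a named temporary table and
    # read it in parallel across the cluster (illustrative values).
    bikes = h2o.import_sql_select(
        connection_url="jdbc:mysql://localhost:3306/mydb?&useSSL=false",
        select_query="SELECT bikeid FROM citibike20k",
        username="me",
        password="mypassword",
        use_temp_table=True,
        temp_table_name="tmp_citibike",
        fetch_mode="DISTRIBUTED"
    )

.. tabs:: - .. 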
code-tab:: r R - - connection_url <- "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" - select_query <- "SELECT bikeid from citibike20k" - username <- "root" - password <- "abc123" - my_citibike_data <- h2o.import_sql_select(connection_url, select_query, username, password) - - .. code-tab:: python connection_url = "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" @@ -389,22 +403,30 @@ The ``import_sql_select`` function accepts the following parameters: password = "abc123" my_citibike_data = h2o.import_sql_select(connection_url, select_query, username, password) + .. code-tab:: r R + + connection_url <- "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false" + select_query <- "SELECT bikeid from citibike20k" + username <- "root" + password <- "abc123" + my_citibike_data <- h2o.import_sql_select(connection_url, select_query, username, password) + .. _hive2: -Using the Hive 2 JDBC Driver -'''''''''''''''''''''''''''' +Hive JDBC driver +'''''''''''''''' -H2O can ingest data from Hive through the Hive v2 JDBC driver by providing H2O with the JDBC driver for your Hive version. A demo showing how to ingest data from Hive through the Hive v2 JDBC driver is available `here `__. The basic steps are described below. +H2O-3 can ingest data from Hive through the Hive JDBC driver (v2) by providing H2O-3 with the JDBC driver for your Hive version. Explore this `demo showing how to ingest data from Hive through the Hive v2 JDBC driver `__. The basic steps are described below. -**Notes**: +.. tip:: -- :ref:`direct_hive_import` is preferred over using the Hive 2 JDBC driver. -- H2O can only load data from Hive version 2.2.0 or greater due to a limited implementation of the JDBC interface by Hive in earlier versions. + - :ref:`direct_hive_import` is preferred over using the Hive JDBC driver. + - H2O-3 can only load data from Hive version 2.2.0 or greater due to a limited implementation of the JDBC interface by Hive in earlier versions. 1. Set up a table with data. - a. Retrieve the AirlinesTest dataset from `S3 `__. + a. Download `this AirlinesTest dataset from S3 `__. b. Run the CLI client for Hive: @@ -435,37 +457,29 @@ H2O can ingest data from Hive through the Hive v2 JDBC driver by providing H2O w FIELDS TERMINATED BY ',' LOCATION '/tmp'; - d. Import the data from the dataset. Note that the file must be present on HDFS in /tmp. + d. Import the data from the dataset (note that the file must be present on HDFS in ``/tmp``): .. code-block:: sql LOAD DATA INPATH '/tmp/AirlinesTest.csv' OVERWRITE INTO TABLE AirlinesTest -2. Retrieve the Hive JDBC client jar. +2. Retrieve the Hive JDBC client JAR. - - For Hortonworks, Hive JDBC client jars can be found on one of the edge nodes after you have installed HDP: ``/usr/hdp/current/hive-client/lib/hive-jdbc--standalone.jar``. More information is available `here `__. - - For Cloudera, install the JDBC package for your operating system, and then add ``/usr/lib/hive/lib/hive-jdbc--standalone.jar`` to your classpath. More information is available `here: `__. - - You can also retrieve this from Maven for the desire version using ``mvn dependency:get -Dartifact=groupId:artifactId:version``. + - For Hortonworks, Hive JDBC client JARs can be found on one of the edge nodes after you have installed HDP: ``/usr/hdp/current/hive-client/lib/hive-jdbc--standalone.jar``. See more `information on Hortonworks and the Hive JDBC client `__. 
+ - For Cloudera, install the JDBC package for your operating system, and then add ``/usr/lib/hive/lib/hive-jdbc--standalone.jar`` to your classpath. See more `information on Cloudera and the Hive JDBC client `__. + - You can also retrieve this from Maven for your desired version using ``mvn dependency:get -Dartifact=groupId:artifactId:version``. + +3. Add the Hive JDBC driver to H2O-3's classpath: -3. Add the Hive JDBC driver to H2O's classpath. .. code-block:: bash - # Add the Hive JDBC driver to H2O's classpath + # Add the Hive JDBC driver to H2O-3's classpath: java -cp hive-jdbc.jar: water.H2OApp -4. Initialize H2O in either R or Python and import data. +4. Initialize H2O-3 in either Python or R and import your data: .. tabs:: - - .. group-tab:: R - - .. code-block:: r - - # initialize h2o in R - library(h2o) - h2o.init(extra_classpath=["hive-jdbc-standalone.jar"]) - .. group-tab:: Python .. code-block:: python @@ -474,24 +488,18 @@ H2O can ingest data from Hive through the Hive v2 JDBC driver by providing H2O w import h2o h2o.init(extra_classpath = ["hive-jdbc-standalone.jar"]) + .. group-tab:: R -5. After the jar file with JDBC driver is added, then data from the Hive databases can be pulled into H2O using the aforementioned ``import_sql_table`` and ``import_sql_select`` functions. - - .. tabs:: - - .. code-tab:: r R + .. code-block:: r - connection_url <- "jdbc:hive2://localhost:10000/default" - select_query <- "SELECT * FROM AirlinesTest;" - username <- "username" - password <- "changeit" + # initialize h2o in R + library(h2o) + h2o.init(extra_classpath=["hive-jdbc-standalone.jar"]) - airlines_dataset <- h2o.import_sql_select(connection_url, - select_query, - username, - password) +5. After the JAR file with the JDBC driver is added, the data from the Hive databases can be pulled into H2O-3 using the aforementioned ``import_sql_table`` and ``import_sql_select`` functions. - .. code-tab :: python + .. tabs:: + .. code-tab:: python connection_url = "jdbc:hive2://localhost:10000/default" select_query = "SELECT * FROM AirlinesTest;" @@ -502,41 +510,53 @@ H2O can ingest data from Hive through the Hive v2 JDBC driver by providing H2O w select_query, username, password) + .. code-tab:: r R + + connection_url <- "jdbc:hive2://localhost:10000/default" + select_query <- "SELECT * FROM AirlinesTest;" + username <- "username" + password <- "changeit" + airlines_dataset <- h2o.import_sql_select(connection_url, + select_query, + username, + password) -Connecting to Hive in a Kerberized Hadoop Cluster -################################################# -When importing data from Kerberized Hive on Hadoop, it is necessary to configure the h2odriver to authenticate with the Hive instance via -a delegation token. Since Hadoop does not generate delegation tokens for Hive automatically, it is necessary to provide the h2odriver with additional configurations. +Connect to Hive in a Kerberized Hadoop cluster +'''''''''''''''''''''''''''''''''''''''''''''' -H2O is able to generate Hive delegation tokens in three modes: +When importing data from Kerberized Hive on Hadoop, it's necessary to configure the h2odriver to authenticate with the Hive instance through a delegation token. Since Hadoop does not generate delegation tokens for Hive automatically, it's necessary to provide the h2odriver with additional configurations. -- On the driver side, a token can be generated on H2O cluster start. -- On the mapper side, a token refresh thread is started, periodically re-generating the token. 
-- A combination of both of the above. +H2O-3 is able to generate Hive delegation tokens in three modes: -H2O arguments used to configure the JDBC URL for Hive delegation token generation: +- `On the driver side <#generate-the-token-in-the-driver>`__, a token can be generated on H2O-3 cluster start. +- `On the mapper side <#generate-the-token-in-the-mapper-and-token-refresh>`__, a token refresh thread is started, periodically re-generating the token. +- `A combination of both of the above <#generate-the-token-in-the-driver-with-refresh-in-the-mapper>`__. -- ``hiveHost`` - The full address of HiveServer2, for example ``hostname:10000`` -- ``hivePrincipal`` - Hiveserver2 Kerberos principal, for example ``hive/hostname@DOMAIN.COM`` -- ``hiveJdbcUrlPattern`` - (optional) Can be used to further customize the way the driver constructs the Hive JDBC URL. The default pattern used is ``jdbc:hive2://{{host}}/;{{auth}}`` where ``{{auth}}`` is replaced by ``principal={{hivePrincipal}}`` or ``auth=delegationToken`` based on context +The following are the H2O-3 arguments used to configure the JDBC URL for Hive delegation token generation: -**Note on libjars:** +- ``hiveHost`` - The full address of HiveServer2 (for example, ``hostname:10000``). +- ``hivePrincipal`` - The Hiveserver2 Kerberos principal (for example, ``hive/hostname@DOMAIN.COM``). +- ``hiveJdbcUrlPattern`` - (optional) Can be used to further customize the way the driver constructs the Hive JDBC URL. The default pattern used is ``jdbc:hive2://{{host}}/;{{auth}}`` where ``{{auth}}`` is replaced by ``principal={{hivePrincipal}}`` or ``auth=delegationToken`` based on the context. -In the examples below, we are omitting the ``-libjars`` option of the ``hadoop.jar`` command because it is not necessary for token generation. You may need to add it to be able to import data from Hive via JDBC. +.. attention:: + + In the following examples, we omit the ``-libjars`` option of the ``hadoop.jar`` command because it is not necessary for token generation. You may need to add it to be able to import data from Hive via JDBC. -Generating the Token in the Driver -################################## +Generate the token in the driver +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The advantage of this approach is that the keytab does not need to be distributed into the Hadoop cluster. -Requirements: +**Requirements** + +- The Hive JDBC driver is on h2odriver classpath through the ``HADOOP_CLASSPATH`` environment variable. (Only used to acquire the Hive delegation token.) +- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. See `Connect to Hive in a Kerberized Hadoop cluster <#connect-to-hive-in-a-kerberized-hadoop-cluster>`__ for more details. -- The Hive JDBC driver is on h2odriver classpath via the HADOOP_CLASSPATH environment variable. (Only used to acquire Hive delegation token.) -- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. (See above for details.) +**Example** -Example command: +The following is an example of generating a token in the driver: .. 
code-block:: bash @@ -546,20 +566,22 @@ Example command: -hiveHost hostname:10000 -hivePrincipal hive/hostname@EXAMPLE.COM \ -hiveJdbcUrlPattern "jdbc:hive2://{{host}}/;{{auth}};ssl=true;sslTrustStore=/path/to/keystore.jks" -Generating the Token in the Mapper and Token Refresh -#################################################### +Generate the token in the mapper and token refresh +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This approach generates a Hive delegation token after the H2O cluster is fully started up and then periodically refreshes the token. Delegation tokens usually have a limited life span, and for long-running H2O clusters, they need to be refreshed. For this to work, the user's keytab and principal need to available to the H2O Cluster Leader node. +This approach generates a Hive delegation token after the H2O-3 cluster is fully started up and then periodically refreshes the token. Delegation tokens usually have a limited life span, and for long-running H2O-3 clusters, they need to be refreshed. For this to work, your keytab and principal need to be available to the H2O-3 cluster leader node. -Requirements: +**Requirements** -- The Hive JDBC driver is on the h2o mapper classpath (either via libjars or YARN configuration). -- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. (See above for details.) -- The ``principal`` argument is set with the value of the users's Kerberos principal. -- The ``keytab`` argument set pointing to the file with the user's Kerberos keytab file. +- The Hive JDBC driver is on the h2o mapper classpath (either through ``-libjars`` or YARN configuration). +- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. See `Connect to Hive in a Kerberized Hadoop cluster <#connect-to-hive-in-a-kerberized-hadoop-cluster>`__ for more details. +- The ``principal`` argument is set with the value of your Kerberos principal. +- The ``keytab`` argument is set, pointing to your Kerberos keytab file. - The ``refreshHiveTokens`` argument is present. -Example command: +**Example** + +The following is an example of how to set up a token refresh using the h2o mapper classpath: .. code-block:: bash @@ -569,24 +591,30 @@ Example command: -principal user/host@DOMAIN.COM -keytab path/to/user.keytab \ -refreshHiveTokens -**Note on refreshHiveTokens:** The provided keytab will be copied over to the machine running the H2O Cluster leader node. For this reason, we strongly recommended that both YARN and HDFS be secured with encryption. +.. important:: + + The provided keytab (``refreshHiveTokens``) will be copied over to the machine running the H2O-3 cluster leader node. For this reason, we strongly recommend that both YARN and HDFS be secured with encryption. -**Note on generating the refreshing HDFS delegation tokens:** In case generation of the refreshing HDFS delegation tokens is required, the ``-refreshHdfsTokens`` argument has to be present. In specific deployments (eg. on CDP with IDbroker security) you might need to enable S3A token refresh to acquire (and keep refreshing) delegation tokens to access S3 buckets. This option is being enabled by the ``refreshS3ATokens`` argument. +.. note:: + + In case generation of the refreshing HDFS delegation tokens is required, the ``-refreshHdfsTokens`` argument has to be present. In specific deployments (e.g. 
on CDP with IDbroker security) you might need to enable S3A token refresh to acquire (and keep refreshing) delegation tokens to access S3 buckets. This option is enabled by the ``refreshS3ATokens`` argument. -Generating the Token in the Driver with Refresh in the Mapper -############################################################# +Generate the token in the driver with refresh in the mapper +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This approach is a combination of the two previous scenarios. Hive delegation token is first generated by the h2odriver and then periodically refreshed by the H2O Cluster leader node. +This approach is a combination of the two previous scenarios. The Hive delegation token is first generated by the h2odriver and then periodically refreshed by the H2O-3 cluster leader node. This is the best-of-both-worlds approach. The token is generated first in the driver and is available immediately on cluster start. It is then periodically refreshed and never expires. -Requirements: +**Requirements** -- The Hive JDBC driver is on the h2o driver and mapper classpaths. -- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. (See above for details.) +- The Hive JDBC driver is on the h2odriver and mapper classpaths. +- The ``hiveHost``, ``hivePrincipal`` and optionally ``hiveJdbcUrlPattern`` arguments are present. See `Connect to Hive in a Kerberized Hadoop cluster <#connect-to-hive-in-a-kerberized-hadoop-cluster>`__ for more details. - The ``refreshHiveTokens`` argument is present. -Example command: +**Example** + +The following is an example of generating a token in the driver and setting up token refresh using the h2o mapper classpath: .. code-block:: bash @@ -597,22 +625,22 @@ Example command: -refreshHiveTokens -Using a Delegation Token when Connecting to Hive via JDBC -######################################################### +Use a delegation token when connecting to Hive through JDBC +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When running the actual data-load, specify the JDBC URL with the delegation token parameter: .. tabs:: - .. code-tab:: r R + .. code-tab:: python - my_citibike_data <- h2o.import_sql_table( + my_citibike_data = h2o.import_sql_table( "jdbc:hive2://hostname:10000/default;auth=delegationToken", "citibike20k", "", "" ) - .. code-tab:: python + .. 
code-tab:: r R - my_citibike_data = h2o.import_sql_table( + my_citibike_data <- h2o.import_sql_table( "jdbc:hive2://hostname:10000/default;auth=delegationToken", "citibike20k", "", "" ) diff --git a/h2o-docs/src/product/grid-search.rst b/h2o-docs/src/product/grid-search.rst index a3f82a8a9de2..e98ae71c41a7 100644 --- a/h2o-docs/src/product/grid-search.rst +++ b/h2o-docs/src/product/grid-search.rst @@ -116,8 +116,8 @@ Grid Search Examples h2o.init() # Import a sample binary outcome dataset into H2O - data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") + test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv") # Identify predictors and response y <- "response" @@ -175,8 +175,8 @@ Grid Search Examples h2o.init() # Import a sample binary outcome dataset into H2O - data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") + test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv") # Identify predictors and response x = data.columns diff --git a/h2o-docs/src/product/h2o-client.rst b/h2o-docs/src/product/h2o-client.rst index 1074360f89c6..4ad739f7c804 100644 --- a/h2o-docs/src/product/h2o-client.rst +++ b/h2o-docs/src/product/h2o-client.rst @@ -1,26 +1,26 @@ -H2O Clients -=========== +H2O-3 clients +============= -These clients allow you to connect to and interact with H2O. +These clients allow you to connect to and interact with H2O-3. H2O Flow -------- -H2O Flow is a Web based (GUI) user interface. It allows users to interactively run their H2O machine learning workflows and iteratively improve them. It combines code execution, text, mathematics, plots, and rich media in a single document. Documentation for H2O Flow can be found `here `__. +H2O Flow is a web-based graphical user interface (GUI). It lets you interactively run your H2O-3 machine learning workflows and iteratively improve them. H2O Flow combines code execution, text, mathematics, plots, and rich media in a single document. See the `documentation for H2O Flow `__. -R Client -------- -R users can use H2O-R library which internally uses H2O REST API calls to connect to H2O (Server) and allows users to run their H2O workflow via R. Documentation for the H2O-R Client can be found `here <../h2o-r/docs/index.html>`__. +R client -------- +R users can use the H2O-R library, which internally uses H2O REST API calls to connect to H2O-3 (Server) and lets you run your H2O-3 workflow via R. See the `documentation for the H2O-R Client <../h2o-r/docs/index.html>`__. -Python Client ------------- +Python client ------------- -Python users can connect to H2O using the H2O Python package that internally uses H2O REST API calls to connect to H2O (Server) and allows users to run their H2O workflow via Python. +Python users can connect to H2O-3 using the H2O Python package, which internally uses H2O REST API calls to connect to H2O-3 (Server) and lets you run your H2O-3 workflow via Python. 
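For example, a minimal Python session might look like the following sketch (the dataset URL and model settings are illustrative):

.. code-block:: python

    import h2o
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    # Connect to (or start) a local H2O-3 cluster.
    h2o.init()

    # Illustrative dataset; any supported path or URL works.
    iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

    # Train a small GBM; "class" is the response column in this dataset.
    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(y="class", training_frame=iris)

    print(model.model_performance())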
See the `documentation for the H2O-Python Client <../h2o-py/docs/index.html>`__. -Sklearn Support +Sklearn support ~~~~~~~~~~~~~~~ -Most H2O estimators available in the H2O-Python client can also be used in the standard ``sklearn`` API. The ``h2o.sklearn`` module provides a collection of wrappers auto-generated on top of the original estimators and transformers, as well as on top of ``H2OAutoML``. +Most H2O-3 estimators available in the H2O-Python client can also be used in the standard ``sklearn`` API. The ``h2o.sklearn`` module provides a collection of wrappers auto-generated on top of the original estimators and transformers, as well as on top of ``H2OAutoML``. -For examples on how to integrate H2O estimators into your Sklearn workflow, please click `here `__. \ No newline at end of file +See `examples on how to integrate H2O-3 estimators into your Sklearn workflow `__. \ No newline at end of file diff --git a/h2o-docs/src/product/index.rst b/h2o-docs/src/product/index.rst index bd4a4f4096b5..db42db053a4a 100644 --- a/h2o-docs/src/product/index.rst +++ b/h2o-docs/src/product/index.rst @@ -3,20 +3,22 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -======== Overview ======== -Welcome to the H2O documentation site! Depending on your area of interest, select a learning path from the sidebar, or look at the full content outline below. +Welcome to the H2O-3 documentation site! Select a learning path from the sidebar or browse through the full content outline below. + +We're excited you're interested in learning more about H2O-3. If you have questions or ideas to share, please post them to the H2O community site on `Stack Overflow `__. -We're glad you're interested in learning more about H2O. If you have questions or ideas to share, please post them to the `H2O community site on Stack Overflow `__. +Additional resources +-------------------- -Additional Resources: +The following are additional resources for learning more about H2O-3: -- See how are customers are using H2O at https://www.h2o.ai/customers/. -- Keep up to date with the latest H2O blogs at https://www.h2o.ai/blog/. -- Review projects, applications, research papers, tutorials, courses, and books that use H2O at https://github.com/h2oai/awesome-h2o. -- Learn about securing your installation by following our `security guidelines `__. +- `See how customers are using H2O-3 `__. +- Keep up to date with `the latest H2O-3 blogs `__. +- `Explore Awesome H2O `__ to review projects, applications, research papers, tutorials, courses, and books that use H2O-3. +- Learn about securing your installation by following `H2O-3's security guidelines `__. .. 
toctree:: :maxdepth: 2 diff --git a/h2o-docs/src/product/performance-and-prediction.rst b/h2o-docs/src/product/performance-and-prediction.rst index 9c8bd70089bc..e8666204973f 100644 --- a/h2o-docs/src/product/performance-and-prediction.rst +++ b/h2o-docs/src/product/performance-and-prediction.rst @@ -2428,8 +2428,8 @@ Allowed options include: h2o.init() # Import a sample binary outcome dataset into H2O - data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") + test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv") # Identify predictors and response y <- "response" @@ -2470,8 +2470,8 @@ Allowed options include: h2o.init() # Import a sample binary outcome dataset into H2O - data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv") + test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv") # Identify predictors and response x = data.columns diff --git a/h2o-docs/src/product/starting-h2o.rst b/h2o-docs/src/product/starting-h2o.rst index 379232ea8d78..ef89954c279c 100644 --- a/h2o-docs/src/product/starting-h2o.rst +++ b/h2o-docs/src/product/starting-h2o.rst @@ -1,118 +1,48 @@ -Starting H2O -============ +Starting H2O-3 +============== -There are a variety of ways to start H2O, depending on which client you would like to use. The instructions below assume that you already downloaded and installed H2O. If you have not, then please refer to the `Downloading & Installing H2O `__ section. - -From R ------- - -Use the ``h2o.init()`` method to initialize H2O. This method accepts the following options. Note that in most cases, simply using ``h2o.init()`` is all that a user is required to do. - -- ``nthreads``: This launches H2O using all available CPUs and is only applicable if you launch H2O locally using R. If you start H2O locally outside of R or start H2O on Hadoop, the nthreads parameter is not applicable. -- ``ip``: The IP address of the server where H2O is running. -- ``port``: The port number of the H2O server. -- ``startH2O``: (Optional) A logical value indicating whether to try to start H2O from R if no connection with H2O is detected. This is only possible if ``ip = "localhost"`` or ``ip = "127.0.0.1"``. If an existing connection is detected, R does not start H2O. -- ``forceDL``: (Optional) A logical value indicating whether to force download of the H2O executable. This defaults to FALSE, so the executable will only be downloaded if it does not already exist in the H2O R library resources directory at h2o/java/h2o.jar. -- ``enable_assertions``: (Optional) A logical value indicating whether H2O should be launched with assertions enabled. This is used mainly for error checking and debugging purposes. -- ``license``: (Optional) A character string value specifying the full path of the license file. -- ``max_log_file_size``: Maximum size of INFO and DEBUG log files. The file is rolled over after the specifized size has been reached. The range for this option is 1MB to 99999MB. The value defaults to 3MB. 
-- ``max_mem_size``: (Optional) A character string specifying the maximum size, in bytes, of the memory allocation pool to H2O. This value must be a multiple of 1024 greater than 2MB. Append the letter ``m`` or ``M`` to indicate megabytes, or ``g`` or ``G`` to indicate gigabytes. - - **Note:** If ``max_mem_size`` is not defined, then the amount of memory that H2O allocates will be determined by the default memory of the Java Virtual Machine (JVM). This amount depends on the Java version, but it will generally be 25% of the machine's physical memory. - -- ``min_mem_size``: (Optional) A character string specifying the minimum size, in bytes, of the memory allocation pool to H2O. This value must a multiple of 1024 greater than 2MB. Append the letter ``m`` or ``M`` to indicate megabytes, or ``g`` or ``G`` to indicate gigabytes. -- ``ice_root``: (Optional) A directory to handle object spillage. The default varies by OS. -- ``strict_version_check``: (Optional) Setting this to FALSE is unsupported and should only be done when advised by technical support. -- ``ignore_config``: (Optional) This option allows you to specify whether to perform processing of a .h2oconfig file. When h2o.init() is specified, a call to a config reader method is invoked. This call can result in path issues when there is no "root" (for example, with a Windows network drive) because the config file reader searches up to "root." When there is no "root", the path to search will continue to expand, eventually result in an error. This value defaults to False. -- ``proxy``: (Optional) A character string specifying the proxy path. -- ``https``: (Optional) Set this to TRUE to use https instead of http. -- ``insecure``: (Optional) Set this to TRUE to disable SSL certificate checking. -- ``username``: (Optional) The username to log in with. -- ``password``: (Optional) The password to log in with. -- ``cookies``: (Optional) Vector (or list) of cookies to add to request. -- ``context_path``: (Optional) The last part of connection URL. For example, **http://:/** -- ``use_spnego``: (Optional) Set this to TRUE to connect to an H2O cluster with SPNEGO authentication. This defaults to FALSE and is mutually exclusive with ``username`` and ``password``. - -By default, ``h2o.init()`` first checks if an H2O instance is connectible. If it cannot connect and ``start = TRUE`` with ``ip = "localhost"``, it will attempt to start an instance of H2O at localhost:54321. If an open ip and port of your choice are passed in, then this method will attempt to start an H2O instance at that specified ip and port. - -When initializing H2O locally, this method searches for the h2o.jar file in the R library resources (system.file("java", "h2o.jar", package = "h2o")), and if the file does not exist, it will automatically attempt to download the correct version from Amazon S3. The user must have Internet access for this process to be successful. - -Once connected, the ``h2o.init()`` method checks to see if the local H2O R package version matches the version of H2O running on the server. If there is a mismatch and the user indicates he/she wants to upgrade, it will remove the local H2O R package and download/install the H2O R package from the server. - -**Note**: You may want to manually upgrade your package rather than waiting until being prompted. This requires that you fully uninstall and reinstall the H2O package and the H2O client package. You must unload packages running in the environment before upgrading. 
We also recommended that you restart R or R studio after upgrading. -Example -~~~~~~~ -.. substitution-code-block:: r - - library h2o - h2o.init() - - H2O is not running yet, starting it now... - - Note: In case of errors look at the following log files: - /var/folders/yl/cq5nhky53hjcl9wrqxt39kz80000gn/T//RtmpKtZXsy/h2o_techwriter_started_from_r.out - /var/folders/yl/cq5nhky53hjcl9wrqxt39kz80000gn/T//RtmpKtZXsy/h2o_techwriter_started_from_r.err - - java version "1.8.0_25" - Java(TM) SE Runtime Environment (build 1.8.0_25-b17) - Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) - - Starting H2O JVM and connecting: .. Connection successful! - - R is connected to the H2O cluster: - H2O cluster uptime: 2 seconds 73 milliseconds - H2O cluster timezone: America/Los_Angeles - H2O data parsing timezone: UTC - H2O cluster version: |version| - H2O cluster version age: 9 days - H2O cluster name: H2O_started_from_R_angelabartz_dxr691 - H2O cluster total nodes: 1 - H2O cluster total memory: 2.00 GB - H2O cluster total cores: 8 - H2O cluster allowed cores: 8 - H2O cluster healthy: TRUE - H2O Connection ip: localhost - H2O Connection port: 54321 - H2O Connection proxy: NA - H2O Internal Security: FALSE - H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 - R Version: R version 3.5.1 (2018-07-02) +There are a variety of ways to start H2O-3 depending on which client you would like to use. The instructions below assume that you already downloaded and installed H2O-3. If you have not, then please refer to the `Downloading & Installing H2O `__ section. From Python ----------- -Use the ``h2o.init()`` function to initialize H2O. This function accepts the following options. Note that in most cases, simply using ``h2o.init()`` is all that a user is required to do. +Use the ``h2o.init()`` function to initialize H2O-3. In most cases, simply using ``h2o.init()`` is all you need to do. + +Options to initialize H2O-3 in Python +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This function accepts the following options: -- ``url``: Full URL of the server to connect to. (This can be used instead of ``ip`` + ``port`` + ``https``.) -- ``ip``: The ip address (or host name) of the server where H2O is running. -- ``port``: Port number that H2O service is listening to. -- ``name``: Cluster name. If None while connecting to an existing cluster it will not check the cluster name. If set then will connect only if the target cluster name matches. If no instance is found and decides to start a local one then this will be used as the cluster name or a random one will be generated if set to None. -- ``https``: Set to True to connect via https:// instead of http://. -- ``insecure``: When using https, setting this to True will disable SSL certificates verification. + +- ``url``: Full URL of the server to connect to (this can be used instead of ``ip`` + ``port`` + ``https``). +- ``ip``: The IP address (or host name) of the server where H2O-3 is running. +- ``port``: Port number that the H2O-3 service is listening to. +- ``name``: Cluster name. If ``None``, while connecting to an existing cluster, it will not check the cluster name. If set, then it will connect only if the target cluster name matches. If no instance is found and a local one is started, this will be used as the cluster name (or a random one will be generated if set to ``None``). +- ``https``: Set to ``True`` to connect via https:// instead of http://. 
+- ``insecure``: When using https, setting this to ``True`` will disable SSL certificates verification. - ``username``: The username to log in with when using basic authentication. - ``password``: The password to log in with when using basic authentication. - ``cookies``: Cookie (or list of) to add to each request. - ``proxy``: The proxy server address. -- ``start_h2o``: If False, do not attempt to start an H2O server when a connection to an existing one failed. -- ``nthreads``: "Number of threads" option when launching a new H2O server. -- ``ice_root``: The directory for temporary files for the new H2O server. -- ``log_dir``: Directory for H2O logs to be stored if a new instance is started. Ignored if connecting to an existing node. -- ``log_level``: The logger level for H2O if a new instance is started. One of TRACE,DEBUG,INFO,WARN,ERRR,FATA. Default is INFO. Ignored if connecting to an existing node. -- ``enable_assertions``: Enable assertions in Java for the new H2O server. -- ``max_mem_size``: Maximum memory to use for the new H2O server. Integer input will be evaluated as gigabytes. Other units can be specified by passing in a string (e.g. "160M" for 160 megabytes). +- ``start_h2o``: If ``False``, do not attempt to start an H2O-3 server when a connection to an existing one failed. +- ``nthreads``: "Number of threads" option when launching a new H2O-3 server. +- ``ice_root``: The directory for temporary files for the new H2O-3 server. +- ``log_dir``: Directory for H2O-3 logs to be stored if a new instance is started. Ignored if connecting to an existing node. +- ``log_level``: The logger level for H2O-3 if a new instance is started. One of ``TRACE``, ``DEBUG``, ``INFO``, ``WARN``, ``ERRR``, or ``FATAL``. Default is ``INFO``. Ignored if connecting to an existing node. +- ``enable_assertions``: Enable assertions in Java for the new H2O-3 server. +- ``max_mem_size``: Maximum memory to use for the new H2O-3 server. Integer input will be evaluated as gigabytes. Other units can be specified by passing in a string (e.g. ``"160M"`` for 160 megabytes). - **Note:** If ``max_mem_size`` is not defined, then the amount of memory that H2O allocates will be determined by the default memory of the Java Virtual Machine (JVM). This amount depends on the Java version, but it will generally be 25% of the machine's physical memory. + .. note:: + + If ``max_mem_size`` is not defined, then the amount of memory that H2O-3 allocates will be determined by the default memory of the Java Virtual Machine (JVM). This amount depends on the Java version, but it will generally be 25% of the machine's physical memory. -- ``min_mem_size``: Minimum memory to use for the new H2O server. Integer input will be evaluated as gigabytes. Other units can be specified by passing in a string (e.g. "160M" for 160 megabytes). -- ``strict_version_check``: If True, an error will be raised if the client and server versions don't match. -- ``ignore_config``: Indicates whether a processing of a .h2oconfig file should be conducted or not. Default value is False. -- ``extra_classpath``: List of paths to libraries that should be included on the Java classpath when starting H2O from Python. -- ``kwargs``: (all other deprecated attributes) -- ``jvm_custom_args``: Customer, user-defined argument’s for the JVM H2O is instantiated in. Ignored if there is an instance of H2O already running and the client connects to it. 
-- ``bind_to_localhost``: A flag indicating whether access to the H2O instance should be restricted to the local machine (default) or if it can be reached from other computers on the network. +- ``min_mem_size``: Minimum memory to use for the new H2O-3 server. Integer input will be evaluated as gigabytes. Other units can be specified by passing in a string (e.g. ``"160M"`` for 160 megabytes). +- ``strict_version_check``: If ``True``, an error will be raised if the client and server versions don't match. +- ``ignore_config``: Indicates whether a processing of a ``.h2oconfig`` file should be conducted or not. Default value is ``False``. +- ``extra_classpath``: List of paths to libraries that should be included on the Java classpath when starting H2O-3 from Python. +- ``kwargs``: (All other deprecated attributes.) +- ``jvm_custom_args``: User-defined arguments for the JVM H2O-3 is instantiated in. Ignored if there is an instance of H2O-3 already running and the client connects to it. +- ``bind_to_localhost``: A flag indicating whether access to the H2O-3 instance should be restricted to the local machine (default) or if it can be reached from other computers on the network. Example ~~~~~~~ @@ -153,96 +83,178 @@ Example From Anaconda ~~~~~~~~~~~~~ -This section describes how run H2O in an Anaconda Cloud environment. This section assumes that you have installed H2O on Anaconda using the instructions in the `Install on Anaconda Cloud `__ section. +This section describes how to run H2O-3 in an Anaconda Cloud environment. This section assumes that you have installed H2O-3 on Anaconda using the instructions in the `Install on Anaconda Cloud `__ section. Launching Jupyter Notebook ^^^^^^^^^^^^^^^^^^^^^^^^^^ -1. Open a Terminal window and launch jupyter notebook. +1. Open a Terminal window and launch Jupyter Notebook. :: user$ jupyter notebook -2. Create a new Python notebook by selecting the **New** button in the upper left corner. At this point, you can begin using Jupyter Notebook to run H2O Python commands. An example notebook follows. +2. Create a new Python notebook by clicking **New**. At this point, you can begin using Jupyter Notebook to run H2O-3 Python commands. See the following example. -GBM Example +GBM example ^^^^^^^^^^^ After you successfully launch Jupyter notebook, enter the following commands to run a GBM example. -1. Import the H2O and GBM modules. +1. Import the H2O-3 and GBM modules. .. figure:: images/anaconda_import_module.png - :alt: Import H2O + :alt: Import H2O-3 and GBM estimator commands in a Jupyter Notebook. -2. Initialize H2O using ``h2o.init()``. +2. Initialize H2O-3 using ``h2o.init()``. .. figure:: images/anaconda_init.png - :alt: Initialize H2O + :alt: Initialize H2O-3 command and the following printout with H2O-3 cluster information. 3. Import the Airlines dataset. This dataset will be used to classify whether a flight will be delayed. .. figure:: images/anaconda_import_airlines.png - :alt: Import dataset + :alt: Import airlines dataset with parse progress bar. 4. Convert columns to factors. .. figure:: images/anaconda_convert_columns.png - :alt: Convert columns to factors + :alt: Convert columns to factors: Year, Month, dayOfWeek, Cancelled, and FlightNum. 5. Set the predictor names and the response column name. .. figure:: images/anaconda_predictor_response.png - :alt: Set predictor names and response column + :alt: Set Origin, Dest, Year, UniqueCarrier, DayOfWeek, Month, Distance, and FlightNum for the predictors. Set IsDepDelayed for response. 6. 
Split the dataset into training and validation sets.

 .. figure:: images/anaconda_split_data.png
-   :alt: Split the dataset
+   :alt: Split the dataset with a .8 ratio and seed set to 1234.

7. Specify the number of bins that will be included in the histogram and then split.

 .. figure:: images/anaconda_nbins_cats.png
-   :alt: Try a range of nbins_cats
+   :alt: Try a range of nbins_cats: 8, 16, 32, 64, 128, 256, 512, 1024, 2048, and 4096.

8. Train the models.

 .. figure:: images/anaconda_train_model.png
-   :alt: Train the models
+   :alt: Train the models using the information you've set.

9. Print the AUC scores for the training data and the validation data.

 .. figure:: images/anaconda_print_auc.png
-   :alt: Print the AUC score
+   :alt: Print the AUC score for the training and validation.

Troubleshooting
^^^^^^^^^^^^^^^

If your system includes two versions of Anaconda (a global installation and a user-specific installation), be sure to use the User Anaconda. Using the Global Anaconda will result in an error when you attempt to run commands in Jupyter Notebook. You can verify the version that you are using by running ``which pip`` (Mac) or ``where pip`` (Windows). If your system shows that your environment is set up to use Global Anaconda by default, then change the PATH environment variable to use the User Anaconda.

-From the Command Line
+From R
+------
+
+Use the ``h2o.init()`` method to initialize H2O-3. In most cases, simply using ``h2o.init()`` is all that you are required to do.
+
+Options to initialize H2O-3 in R
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This method accepts the following options:
+
+- ``nthreads``: This launches H2O-3 using all available CPUs and is only applicable if you launch H2O-3 locally using R. If you start H2O-3 locally outside of R or start H2O-3 on Hadoop, the ``nthreads`` parameter is not applicable.
+- ``ip``: The IP address of the server where H2O-3 is running.
+- ``port``: The port number of the H2O-3 server.
+- ``startH2O``: (Optional) A logical value indicating whether to try to start H2O-3 from R if no connection with H2O-3 is detected. This is only possible if ``ip = "localhost"`` or ``ip = "127.0.0.1"``. If an existing connection is detected, R does not start H2O-3.
+- ``forceDL``: (Optional) A logical value indicating whether to force download of the H2O-3 executable. This defaults to ``FALSE``, so the executable will only be downloaded if it does not already exist in the H2O-3 R library resources directory at ``h2o/java/h2o.jar``.
+- ``enable_assertions``: (Optional) A logical value indicating whether H2O-3 should be launched with assertions enabled. This is used mainly for error checking and debugging purposes.
+- ``license``: (Optional) A character string value specifying the full path of the license file.
+- ``max_log_file_size``: Maximum size of INFO and DEBUG log files. The file is rolled over after the specified size has been reached. The range for this option is ``1MB`` to ``99999MB``. The value defaults to ``3MB``.
+- ``max_mem_size``: (Optional) A character string specifying the maximum size, in bytes, of the memory allocation pool to H2O-3. This value must be a multiple of 1024 greater than 2MB. Append the letter ``m`` or ``M`` to indicate megabytes, or ``g`` or ``G`` to indicate gigabytes.
+
+  .. note::
+
+     If ``max_mem_size`` is not defined, then the amount of memory that H2O-3 allocates will be determined by the default memory of the Java Virtual Machine (JVM).
This amount depends on the Java version, but it will generally be 25% of the machine's physical memory.
+
+- ``min_mem_size``: (Optional) A character string specifying the minimum size, in bytes, of the memory allocation pool to H2O-3. This value must be a multiple of 1024 greater than 2MB. Append the letter ``m`` or ``M`` to indicate megabytes, or ``g`` or ``G`` to indicate gigabytes.
+- ``ice_root``: (Optional) A directory to handle object spillage. The default varies by OS.
+- ``strict_version_check``: (Optional) Setting this to ``FALSE`` is unsupported and should only be done when advised by technical support.
+- ``ignore_config``: (Optional) This option allows you to specify whether to perform processing of a ``.h2oconfig`` file. When ``h2o.init()`` is specified, a call to a config reader method is invoked. This call can result in path issues when there is no "root" (for example, with a Windows network drive) because the config file reader searches up to "root." When there is no "root", the path to search will continue to expand, eventually resulting in an error. This value defaults to ``FALSE``.
+- ``proxy``: (Optional) A character string specifying the proxy path.
+- ``https``: (Optional) Set this to ``TRUE`` to use https instead of http.
+- ``insecure``: (Optional) Set this to ``TRUE`` to disable SSL certificate checking.
+- ``username``: (Optional) The username to log in with.
+- ``password``: (Optional) The password to log in with.
+- ``cookies``: (Optional) Vector (or list) of cookies to add to each request.
+- ``context_path``: (Optional) The last part of the connection URL. For example, **http://<ip>:<port>/<context_path>**
+- ``use_spnego``: (Optional) Set this to ``TRUE`` to connect to an H2O-3 cluster with SPNEGO authentication. This defaults to ``FALSE`` and is mutually exclusive with ``username`` and ``password``.
+
+Connection process
+~~~~~~~~~~~~~~~~~~
+
+By default, ``h2o.init()`` first checks if an H2O-3 instance is connectible. If it cannot connect and ``startH2O = TRUE`` with ``ip = "localhost"``, it will attempt to start an instance of H2O-3 at ``localhost:54321``. If an open ip and port of your choice are passed in, then this method will attempt to start an H2O-3 instance at that specified ip and port.
+
+When initializing H2O-3 locally, this method searches for the h2o.jar file in the R library resources ``(system.file("java", "h2o.jar", package = "h2o"))``, and if the file does not exist, it will automatically attempt to download the correct version from Amazon S3. You must have Internet access for this process to be successful.
+
+Once connected, the ``h2o.init()`` method checks to see if the local H2O-3 R package version matches the version of H2O-3 running on the server. If there is a mismatch and you indicate that you want to upgrade, it will remove the local H2O-3 R package and download/install the H2O-3 R package from the server.
+
+.. note::
+
+    You may want to manually upgrade your package rather than waiting until being prompted. This requires that you fully uninstall and reinstall the H2O-3 package and the H2O-3 client package. You must unload packages running in the environment before upgrading. We also recommend that you restart R or RStudio after upgrading.
+
+Example
+~~~~~~~
+
+.. substitution-code-block:: r
+
+   library(h2o)
+   h2o.init()
+
+   H2O is not running yet, starting it now...
+
+   Note: In case of errors look at the following log files:
+       /var/folders/yl/cq5nhky53hjcl9wrqxt39kz80000gn/T//RtmpKtZXsy/h2o_techwriter_started_from_r.out
+       /var/folders/yl/cq5nhky53hjcl9wrqxt39kz80000gn/T//RtmpKtZXsy/h2o_techwriter_started_from_r.err
+
+   java version "1.8.0_25"
+   Java(TM) SE Runtime Environment (build 1.8.0_25-b17)
+   Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode)
+
+   Starting H2O JVM and connecting: .. Connection successful!
+
+   R is connected to the H2O cluster:
+       H2O cluster uptime:         2 seconds 73 milliseconds
+       H2O cluster timezone:       America/Los_Angeles
+       H2O data parsing timezone:  UTC
+       H2O cluster version:        |version|
+       H2O cluster version age:    9 days
+       H2O cluster name:           H2O_started_from_R_angelabartz_dxr691
+       H2O cluster total nodes:    1
+       H2O cluster total memory:   2.00 GB
+       H2O cluster total cores:    8
+       H2O cluster allowed cores:  8
+       H2O cluster healthy:        TRUE
+       H2O Connection ip:          localhost
+       H2O Connection port:        54321
+       H2O Connection proxy:       NA
+       H2O Internal Security:      FALSE
+       H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
+       R Version:                  R version 3.5.1 (2018-07-02)
+
+From the command line
 ---------------------

 .. todo:: create a table of command line options (should you say expression or primary?)

 .. todo:: provide examples for most common clusters

-You can use Terminal (OS X) or the Command Prompt (Windows) to launch
-H2O.
+You can use Terminal (Mac) or Command Prompt (Windows) to launch H2O-3.
+
+When you launch from the command line, you can include additional instructions to H2O-3, such as how many nodes to launch, how much memory to allocate for each node, what names to assign to the nodes in the cluster, and more.

-When you launch from the command line, you can include
-additional instructions to H2O 3.0, such as how many nodes to launch,
-how much memory to allocate for each node, assign names to the nodes in
-the cluster, and more.
+.. note::

-   **Note**: H2O requires some space in the ``/tmp`` directory to
-   launch. If you cannot launch H2O, try freeing up some space in the
-   ``/tmp`` directory, then try launching H2O again.
+   H2O-3 requires some space in the ``/tmp`` directory to launch. If you cannot launch H2O-3, try freeing up some space in the ``/tmp`` directory, then try launching H2O-3 again.

-For more detailed instructions on how to build and launch H2O, including
-how to clone the repository, how to pull from the repository, and how to
-install required dependencies, refer to the `developer
-documentation `_.
+For more detailed instructions on how to build and launch H2O-3 (including how to clone the repository, how to pull from the repository, and how to install required dependencies), see the `developer documentation `_.

 There are three different argument types:

@@ -250,27 +262,37 @@ There are three different argument types:
 - H2O options
 - Authentication options

-The arguments use the following format: java ```` -jar h2o.jar ````.
+The arguments use the following format: ``java <JVM options> -jar h2o.jar <H2O options>``.

-JVM Options
+JVM options
 ~~~~~~~~~~~

+The following are the available JVM options:
+
 - ``-version``: Display Java version info.
-- ``-Xmx``: To set the total heap size for an H2O node, configure the memory allocation option ``-Xmx``. By default, this option is set to 1 Gb (``-Xmx1g``). When launching nodes, we recommend allocating a total of four times the memory of your data.
+- ``-Xmx``: Configure the memory allocation option ``-Xmx`` to set the total heap size for an H2O-3 node.
By default, this option is set to 1GB (``-Xmx1g``). When launching nodes, we recommend allocating a total of four times the memory of your data.

-   **Note**: Do not try to launch H2O with more memory than you have available. If ``-Xmx`` is not defined, then the amount of memory that H2O allocates will be determined by the default memory of the JVM. This amount depends on the Java version, but it will generally be 25% of the machine's physical memory.
+.. note::
+
+   Do not try to launch H2O-3 with more memory than you have available. If ``-Xmx`` is not defined, then the amount of memory that H2O-3 allocates will be determined by the default memory of the JVM. This amount depends on the Java version, but it will generally be 25% of the machine's physical memory.

-H2O Options
+H2O options
 ~~~~~~~~~~~

+The following are the available H2O options:
+
 - ``-h`` or ``-help``: Display this information in the command line output.
 - ``-version``: Specify to print version information and exit.
-- ``-name <h2oCloudName>``: Assign a name to the H2O instance in the cluster (where ``<h2oCloudName>`` is the name of the cluster). Nodes with the same cluster name will form an H2O cluster (also known as an H2O cloud).
+- ``-name <h2oCloudName>``: Assign a name to the H2O-3 instance in the cluster (where ``<h2oCloudName>`` is the name of the cluster). Nodes with the same cluster name will form an H2O-3 cluster (also known as an H2O-3 cloud).
 - ``-flatfile <fileName>``: Specify a flatfile of IP addresses for faster cluster formation (where ``<fileName>`` is the name of the flatfile).
 - ``-ip <ipAddress>``: Specify an IP for the machine other than the default ``localhost``, for example:

   - IPv4: ``-ip 178.16.2.223``
-  - IPv6: ``-ip 2001:db8:1234:0:0:0:0:1`` (Short version of IPv6 with ``::`` is not supported.) **Note**: If you are selecting a link-local address ``fe80::/96``, it is necessary to specify the *zone index* (e.g., ``%en0`` for ``fe80::2acf:e9ff:fe15:e0f3%en0``) in order to select the right interface.
+  - IPv6: ``-ip 2001:db8:1234:0:0:0:0:1`` (Short version of IPv6 with ``::`` is not supported.)
+
+  .. note::
+
+     If you are selecting a link-local address ``fe80::/96``, it is necessary to specify the *zone index* (e.g., ``%en0`` for ``fe80::2acf:e9ff:fe15:e0f3%en0``) in order to select the right interface.

 - ``-port <#>``: Specify a port used for the REST API. The internal communication port will be the port with value +1 higher.
 - ``-baseport``: Specifies the starting port to find a free port for the REST API; the internal communication port will be the port with value +1 higher.
@@ -279,13 +301,13 @@ H2O Options

   - IPv4: ``-network 178.0.0.0/8``
   - IPv6: ``-network 2001:db8:1234:0:0:0:0:0/48`` (short version of IPv6 with ``::`` is not supported.)

-- ``-ice_root <fileSystemPath>``: Specify a directory for H2O to spill temporary data to disk (where ``<fileSystemPath>`` is the file path).
-- ``-log_dir <fileSystemPath>``: Specify the directory where H2O writes logs to disk. (This usually has a good default that you need not change.
+- ``-ice_root <fileSystemPath>``: Specify a directory for H2O-3 to spill temporary data to disk (where ``<fileSystemPath>`` is the file path).
+- ``-log_dir <fileSystemPath>``: Specify the directory where H2O-3 writes logs to disk. (This usually has a good default that you need not change.)
 - ``-log_level <level>``: Specify to write messages at this logging level, or above. The default is INFO.
 - ``-flow_dir <directory>``: Specify a directory for saved flows. The default is ``/Users/h2o-<userName>/h2oflows`` (where ``<userName>`` is your user name).
-- ``-file_deny_glob <glob>``: Specify `glob `_. pattern to deny access to certain files. The default is ``{/bin/*,/etc/*,/var/*,/usr/*,/proc/*,**/.**}``.
+- ``-file_deny_glob <glob>``: Specify the `glob `__ pattern to deny access to certain directories. This parameter was added to address vulnerabilities CVE-2023-6038, CVE-2023-6569, and CVE-2024-5986. The default is ``{/bin/*,/etc/*,/var/*,/usr/*,/proc/*,**/.**}``.
 - ``-nthreads <#ofThreads>``: Specify the maximum number of threads in the low-priority batch work queue (where ``<#ofThreads>`` is the number of threads).
-- ``-client``: Launch H2O node in client mode. This is used mostly for running Sparkling Water.
+- ``-client``: Launch H2O-3 node in client mode (this is used mostly for running Sparkling Water).
 - ``-notify_local <fileSystemPath>``: Specifies a file to write to when the node is up. The file contains a single line with the IP and port of the embedded web server. For example, 192.168.1.100:54321.
 - ``-context_path <context_path>``: The context path for Jetty.
 - ``features``: Disable availability of features considered to be experimental or beta. Currently, this only works with algorithms. Options include:
@@ -294,39 +316,43 @@ H2O Options

   - ``beta``: Only beta and stable algorithms will be enabled; experimental will not.
   - ``experimental``: Enables all algorithms (default).

-Authentication Options
+Authentication options
 ~~~~~~~~~~~~~~~~~~~~~~

+The following are the available authentication options:
+
 - ``-jks <filename>``: Specify a Java keystore file.
 - ``-jks_pass <password>``: Specify the Java keystore password.
 - ``-jks_alias <alias>``: Optional, use if the keystore has multiple certificates and you want to use a specific one.
-- ``-hash_login``: Specify to use Jetty HashLoginService. This defaults to False.
-- ``-ldap_login``: Specify to use Jetty LdapLoginService. This defaults to False.
-- ``-kerberos_login``: Specify to use Kerberos LoginService. This defaults to False.
-- ``-pam_login``: Specify to use the Pluggable Authentication Module (PAM) LoginService. This defaults to False.
+- ``-hash_login``: Specify to use Jetty HashLoginService. This defaults to ``False``.
+- ``-ldap_login``: Specify to use Jetty LdapLoginService. This defaults to ``False``.
+- ``-kerberos_login``: Specify to use Kerberos LoginService. This defaults to ``False``.
+- ``-pam_login``: Specify to use the Pluggable Authentication Module (PAM) LoginService. This defaults to ``False``.
 - ``-login_conf <filename>``: Specify the LoginService configuration file.
 - ``-form_auth``: Enables Form-based authentication for Flow. This defaults to Basic authentication.
 - ``-session_timeout <minutes>``: Specifies the number of minutes that a session can remain idle before the server invalidates the session and requests a new login. Requires ``-form_auth``. This defaults to no timeout.
 - ``-internal_security_conf <path>``: Specify the path (absolute or relative) to a file containing all internal security related configurations.

-H2O Networking
-~~~~~~~~~~~~~~
+H2O-3 networking
+~~~~~~~~~~~~~~~~

-H2O Internal Communication
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+H2O-3 internal communication
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-By default, H2O selects the IP and PORT for internal communication automatically using the following this process (if not specified):
+By default, H2O-3 selects the IP and PORT for internal communication automatically using the following process (if not specified):

 1. Retrieve a list of available interfaces (which are up).
 2. Sort them with "bond" interfaces put on the top.
-3. For each interface, extract associated IPs.
+3. Extract associated IPs for each interface.
 4. Pick only reachable IPs (this filters out IPs provided by interfaces such as awdl):

    - If there is a site IP, use it.
   - Otherwise, if there is a link local IP, use it. (For IPv6, the link IP 0xfe80/96 is associated with each interface.)
   - Or finally, try to find a local IP. (Use loopback or try to use Google DNS to find the IP for this machine.)

-**Notes**: The port is selected by looking for a free port starting with port 54322. The IP, PORT and network selection can be changed by the following options:
+.. note::
+
+   The port is selected by looking for a free port starting with port 54322. The IP, PORT, and network selection can be changed with the following options:

 - ``-ip``
 - ``-network``
@@ -334,29 +360,28 @@ By default, H2O selects the IP and PORT for internal communication automatically
 - ``-baseport``

-Cluster Formation Behavior
+Cluster formation behavior
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

-New H2O nodes join to form a cluster during launch. After a job has
-started on the cluster, it prevents new members from joining.
+New H2O-3 nodes join to form a cluster during launch. After a job has started on the cluster, it prevents new members from joining.

-- To start an H2O node with 4GB of memory and a default cluster name:
-  ``java -Xmx4g -jar h2o.jar``
+*Cluster formation examples*

-- To start an H2O node with 6GB of memory and a specific cluster name:
-  ``java -Xmx6g -jar h2o.jar -name MyCluster``
+To start an H2O-3 node with 4GB of memory and a default cluster name: ``java -Xmx4g -jar h2o.jar``

-- To start an H2O cluster with three 2GB nodes using the default cluster
-  names: ``java -Xmx2g -jar h2o.jar & java -Xmx2g -jar h2o.jar & java -Xmx2g -jar h2o.jar &``
+To start an H2O-3 node with 6GB of memory and a specific cluster name:
+``java -Xmx6g -jar h2o.jar -name MyCluster``

-Wait for the ``INFO: Registered: # schemas in: #mS`` output before
-entering the above command again to add another node (the number for #
-will vary).
+To start an H2O-3 cluster with three 2GB nodes using the default cluster names: ``java -Xmx2g -jar h2o.jar & java -Xmx2g -jar h2o.jar & java -Xmx2g -jar h2o.jar &``

-Clouding Up: Cluster Creation
+.. tip::
+
+   Wait for the ``INFO: Registered: # schemas in: #mS`` output before entering the above command again to add another node (the number for ``#`` will vary).
+
+Clouding up: Cluster creation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-H2O provides two modes for cluster creation:
+H2O-3 provides two modes for cluster creation:

 - Multicast based
 - Flatfile based

@@ -364,28 +389,30 @@
 Multicast
 '''''''''

-In this mode, H2O is using IP multicast to announce existence of H2O nodes. Each node selects the same multicast group and port based on specified shared cluster name (see ``-name`` option). For example, for IPv4/PORT a generated multicast group is ``228.246.114.236:58614`` (for cluster name ``michal``),
-for IPv6/PORT a generated multicast group is ``ff05:0:3ff6:72ec:0:0:3ff6:72ec:58614`` (for cluster name ``michal`` and link-local address which enforce link-local scope).
+In this mode, H2O-3 uses IP multicast to announce the existence of H2O-3 nodes. Each node selects the same multicast group and port based on a specified shared cluster name (see the ``-name`` option). For example, for IPv4/PORT, a generated multicast group is ``228.246.114.236:58614`` (for cluster name ``michal``); for IPv6/PORT, a generated multicast group is ``ff05:0:3ff6:72ec:0:0:3ff6:72ec:58614`` (for cluster name ``michal`` and a link-local address which enforces a link-local scope).

-For IPv6 the scope of multicast address is enforced by a selected node IP.
For example, if IP the selection process selects link-local address, then the scope of multicast will be link-local. This can be modified by specifying JVM variable ``sys.ai.h2o.network.ipv6.scope`` which enforces addressing scope use in multicast group address (for example, ``-Dsys.ai.h2o.network.ipv6.scope=0x0005000000000000`` enforces the site local scope. For more details please consult the
-class ``water.util.NetworkUtils``).
+For IPv6, the scope of the multicast address is enforced by the selected node IP. For example, if the IP selection process selects a link-local address, then the scope of the multicast will be link-local. This can be modified by specifying the JVM variable ``sys.ai.h2o.network.ipv6.scope``, which enforces the addressing scope used in the multicast group address. For example, ``-Dsys.ai.h2o.network.ipv6.scope=0x0005000000000000`` enforces the site-local scope. For more details, please consult the ``water.util.NetworkUtils`` class.

-For more information about scopes, see the following `image `_.
+See the following `image on scopes for more information `_.

 Flatfile
 ''''''''

-The flatfile describes a topology of a H2O cluster. The flatfile definition is passed via the ``-flatfile`` option. It needs to be passed at each node in the cluster, but definition does not be the same at each node. However, transitive closure of all definitions should contains all nodes. For example, for the following definition
+The flatfile describes a topology of an H2O-3 cluster. The flatfile definition is passed through the ``-flatfile`` option. It needs to be passed at each node in the cluster, but the definition does not need to be the same at each node. However, the transitive closure of all the definitions should contain all nodes.
+
+.. hint::
+
+   For example, in the following definition, the resulting cluster will be formed by nodes A, B, and C.

-+---------+-------+-------+-------+
-| Nodes   | nodeA | nodeB | nodeC |
-+---------+-------+-------+-------+
-|Flatfile | A,B   | A, B  | B, C  |
-+---------+-------+-------+-------+
+   +---------+-------+-------+-------+
+   | Nodes   | nodeA | nodeB | nodeC |
+   +---------+-------+-------+-------+
+   |Flatfile | A,B   | A, B  | B, C  |
+   +---------+-------+-------+-------+

-The resulting cluster will be formed by nodes A, B, C. The node A transitively sees node C via node B flatfile definition, and vice versa.
+   Node A transitively sees node C through node B's flatfile definition, and vice versa.

-The flatfile contains a list of nodes in the form ``IP:PORT`` that are going to compose a resulting cluster (each node on a separated line, everything prefixed by ``#`` is ignored). Running H2O on a multi-node cluster allows you to use more memory for large-scale tasks (for example, creating models from huge datasets) than would be possible on a single node.
+The flatfile contains a list of nodes in the form ``IP:PORT`` that are going to compose a resulting cluster (each node is on a separate line, and everything prefixed by ``#`` is ignored). Running H2O-3 on a multi-node cluster lets you use more memory for large-scale tasks (for example, creating models from huge datasets) than would be possible on a single node.

**IPv4**:

@@ -402,34 +429,33 @@ The flatfile contains a list of nodes in the form ``IP:PORT`` that are going to

   0:0:0:0:0:0:0:1:54321
   0:0:0:0:0:0:0:1:54323

-Web Server
+Web server
 ^^^^^^^^^^

-By default, the web server IP is auto-configured in the same way as internal communication IP, nevertheless the created socket listens on all available interfaces.
A specific IP can be specified with the ``-web_ip`` option.
+By default, the web server IP is auto-configured in the same way as the internal communication IP. Nevertheless, the created socket listens on all available interfaces. A specific IP can be specified with the ``-web_ip`` option.

 Options
 '''''''

-- ``-web_ip``: specifies IP for web server to expose REST API
+- ``-web_ip``: Specifies the IP for the web server to expose the REST API.

-Dual Stacks
+Dual stacks
 ^^^^^^^^^^^

-Dual stack machines support IPv4 and IPv6 network stacks.
-Right now, H2O always prefer IPV4, however the preference can be changed via JVM system options ``java.net.preferIPv4Addresses`` and ``java.net.preferIPv6Addresses``. For example:
+Dual stack machines support IPv4 and IPv6 network stacks. H2O-3 prefers IPv4; however, the preference can be changed through the JVM system options ``java.net.preferIPv4Addresses`` and ``java.net.preferIPv6Addresses``. For example:

-- ``-Djava.net.preferIPv6Addresses=true -Djava.net.preferIPv4Addresses=true`` - H2O will try to select IPv4
-- ``-Djava.net.preferIPv6Addresses=true -Djava.net.preferIPv4Addresses=false`` - H2O will try to select IPv6
+- ``-Djava.net.preferIPv6Addresses=true -Djava.net.preferIPv4Addresses=true`` - H2O-3 will try to select IPv4.
+- ``-Djava.net.preferIPv6Addresses=true -Djava.net.preferIPv4Addresses=false`` - H2O-3 will try to select IPv6.

 On Spark
 --------

-Refer to the `Getting Started with Sparkling Water `__ section for information on how to launch H2O on Spark.
+See the `Getting Started with Sparkling Water `__ section for information on how to launch H2O-3 on Spark.

-Connecting to an H2O Cluster by Name
-------------------------------------
+Connecting to an H2O-3 cluster by name
+--------------------------------------

-You can connect to an already live H2O cluster by providing the cluster name.
+You can connect to an already live H2O-3 cluster by providing the cluster name.

 The following examples show how to connect through the cluster name in a programmatic way. You will first save the connection details to a file whose name is the cluster name. The connection details are then picked up programmatically by the cluster name.

@@ -473,9 +499,9 @@ Then, connect to that cluster by importing the cluster details that were saved:

    # Connect via the URL:
    h2o.connect(url=url)

-Best Practices
+Best practices
 --------------

-- Use ``h2o.importFile`` instead of ``h2o.uploadFile`` if possible.
+- Use ``h2o.import_file``/``h2o.importFile`` instead of ``h2o.upload_file``/``h2o.uploadFile`` when possible.
 - Set the correct cluster size for your given dataset size. The rule of thumb is to use at least 4 times the size of your data. For example, if the dataset is 10GB, you should allocate at least 40GB of memory.

diff --git a/h2o-docs/src/product/welcome.rst b/h2o-docs/src/product/welcome.rst
index d97bbc92ae31..edb4fe92ff24 100644
--- a/h2o-docs/src/product/welcome.rst
+++ b/h2o-docs/src/product/welcome.rst
@@ -53,7 +53,36 @@ H2O-3 supports the following `algorithms `__:
 Requirements
 ------------

-We recommend the following at minimum for compatibility with H2O-3:
+At a minimum, we recommend the following for compatibility with H2O-3:
+
+- **Operating Systems**:
+
+  - Windows 7 or later
+  - OS X 10.9 or later
+  - Ubuntu 12.04
+  - RHEL/CentOS 6 or later
+
+- **Languages**: R and Python are not required to use H2O-3 unless you want to use H2O-3 in those environments, but Java is always required (see `below `__).
+
+  - R version 3 or later
+  - Python 3.6.x, 3.7.x, 3.8.x, 3.9.x, 3.10.x, 3.11.x
+
+- **Browser**: An internet browser is required to use H2O-3's web UI, Flow. Supported versions include the latest version of Chrome, Firefox, Safari, or Internet Explorer.
+- **numpy**: H2O-3 only supports ``numpy<2``. To work around having ``numpy`` 2.x installed, run the following command:
+
+  ::
+
+    pip install --force-reinstall 'numpy<2'
+
+Java requirements
+~~~~~~~~~~~~~~~~~
+
+H2O-3 runs on Java. To build H2O-3 or run H2O-3 tests, the 64-bit JDK is required. To run the H2O-3 binary using either the command line, R, or Python packages, only the 64-bit JRE is required.
+
+H2O-3 supports the following versions of Java:
+
+- Java SE 17, 16, 15, 14, 13, 12, 11, 10, 9, 8

 - **Operating systems**:

diff --git a/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostModel.java b/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostModel.java
index da0930786539..30a02b35753e 100755
--- a/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostModel.java
+++ b/h2o-extensions/xgboost/src/main/java/hex/tree/xgboost/XGBoostModel.java
@@ -67,6 +67,12 @@ public enum DMatrixType {
   public enum Backend {
     auto, gpu, cpu
   }
+  public enum FeatureSelector {
+    cyclic, shuffle, random, greedy, thrifty
+  }
+  public enum Updater {
+    gpu_hist, shotgun, coord_descent, gpu_coord_descent,
+  }

   // H2O GBM options
   public boolean _quiet_mode = true;
@@ -141,6 +147,12 @@ public enum Backend {
   public int[] _gpu_id; // which GPU to use
   public Backend _backend = Backend.auto;

+  // GBLinear specific (booster == gblinear)
+  // lambda and alpha are also supported for gbtree
+  public FeatureSelector _feature_selector = FeatureSelector.cyclic;
+  public int _top_k;
+  public Updater _updater;
+
   public String _eval_metric;
   public boolean _score_eval_metric_only;
@@ -378,6 +390,10 @@ public static Map createParamsMap(XGBoostParameters p, int nClas
       params.put("one_drop", p._one_drop ?
"1" : "0"); params.put("skip_drop", p._skip_drop); } + if (p._booster == XGBoostParameters.Booster.gblinear) { + params.put("feature_selector", p._feature_selector.toString()); + params.put("top_k", p._top_k); + } XGBoostParameters.Backend actualBackend = getActualBackend(p, true); XGBoostParameters.TreeMethod actualTreeMethod = getActualTreeMethod(p); if (actualBackend == XGBoostParameters.Backend.gpu) { @@ -387,17 +403,17 @@ public static Map createParamsMap(XGBoostParameters p, int nClas params.put("gpu_id", 0); } // we are setting updater rather than tree_method here to keep CPU predictor, which is faster - if (p._booster == XGBoostParameters.Booster.gblinear) { + if (p._booster == XGBoostParameters.Booster.gblinear && p._updater == null) { LOG.info("Using gpu_coord_descent updater."); - params.put("updater", "gpu_coord_descent"); + params.put("updater", XGBoostParameters.Updater.gpu_coord_descent.toString()); } else { LOG.info("Using gpu_hist tree method."); params.put("max_bin", p._max_bins); - params.put("tree_method", "gpu_hist"); + params.put("tree_method", XGBoostParameters.Updater.gpu_hist.toString()); } - } else if (p._booster == XGBoostParameters.Booster.gblinear) { + } else if (p._booster == XGBoostParameters.Booster.gblinear && p._updater == null) { LOG.info("Using coord_descent updater."); - params.put("updater", "coord_descent"); + params.put("updater", XGBoostParameters.Updater.coord_descent.toString()); } else if (H2O.CLOUD.size() > 1 && p._tree_method == XGBoostParameters.TreeMethod.auto && p._monotone_constraints != null) { LOG.info("Using hist tree method for distributed computation with monotone_constraints."); @@ -410,6 +426,10 @@ public static Map createParamsMap(XGBoostParameters p, int nClas params.put("max_bin", p._max_bins); } } + if (p._updater != null) { + LOG.info("Using user-provided updater."); + params.put("updater", p._updater.toString()); + } if (p._min_child_weight != 1) { LOG.info("Using user-provided parameter min_child_weight instead of min_rows."); params.put("min_child_weight", p._min_child_weight); diff --git a/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java b/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java index dc94b1835e01..267fd281bee9 100644 --- a/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java +++ b/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java @@ -10,19 +10,25 @@ public class XGBoostLibExtractTool { public static void main(String[] args) throws IOException { + try { + mainInternal(args); + } catch (IllegalArgumentException e) { + System.err.println((e.getMessage())); + System.exit(1); + } + } + + public static void mainInternal(String[] args) throws IOException { if (args.length != 1) { - System.err.println("XGBoostLibExtractTool: Specify target directory where to extract XGBoost native libraries."); - System.exit(-1); + throw new IllegalArgumentException("XGBoostLibExtractTool: Specify target directory where to extract XGBoost native libraries."); } File dir = new File(args[0]); if (!dir.exists()) { - System.err.println("XGBoostLibExtractTool: Directory '" + dir.getAbsolutePath() + "' doesn't exist."); - System.exit(-1); + throw new IllegalArgumentException("XGBoostLibExtractTool: Directory '" + dir.getAbsolutePath() + "' doesn't exist."); } NativeLibraryLoaderChain loader = XGBoostExtension.getLoader(); if (loader == null) { - System.err.println("XGBoostLibExtractTool: Failed to locate native libraries."); - System.exit(-1); 
+ throw new IllegalArgumentException("XGBoostLibExtractTool: Failed to locate native libraries."); } for (NativeLibrary lib : loader.getNativeLibs()) { if (!lib.isBundled()) diff --git a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java index 611371b15717..6ec00ac879aa 100755 --- a/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java +++ b/h2o-extensions/xgboost/src/test/java/hex/tree/xgboost/XGBoostTest.java @@ -3279,5 +3279,73 @@ public void testWarnEvalMetricOnlyWithouEvalMetric() { Scope.exit(); } } - + + @Test + public void testGBLinearTopKAndFeatureSelector() { + Scope.enter(); + try { + String response = "CAPSULE"; + Frame train = parseAndTrackTestFile("./smalldata/logreg/prostate_train.csv"); + train.toCategoricalCol(response); + + XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters(); + parms._ntrees = 1; + parms._train = train._key; + parms._response_column = response; + parms._booster = XGBoostModel.XGBoostParameters.Booster.gblinear; + parms._top_k = 2; + parms._feature_selector = XGBoostModel.XGBoostParameters.FeatureSelector.greedy; + + ModelBuilder job = new hex.tree.xgboost.XGBoost(parms); + + XGBoostModel xgboost = (XGBoostModel) job.trainModel().get(); + Scope.track_generic(xgboost); + assertNotNull(xgboost); + + Frame score = xgboost.score(train); + Scope.track(score); + + parms._top_k = 100; + ModelBuilder jobTopKChanged = new hex.tree.xgboost.XGBoost(parms); + + XGBoostModel xgboostTopKChanged = (XGBoostModel) jobTopKChanged.trainModel().get(); + Scope.track_generic(xgboostTopKChanged); + assertNotNull(xgboostTopKChanged); + + Frame scoreTopKChanged = xgboostTopKChanged.score(train); + Scope.track(scoreTopKChanged); + assertNotEquals("top_k should affect the predictions", score.toTwoDimTable().get(0,1), scoreTopKChanged.toTwoDimTable().get(0,1)); + } + finally { + Scope.exit(); + } + } + + + @Test + public void testGBLinearShotgun() { + Scope.enter(); + try { + String response = "CAPSULE"; + Frame train = parseAndTrackTestFile("./smalldata/logreg/prostate_train.csv"); + train.toCategoricalCol(response); + + XGBoostModel.XGBoostParameters parms = new XGBoostModel.XGBoostParameters(); + parms._ntrees = 1; + parms._train = train._key; + parms._response_column = response; + parms._booster = XGBoostModel.XGBoostParameters.Booster.gblinear; + parms._updater = XGBoostModel.XGBoostParameters.Updater.shotgun; + parms._feature_selector = XGBoostModel.XGBoostParameters.FeatureSelector.shuffle; + + ModelBuilder job = new hex.tree.xgboost.XGBoost(parms); + XGBoostModel xgboost = (XGBoostModel) job.trainModel().get(); + assertNotNull(xgboost); + Scope.track_generic(xgboost); + assertEquals("updater should be changed", xgboost._output._native_parameters.get(1,1), XGBoostModel.XGBoostParameters.Updater.shotgun.toString()); + } + finally { + Scope.exit(); + } + } } diff --git a/h2o-jetty-9-minimal/build.gradle b/h2o-jetty-9-minimal/build.gradle index 64e32354d7eb..228b42d6c5a9 100644 --- a/h2o-jetty-9-minimal/build.gradle +++ b/h2o-jetty-9-minimal/build.gradle @@ -2,6 +2,8 @@ dependencies { api project(":h2o-webserver-iface") api "org.eclipse.jetty:jetty-server:${jetty9MinimalVersion}" api "org.eclipse.jetty:jetty-servlet:${jetty9MinimalVersion}" + api "org.eclipse.jetty.websocket:websocket-api:${jetty9MinimalVersion}" + api "org.eclipse.jetty.websocket:websocket-server:${jetty9MinimalVersion}" testImplementation group: "junit", name: "junit", version: 
"4.12" testImplementation "org.mockito:mockito-core:2.23.0" } diff --git a/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9ServerAdapter.java b/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9ServerAdapter.java index 8de335982895..ccba0dbd0a35 100644 --- a/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9ServerAdapter.java +++ b/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9ServerAdapter.java @@ -7,10 +7,13 @@ import org.eclipse.jetty.server.handler.HandlerCollection; import org.eclipse.jetty.server.handler.HandlerWrapper; import org.eclipse.jetty.servlet.ServletContextHandler; +import org.eclipse.jetty.servlet.ServletHolder; import water.webserver.iface.H2OHttpView; +import water.webserver.iface.H2OWebsocketServlet; import water.webserver.iface.RequestAuthExtension; import water.webserver.iface.WebServer; +import javax.servlet.Servlet; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; @@ -74,6 +77,14 @@ private void registerHandlers(final HandlerWrapper handlerWrapper, final Servlet for (Map.Entry> entry : h2oHttpView.getServlets().entrySet()) { context.addServlet(entry.getValue(), entry.getKey()); } + for (Map.Entry> entry : h2oHttpView.getWebsockets().entrySet()) { + try { + Servlet servlet = new Jetty9WebsocketServlet(entry.getValue().newInstance()); + context.addServlet(new ServletHolder(entry.getValue().getName(), servlet), entry.getKey()); + } catch (InstantiationException | IllegalAccessException e) { + throw new RuntimeException("Failed to instantiate websocket servlet object", e); + } + } final List extHandlers = new ArrayList<>(); extHandlers.add(helper.authenticationHandler()); diff --git a/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9WebsocketServlet.java b/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9WebsocketServlet.java new file mode 100644 index 000000000000..b5b23970a0b2 --- /dev/null +++ b/h2o-jetty-9-minimal/src/main/java/water/webserver/jetty9/Jetty9WebsocketServlet.java @@ -0,0 +1,96 @@ +package water.webserver.jetty9; + + +import org.eclipse.jetty.websocket.api.Session; +import org.eclipse.jetty.websocket.api.WebSocketListener; +import org.eclipse.jetty.websocket.servlet.*; +import water.webserver.iface.H2OWebsocketServlet; +import water.webserver.iface.WebsocketConnection; +import water.webserver.iface.WebsocketHandler; + +import java.io.IOException; + +public class Jetty9WebsocketServlet extends WebSocketServlet { + + private final H2OWebsocketServlet impl; + + public Jetty9WebsocketServlet(H2OWebsocketServlet impl) { + this.impl = impl; + } + + static class Jetty9WebsocketConnection implements WebsocketConnection { + + private final Session sess; + + Jetty9WebsocketConnection(Session sess) { + this.sess = sess; + } + + @Override + public void sendMessage(String message) throws IOException { + sess.getRemote().sendString(message); + } + } + + class Jetty9WebsocketHandler implements WebSocketListener { + + private WebsocketHandler handler; + private Jetty9WebsocketConnection conn; + + @Override + public void onWebSocketConnect(Session sess) { + conn = new Jetty9WebsocketConnection(sess); + handler = impl.onConnect(conn); + } + + @Override + public void onWebSocketBinary(byte[] payload, int offset, int len) { + // ignore + } + + @Override + public void onWebSocketText(String message) { + handler.onMessage(message); + } + + @Override + public void onWebSocketClose(int statusCode, String reason) + { + 
handler.onClose(conn);
+            conn = null;
+            handler = null;
+        }
+
+        @Override
+        public void onWebSocketError(Throwable cause) {
+            cause.printStackTrace();
+        }
+
+    }
+
+    /**
+     * Please note, each Servlet has its own instance of WebSocketServletFactory.
+     *
+     * @param factory Factory object to register socket creator with.
+     */
+    @Override
+    public void configure(WebSocketServletFactory factory) {
+        factory.setCreator(new H2OWebSocketCreator());
+    }
+
+    /**
+     * Custom in-place socket creator, returning a new instance of {@link Jetty9WebsocketHandler},
+     * which already contains the proper {@link H2OWebsocketServlet} implementation the request is being delegated to.
+     *

+ * This is required, as default {@link WebSocketServletFactory} uses {@link org.eclipse.jetty.util.DecoratedObjectFactory} + * to instantiate {@link WebSocketListener} classes. This class is only able to instantiate static classes with 0-arg constructor, + * which inner non-static class {@link Jetty9WebsocketHandler} is NOT. + */ + public class H2OWebSocketCreator implements WebSocketCreator { + + @Override + public Object createWebSocket(ServletUpgradeRequest req, ServletUpgradeResponse resp) { + return new Jetty9WebsocketHandler(); + } + } +} diff --git a/h2o-parsers/h2o-avro-parser/build.gradle b/h2o-parsers/h2o-avro-parser/build.gradle index be498665d8ed..d61f58043d90 100644 --- a/h2o-parsers/h2o-avro-parser/build.gradle +++ b/h2o-parsers/h2o-avro-parser/build.gradle @@ -6,7 +6,7 @@ description = "H2O Avro Parser" dependencies { api project(":h2o-core") // Avro support - api 'org.apache.avro:avro:1.11.3' + api 'org.apache.avro:avro:1.11.4' testImplementation project(":h2o-test-support") testRuntimeOnly project(":${defaultWebserverModule}") diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java index 8c35cc72c8b8..e1fc1f8a85ed 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java @@ -112,7 +112,9 @@ public void map(Chunk[] cs) { group = group.append(currColName, cs[j].at8(i)); break; case (T_STR): - group = group.append(currColName, cs[j].atStr(new BufferedString(), i).toString()); + if (!cs[j].isNA(i)) { + group = group.append(currColName, cs[j].atStr(new BufferedString(), i).toString()); + } break; case (T_CAT): if (cs[j].isNA(i)) { diff --git a/h2o-persist-gcs/build.gradle b/h2o-persist-gcs/build.gradle index 5d1ba374ad2b..d07448510c39 100644 --- a/h2o-persist-gcs/build.gradle +++ b/h2o-persist-gcs/build.gradle @@ -4,10 +4,18 @@ description = "H2O Persist GCS" dependencies { api project(":h2o-core") - api 'com.google.cloud:google-cloud-storage:2.13.1' + api ('com.google.cloud:google-cloud-storage:2.13.1') testImplementation project(":h2o-test-support") testRuntimeOnly project(":${defaultWebserverModule}") + + constraints { + api('com.google.protobuf:protobuf-java:3.25.5') { + because 'Fixes CVE-2024-7254' + because 'Fixes SNYK-JAVA-COMGOOGLEPROTOBUF-8055227' + because 'Fixes SNYK-JAVA-COMGOOGLEPROTOBUF-8055228' + } + } } apply from: "${rootDir}/gradle/dataCheck.gradle" diff --git a/h2o-py/h2o/display.py b/h2o-py/h2o/display.py index 6815c19a1f79..5b382433605c 100644 --- a/h2o-py/h2o/display.py +++ b/h2o-py/h2o/display.py @@ -5,6 +5,7 @@ :copyright: (c) 2016 H2O.ai :license: Apache License Version 2.0 (see LICENSE for details) """ +# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython from contextlib import contextmanager import os import sys diff --git a/h2o-py/h2o/estimators/decision_tree.py b/h2o-py/h2o/estimators/decision_tree.py index e598396b2a82..0e7f391b515f 100644 --- a/h2o-py/h2o/estimators/decision_tree.py +++ b/h2o-py/h2o/estimators/decision_tree.py @@ -107,6 +107,22 @@ def ignore_const_cols(self): Ignore constant columns. Type: ``bool``, defaults to ``True``. 
+ + :examples: + + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> prostate["const_1"] = 6 + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... ignore_const_cols=True) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) """ return self._parms.get("ignore_const_cols") @@ -122,6 +138,22 @@ def categorical_encoding(self): Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate["RACE"] = prostate["RACE"].asfactor() + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... categorical_encoding="binary") + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) """ return self._parms.get("categorical_encoding") @@ -164,6 +196,20 @@ def max_depth(self): Max depth of tree. Type: ``int``, defaults to ``20``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) """ return self._parms.get("max_depth") @@ -178,6 +224,21 @@ def min_rows(self): Fewest allowed (weighted) observations in a leaf. Type: ``int``, defaults to ``10``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2ODecisionTreeEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") + >>> target_variable = 'CAPSULE' + >>> prostate[target_variable] = prostate[target_variable].asfactor() + >>> train, test = prostate.split_frame(ratios=[0.7]) + >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex", + ... max_depth=5, + ... min_rows=20) + >>> sdt_h2o.train(y=target_variable, training_frame=train) + >>> pred_test = sdt_h2o.predict(test) """ return self._parms.get("min_rows") diff --git a/h2o-py/h2o/estimators/rulefit.py b/h2o-py/h2o/estimators/rulefit.py index 529b371780ea..be80309794b0 100644 --- a/h2o-py/h2o/estimators/rulefit.py +++ b/h2o-py/h2o/estimators/rulefit.py @@ -206,6 +206,22 @@ def algorithm(self): The algorithm to use to generate rules. Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``. 
+ + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... algorithm="gbm", + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("algorithm") @@ -220,6 +236,22 @@ def min_rule_length(self): Minimum length of rules. Defaults to 3. Type: ``int``, defaults to ``3``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... min_rule_length=4, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("min_rule_length") @@ -234,6 +266,22 @@ def max_rule_length(self): Maximum length of rules. Defaults to 3. Type: ``int``, defaults to ``3``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... min_rule_length=3, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_rule_length") @@ -249,6 +297,21 @@ def max_num_rules(self): by diminishing returns in model deviance. Type: ``int``, defaults to ``-1``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=3, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_num_rules") @@ -263,6 +326,22 @@ def model_type(self): Specifies type of base learners in the ensemble. Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... model_type="rules", + ... 
seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("model_type") @@ -298,6 +377,22 @@ def distribution(self): Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... distribution="bernoulli", + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("distribution") @@ -312,6 +407,22 @@ def rule_generation_ntrees(self): Specifies the number of trees to build in the tree model. Defaults to 50. Type: ``int``, defaults to ``50``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... rule_generation_ntrees=60, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("rule_generation_ntrees") @@ -370,6 +481,22 @@ def max_categorical_levels(self): for categorical_encoding == EnumLimited. Type: ``int``, defaults to ``10``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... max_categorical_levels=11, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_categorical_levels") @@ -385,6 +512,21 @@ def rule_importance(self): Retrieve rule importances for a Rulefit model :return: H2OTwoDimTable + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> rule_importance = rfit.rule_importance() + >>> print(rfit.rule_importance()) """ if self._model_json["algo"] != "rulefit": raise H2OValueError("This function is available for Rulefit models only") @@ -397,11 +539,29 @@ def rule_importance(self): def predict_rules(self, frame, rule_ids): """ - Evaluates validity of the given rules on the given data. + Evaluates validity of the given rules on the given data. 
        :param frame: H2OFrame on which rule validity is to be evaluated
        :param rule_ids: string array of rule ids to be evaluated against the frame
        :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv"
+        >>> df = h2o.import_file(path=f, col_types={'species': "enum"})
+        >>> x = df.columns
+        >>> y = "species"
+        >>> x.remove(y)
+        >>> train, test = df.split_frame(ratios=[.8], seed=1234)
+        >>> rfit = H2ORuleFitEstimator(min_rule_length=4,
+        ...                            max_rule_length=5,
+        ...                            max_num_rules=3,
+        ...                            seed=1234,
+        ...                            model_type="rules")
+        >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
+        >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica']))
        """
        from h2o.frame import H2OFrame
        from h2o.utils.typechecks import assert_is_type
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
index 42726858deba..432e197016cb 100644
--- a/h2o-py/h2o/frame.py
+++ b/h2o-py/h2o/frame.py
@@ -28,7 +28,7 @@ from h2o.utils.metaclass import deprecated_fn
 from h2o.utils.shared_utils import(gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
                                    can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
-                                   check_frame_id, can_use_datatable, can_use_polars, can_use_pyarrow)
+                                   check_frame_id, can_use_polars, can_use_pyarrow)
 from h2o.utils.threading import local_context, local_env
 from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type, numeric, numpy_ndarray,
                                   numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -1942,17 +1942,16 @@ def structure(self):
             else:
                 print("num {}".format(" ".join(it[0] if it else "nan" for it in h2o.as_list(self[:10, i], False)[1:])))

-    def as_data_frame(self, use_pandas=True, header=True):
+    def as_data_frame(self, use_pandas=True, header=True, use_multi_thread=False):
         """
         Obtain the dataset as a python-local object.

         :param bool use_pandas: If True (default) then return the H2OFrame as a pandas DataFrame (requires that the
             ``pandas`` library was installed). If False, then return the contents of the H2OFrame as plain nested
-            list, in a row-wise order. The conversion to pandas frame will use multi-thread whenever
-            possible with the right python modules (datatable or polars and pyarrow) installed. Otherwise, single
-            thread operation will be used in the conversion.
+            list, in a row-wise order.
         :param bool header: If True (default), then column names will be appended as the first row in list
-
+        :param bool use_multi_thread: If True (False by default), use polars/pyarrow to perform the conversion in
+            multiple threads, which is faster.
         :returns: A python object (a list of lists of strings, each list is a row, if ``use_pandas=False``, otherwise
             a pandas DataFrame) containing this H2OFrame instance's data.
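For orientation, a minimal sketch of how the new `use_multi_thread` flag added in this hunk is meant to be used (assuming polars and pyarrow are installed; the demo frame is illustrative):

```python
import h2o

h2o.init()

# hypothetical demo frame; any H2OFrame works the same way
fr = h2o.create_frame(rows=100000, cols=10, seed=42)

pdf = fr.as_data_frame()                            # single-threaded (default)
pdf_fast = fr.as_data_frame(use_multi_thread=True)  # polars/pyarrow fast path
```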
@@ -1969,22 +1968,19 @@ def as_data_frame(self, use_pandas=True, header=True):
         """
         if can_use_pandas() and use_pandas:
             import pandas
-            if (can_use_datatable()) or (can_use_polars() and can_use_pyarrow()):  # can use multi-thread
-                exportFile = tempfile.NamedTemporaryFile(suffix=".h2oframe2Convert.csv", delete=False)
-                try:
-                    exportFile.close()  # needed for Windows
-                    h2o.export_file(self, exportFile.name, force=True)
-                    if can_use_datatable():  # use datatable for multi-thread by default
-                        return self.convert_with_datatable(exportFile.name)
-                    elif can_use_polars() and can_use_pyarrow():  # polar/pyarrow if datatable is not available
-                        return self.convert_with_polars(exportFile.name)
-                finally:
-                    os.unlink(exportFile.name)
+            if use_multi_thread:
+                with local_context(polars_enabled=True):  # turn on multi-thread toolboxes
+                    if can_use_polars() and can_use_pyarrow():  # can use multi-thread
+                        exportFile = tempfile.NamedTemporaryFile(suffix=".h2oframe2Convert.csv", delete=False)
+                        try:
+                            exportFile.close()  # needed for Windows
+                            h2o.export_file(self, exportFile.name, force=True)
+                            return self.convert_with_polars(exportFile.name)
+                        finally:
+                            os.unlink(exportFile.name)
             warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
-                          " multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow "
-                          "(for Python 3.10 or above) and activate it using:\n\n"+
-                          "with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):\n"
-                          "    pandas_df = h2o_df.as_data_frame()\n", H2ODependencyWarning)
+                          " multi-thread, install polars and pyarrow and then call "
+                          "pandas_df = h2o_df.as_data_frame(use_multi_thread=True)\n", H2ODependencyWarning)
             return pandas.read_csv(StringIO(self.get_frame_data()), low_memory=False, skip_blank_lines=False)
         from h2o.utils.csv.readers import reader
@@ -1998,18 +1994,6 @@ def convert_with_polars(self, fileName):
         import polars as pl
         dt_frame = pl.read_csv(fileName, null_values = "")
         return dt_frame.to_pandas()
-
-    def convert_with_datatable(self, fileName):
-        import datatable as dt
-        frameTypes = self.types
-        validFrameTypes = {}
-        for key, value in frameTypes.items():
-            if value.startswith('int'):
-                validFrameTypes[key] = dt.int64
-            elif value.startswith("real"):
-                validFrameTypes[key] = dt.float64
-        dt_frame = dt.fread(fileName, na_strings=[""], columns=validFrameTypes)
-        return dt_frame.to_pandas()

     def save_to_hive(self, jdbc_url, table_name, format="csv", table_path=None, tmp_path=None):
         """
diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py
index 542cb117d872..db67d4fb93be 100644
--- a/h2o-py/h2o/h2o.py
+++ b/h2o-py/h2o/h2o.py
@@ -868,14 +868,22 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
         if ind in skipped_columns:
             use_type[ind]=False
-    if column_names is not None: 
+    if column_names is not None:
         if not isinstance(column_names, list):
             raise ValueError("col_names should be a list")
         if (skipped_columns is not None) and len(skipped_columns)>0:
-            if (len(column_names)) != parse_column_len:
+            # when we are converting a python object to H2OFrame, column_names will include all columns even when
+            # skipped columns are specified.  In this case, we need to make sure that
+            # len(column_names)-len(skipped_columns)==parse_column_len.
+            # When we are importing a file with skipped columns specified, column_names will only contain the columns
+            # that are not skipped.  Hence, in this case, we need to check len(column_names)==parse_column_len.
+            # To combine the two, a correct parse satisfies either len(column_names)-len(skipped_columns)==parse_column_len
+            # or len(column_names)==parse_column_len.  Hence, we raise an error when neither condition holds.
+            if not((len(column_names) == parse_column_len) or ((len(column_names)-len(skipped_columns))==parse_column_len)):
                 raise ValueError(
-                    "length of col_names should be equal to the number of columns parsed: %d vs %d"
-                    % (len(column_names), parse_column_len))
-        else:
+                    "length of col_names minus length of skipped_columns should equal the number of columns parsed: "
+                    "%d vs %d" % (len(column_names), parse_column_len))
+        else:  # no skipped columns here
             if len(column_names) != len(j["column_types"]):
                 raise ValueError(
                     "length of col_names should be equal to the number of columns: %d vs %d"
                     % (len(column_names), len(j["column_types"])))
diff --git a/h2o-py/h2o/plot/_matplotlib.py b/h2o-py/h2o/plot/_matplotlib.py
index fa0b4212041f..d7e97caf541c 100644
--- a/h2o-py/h2o/plot/_matplotlib.py
+++ b/h2o-py/h2o/plot/_matplotlib.py
@@ -1,9 +1,11 @@
 def get_matplotlib_pyplot(server, raise_if_not_available=False):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         # noinspection PyUnresolvedReferences
         import matplotlib
-        matplotlib.use("Agg")
+        if server:
+            matplotlib.use("Agg")
         try:
             # noinspection PyUnresolvedReferences
             import matplotlib.pyplot as plt
@@ -25,6 +27,7 @@ def get_matplotlib_pyplot(server, raise_if_not_available=False):

 def get_polycollection(server, raise_if_not_available=False):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         from matplotlib.collections import PolyCollection as polycoll
         return polycoll
@@ -36,6 +39,7 @@ def get_polycollection(server, raise_if_not_available=False):

 def get_matplotlib_cm(function_name):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         from matplotlib import cm
         return cm
@@ -45,6 +49,7 @@ def get_matplotlib_cm(function_name):

 def get_mplot3d_axes(function_name):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         # noinspection PyUnresolvedReferences
         from mpl_toolkits.mplot3d import Axes3D
diff --git a/h2o-py/h2o/plot/_plot_result.py b/h2o-py/h2o/plot/_plot_result.py
index ad8b3bab2d87..25e6642957af 100644
--- a/h2o-py/h2o/plot/_plot_result.py
+++ b/h2o-py/h2o/plot/_plot_result.py
@@ -1,5 +1,6 @@
 # -*- encoding: utf-8 -*-
 # mutable versions of py immutable types
+# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
 from h2o.exceptions import H2OError

 __no_export = set(dir())  # all variables defined above this are not exported
diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
index 67b622fd62c6..89356c1144ed 100644
--- a/h2o-py/h2o/utils/shared_utils.py
+++ b/h2o-py/h2o/utils/shared_utils.py
@@ -137,28 +137,19 @@ def is_module_enabled(mod):
 def can_use_pandas():
     return is_module_available('pandas')

-
-def can_use_datatable():
-    return is_module_enabled('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
-
-
-def can_install_datatable():
-    return sys.version_info.major == 3 and sys.version_info.minor <= 9
-
-
 def can_install_polars():
-    return sys.version_info.major == 3 and sys.version_info.minor > 9
+    return sys.version_info.major == 3 and sys.version_info.minor >= 6

 def can_use_polars():
-    return is_module_enabled('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
+    return is_module_enabled('polars') and sys.version_info.major == 3 and sys.version_info.minor >= 6

 def can_use_pyarrow():
-    if can_use_pandas() and sys.version_info.minor > 9:
+    if can_use_pandas() and sys.version_info.minor >= 6:
         import pandas
-        return is_module_available('pyarrow') and sys.version_info.major == 3 and sys.version_info.minor > 9 and \
-               sys.version_info.major == 3 and float(pandas.__version__[0]) >= 1
+        return is_module_available('pyarrow') and sys.version_info.major == 3 and sys.version_info.minor >= 6 and \
+               float(pandas.__version__[0]) >= 1
     else:
         return False
diff --git a/h2o-py/tests/pyunit_utils/__init__.py b/h2o-py/tests/pyunit_utils/__init__.py
index b2987ec0859d..a8415192911b 100644
--- a/h2o-py/tests/pyunit_utils/__init__.py
+++ b/h2o-py/tests/pyunit_utils/__init__.py
@@ -3,3 +3,4 @@
 from .utils_model_custom_distribution import *
 from .utils_for_glm_tests import *
 from .sklearn_multinomial_auc_method import roc_auc_score
+from .utils_parser_tests import *
diff --git a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py b/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
index 010845eb56e0..f993375e9578 100644
--- a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
+++ b/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
@@ -1,4 +1,3 @@
-import h2o
 from h2o.estimators import H2OGeneralizedLinearEstimator as glm
 from h2o.exceptions import H2OValueError
 from h2o.grid.grid_search import H2OGridSearch
diff --git a/h2o-py/tests/pyunit_utils/utils_parser_tests.py b/h2o-py/tests/pyunit_utils/utils_parser_tests.py
new file mode 100644
index 000000000000..4993887032eb
--- /dev/null
+++ b/h2o-py/tests/pyunit_utils/utils_parser_tests.py
@@ -0,0 +1,45 @@
+from tests import pyunit_utils
+import h2o
+import time
+import pandas as pd
+
+def test_frame_conversion(dataset, original_pandas_frame):
+    # convert frame using polars/pyarrow
+    h2oframe = h2o.import_file(pyunit_utils.locate(dataset))
+    test_frames_conversion(h2oframe, original_pandas_frame)
+
+def test_frames_conversion(h2oframe, original_pandas_frame):
+    start_time = time.time()
+    new_pandas_frame = h2oframe.as_data_frame(use_multi_thread=True)
+    new_time = time.time()-start_time
+    print("H2O frame to Pandas frame conversion time with multi-thread using module polars/pyarrow: {0}".format(new_time))
+    # compare the two frames' column types
+    new_types = new_pandas_frame.dtypes
+    old_types = original_pandas_frame.dtypes
+    ncol = h2oframe.ncol
+    col_names = new_pandas_frame.columns
+
+    for ind in list(range(ncol)):
+        assert new_types[col_names[ind]] == old_types[col_names[ind]], "Expected column types: {0}, actual column types: " \
+                                                                       "{1}".format(old_types[col_names[ind]], new_types[col_names[ind]])
+        if new_types[col_names[ind]] == "object":
+            diff = new_pandas_frame[col_names[ind]] == original_pandas_frame[col_names[ind]]
+            if not diff.all():  # difference caused by the presence of NAs
+                new_series = pd.Series(new_pandas_frame[col_names[ind]])
+                new_NA = new_series.isna()
+                old_series = pd.Series(original_pandas_frame[col_names[ind]])
+                old_NA = old_series.isna()
+                assert (new_NA==old_NA).all()
+        else:
+            diff = (new_pandas_frame[col_names[ind]] - original_pandas_frame[col_names[ind]]).abs()
+            assert diff.max() < 1e-10
+
+
+def single_thread_pandas_conversion(dataset):
+    print("converting h2o frame to pandas frame using single thread")
+    h2oframe = h2o.import_file(pyunit_utils.locate(dataset))
+    start_time = time.time()
+    h2oframe_panda = h2oframe.as_data_frame()
+    new_time = time.time()-start_time
+    print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(new_time, dataset))
+    return h2oframe_panda
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py
new file mode 100644
index 000000000000..c9a81a79a75f
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py
@@ -0,0 +1,156 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+import numpy as np
+import pandas as pd
+from tests import pyunit_utils
+
+# this test needs to run to completion with duplicated/conflicting constraints
+def data_prep(seed):
+    np.random.seed(seed)
+    x1 = np.random.normal(0, 10, 100000)
+    x2 = np.random.normal(10, 100 , 100000)
+    x3 = np.random.normal(20, 200, 100000)
+    x4 = np.random.normal(30, 3000, 100000)
+    x5 = np.random.normal(400, 4000, 100000)
+
+    y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+    y = np.random.normal(y_raw, 20)
+
+    data = {
+        'x1': x1,
+        'x2': x2,
+        'x3': x3,
+        'x4': x4,
+        'x5': x5,
+        'y': y,
+    }
+    return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_duplicate_conflicting_constraints():
+    train_data = data_prep(123)
+    family = 'gaussian'
+    link = 'identity'
+    nfolds = 0
+    lambda_ = 0.0
+    seed = 1234
+    calc_like = True
+    compute_p_values = True
+    solver = 'irlsm'
+    predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+    response = "y"
+
+    linear_constraints2 = []
+
+    name = "x2"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = -1
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x4"
+    values = -1
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x2"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x4"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    linear_constraints = h2o.H2OFrame(linear_constraints2)
+    linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+
+    params = {
+        "family" : family,
+        "link": link,
+        "lambda_" : lambda_,
+        "seed" : seed,
+        "nfolds" : nfolds,
+        "compute_p_values" : 
compute_p_values, + "calc_like" : calc_like, + "solver" : solver, + "linear_constraints": linear_constraints + } + + model = glm(**params) + model.train(x = predictors, y = response, training_frame = train_data) + print(model.coef()) + coef_constrained = model.coef() + print(glm.getConstraintsInfo(model)) + + params = { + "family" : family, + "link": link, + "lambda_" : lambda_, + "seed" : seed, + "nfolds" : nfolds, + "compute_p_values" : compute_p_values, + "calc_like" : calc_like, + "solver" : solver, + } + + model_no_constraints = glm(**params) + model_no_constraints.train(x = predictors, y = response, training_frame = train_data) + coef_no_constraints = model_no_constraints.coef() + print("model built without constraints") + print(coef_no_constraints) + print("x2-x3: {0}".format(coef_no_constraints['x2']-coef_no_constraints['x3'])) + print("x3-x4: {0}".format(coef_no_constraints['x3']-coef_no_constraints['x4'])) + print("x2+x3+x4: {0}".format(coef_no_constraints['x2']+coef_no_constraints['x3']+coef_no_constraints['x4'])) + # assert that model with linear constraints does a better job than model without constraints + assert (coef_constrained['x2']-coef_constrained['x3']) < (coef_no_constraints['x2']-coef_no_constraints['x3']), \ + "Model built with constraints should be closer to the constraint x2-x3 <= 0" + assert (coef_constrained['x3']-coef_constrained['x4']) < (coef_no_constraints['x3']-coef_no_constraints['x4']), \ + "Model built with constraints should be closer to the constraint x3-x4 <= 0" + assert (coef_constrained['x2']+coef_constrained['x3']+coef_constrained['x4']) < \ + (coef_no_constraints['x2']+coef_no_constraints['x3']+coef_no_constraints['x4']), \ + "Model built with constraints should be closer to the constraint x2+x3+x4 <= 0" + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_duplicate_conflicting_constraints) +else: + test_duplicate_conflicting_constraints() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py new file mode 100644 index 000000000000..7c7b18ef72c7 --- /dev/null +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py @@ -0,0 +1,163 @@ +import h2o +from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm +from tests import pyunit_utils +import numpy as np +import pandas as pd + +# For beta constraints, if only upper_bounds are specified, there are NPE errors because the code expects both upper +# and lower bounds to be specified. I have since fixed this error. 
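As a preview of the scenario this new test exercises, a minimal sketch of the previously-failing input shape: a beta-constraints frame that names only `upper_bounds` (the column and coefficient names mirror the frames built in the test body below):

```python
import h2o

h2o.init()

# one row per bounded coefficient; only an upper bound is supplied,
# which used to trigger the NPE described in the comment above
bc = [["x1", 1.5]]
beta_constraints = h2o.H2OFrame(bc)
beta_constraints.set_names(["names", "upper_bounds"])
```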
+def data_prep(seed):
+    np.random.seed(seed)
+    x1 = np.random.normal(0, 10, 100000)
+    x2 = np.random.normal(10, 100 , 100000)
+    x3 = np.random.normal(20, 200, 100000)
+    x4 = np.random.normal(30, 3000, 100000)
+    x5 = np.random.normal(400, 4000, 100000)
+
+    y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+    y = np.random.normal(y_raw, 20)
+
+    data = {
+        'x1': x1,
+        'x2': x2,
+        'x3': x3,
+        'x4': x4,
+        'x5': x5,
+        'y': y,
+    }
+    return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_beta_constraint_npe():
+    train_data = data_prep(123)
+    family = 'gaussian'
+    link = 'identity'
+    nfolds = 0
+    lambda_ = 0.0
+    seed = 1234
+    calc_like = True
+    compute_p_values = True
+    solver = 'irlsm'
+    predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+    response = "y"
+
+    # beta constraints
+    bc = []
+    name = 'x1'
+    lower_bound = 0.03
+    bc.append([name, lower_bound])
+
+    beta_constraints = h2o.H2OFrame(bc)
+    beta_constraints.set_names(["names", "lower_bounds"])
+
+    linear_constraints2 = []
+
+    name = "x2"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = -1
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x4"
+    values = -1
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x2"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x4"
+    values = 1
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "LessThanEqual"
+    contraint_numbers = 2
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    linear_constraints = h2o.H2OFrame(linear_constraints2)
+    linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+    # check that a beta constraint with only a lower bound does not generate an error
+    params = {
+        "family" : family,
+        "link": link,
+        "lambda_" : lambda_,
+        "seed" : seed,
+        "nfolds" : nfolds,
+        "compute_p_values" : compute_p_values,
+        "calc_like" : calc_like,
+        "solver" : solver,
+        "linear_constraints": linear_constraints,
+        "beta_constraints": beta_constraints
+    }
+
+    model = glm(**params)
+    model.train(x = predictors, y = response, training_frame = train_data)
+    coefs = model.coef()
+    print(coefs)
+    print(glm.getConstraintsInfo(model))
+    # beta constraints should be satisfied
+    assert coefs["x1"] >= 0.03 or abs(coefs["x1"]-0.03) < 1e-6, "beta constraint x1 ({0}) >= 0.03 is violated!".format(coefs["x1"])
+
+    # beta constraints
+    bc = []
+    name = 'x1'
+    upper_bound = 1.5
+    bc.append([name, upper_bound])
+
+    beta_constraints2 = h2o.H2OFrame(bc)
+    beta_constraints2.set_names(["names", "upper_bounds"])
+
+    params['beta_constraints'] = beta_constraints2
+    model = glm(**params)
+    model.train(x = predictors, y = response, training_frame = train_data)
+    coefs = model.coef()
+    print(coefs)
+    print(glm.getConstraintsInfo(model))
+    # beta constraints should always be satisfied
+    assert coefs["x1"] <= 1.5 or abs(1.5-coefs["x1"])<1e-6, "beta constraint x1 ({0}) <= 1.5 is violated.".format(coefs["x1"])
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_beta_constraint_npe)
+else:
+    test_beta_constraint_npe()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py
new file mode 100644
index 000000000000..dc337ae33cf6
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py
@@ -0,0 +1,108 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+from tests import pyunit_utils
+import numpy as np
+import pandas as pd
+
+def data_prep(seed):
+    np.random.seed(seed)
+    x1 = np.random.normal(0, 10, 100000)
+    x2 = np.random.normal(10, 100 , 100000)
+    x3 = np.random.normal(20, 200, 100000)
+    x4 = np.random.normal(30, 3000, 100000)
+    x5 = np.random.normal(400, 4000, 100000)
+
+    y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+    y = np.random.normal(y_raw, 20)
+
+    data = {
+        'x1': x1,
+        'x2': x2,
+        'x3': x3,
+        'x4': x4,
+        'x5': x5,
+        'y': y,
+    }
+    return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_bad_linear_constraints():
+    train_data = data_prep(123)
+    family = 'gaussian'
+    link = 'identity'
+    nfolds = 0
+    lambda_ = 0
+    seed = 1234
+    calc_like = True
+    compute_p_values = True
+    solver = 'irlsm'
+    predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+    response = "y"
+
+    linear_constraints2 = []
+
+    name = "x2"
+    values = 1
+    types = "Equal"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "x3"
+    values = 1
+    types = "Equal"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = 0
+    types = "Equal"
+    contraint_numbers = 0
+    linear_constraints2.append([name, values, types, contraint_numbers])
+
+    linear_constraints = h2o.H2OFrame(linear_constraints2)
+    linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+
+    params3 = {
+        "family" : family,
+        "link": link,
+        "lambda_" : lambda_,
+        "seed" : seed,
+        "nfolds" : nfolds,
+        "compute_p_values" : compute_p_values,
+        "calc_like" : calc_like,
+        "solver" : solver,
+        "linear_constraints": linear_constraints,
+        "standardize": True,
+    }
+
+    glm3 = glm(**params3)
+    glm3.train(x = predictors, y = response, training_frame = train_data)
+    print(glm.getConstraintsInfo(glm3))
+    coef3 = glm3.coef()
+    print(glm3.coef())
+
+    params2 = {
+        "family" : family,
+        "link": link,
+        "lambda_" : lambda_,
+        "seed" : seed,
+        "nfolds" : nfolds,
+        "compute_p_values" : compute_p_values,
+        "calc_like" : calc_like,
+        "solver" : solver
+    }
+    glm2 = glm(**params2)
+    glm2.train(x = predictors, y = response, training_frame = train_data)
+    print("Models built without linear constraints")
+    coef2 = glm2.coef()
+    print(coef2)
+    print("x2 + x3: {0}".format(coef2["x2"]+coef2["x3"]))
+
+    # check that the model with constraints is closer to the constraint than the model without constraints
+    assert (coef3["x2"]+coef3["x3"]) < (coef2["x2"]+coef2["x3"]), \
+        "model built with constraints should be closer to the constraint x2+x3 but is not."
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_bad_linear_constraints)
+else:
+    test_bad_linear_constraints()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
index 4c01ed1f67d7..569a4304268b 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
@@ -195,13 +195,13 @@ def test_constraints_binomial():

     print(glm.getConstraintsInfo(h2o_glm_default_init))

-    assert abs(logloss-init_logloss)<2e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
+    assert abs(logloss-init_logloss)<1e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
                                            "and initialized with optimal GLM {1} should equal but is not." \
                                            "".format(logloss, init_logloss)
-    assert logloss <= init_random_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \
+    assert abs(logloss-init_random_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
                                            "and with random initial coefficients {1} but is" \
                                            " not.".format(logloss, init_random_logloss)
-    assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \
+    assert abs(logloss-default_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
                                            "and with default initial coefficients {1} but is" \
                                            " not.".format(logloss, default_init_logloss)
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
index 8a47d3773279..8c29822f0b87 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
@@ -3,7 +3,7 @@
 from tests import pyunit_utils
 from tests.pyunit_utils import utils_for_glm_tests

-def test_constraints_binomial():
+def test_equality_constraints_only_binomial():
     '''
     This test checks and make sure the equality constraints work with binomial family.  Coefficients are initialized
     with glm coefficients built without contraints, default coefficients and random coefficients.
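As a side note on the helpers these constrained-GLM tests call repeatedly, a small runnable sketch of the diagnostic pattern; the frame, column names, and constraint here are illustrative only, and the helper calls are the ones the tests themselves use:

```python
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm

h2o.init()
# all-real demo frame with default column names C1, C2, C3
df = h2o.create_frame(rows=1000, cols=3, categorical_fraction=0, integer_fraction=0,
                      binary_fraction=0, time_fraction=0, string_fraction=0,
                      missing_fraction=0, seed=1)
# encode a single constraint: C1 + C2 <= 0
lc = h2o.H2OFrame([["C1", 1, "LessThanEqual", 0],
                   ["C2", 1, "LessThanEqual", 0],
                   ["constant", 0, "LessThanEqual", 0]])
lc.set_names(["names", "values", "types", "constraint_numbers"])
model = glm(family="gaussian", lambda_=0.0, solver="irlsm", linear_constraints=lc)
model.train(x=["C1", "C2"], y="C3", training_frame=df)
print(glm.getConstraintsInfo(model))     # per-constraint satisfaction summary
print(glm.allConstraintsPassed(model))   # True when every constraint is met
```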
@@ -124,11 +124,12 @@ def test_constraints_binomial():
           "".format(default_init_logloss, h2o_glm_default_init._model_json["output"]["model_summary"].cell_values[0][6]))
     print(glm.getConstraintsInfo(h2o_glm_default_init))

-    assert init_random_logloss >= logloss, "Random initialization logloss with constraints should be worst than GLM " \
-                                           "without constraints but is not."
+    assert abs(init_random_logloss - logloss) < 1e-6, \
+        "Random initialization logloss {0} with constraints should be similar to GLM without constraints {1} but" \
+        " is not.".format(init_random_logloss, logloss)

 if __name__ == "__main__":
-    pyunit_utils.standalone_test(test_constraints_binomial)
+    pyunit_utils.standalone_test(test_equality_constraints_only_binomial)
 else:
-    test_constraints_binomial()
+    test_equality_constraints_only_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
index acf74d648dea..dd884aaea48b 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
 from tests import pyunit_utils
 from tests.pyunit_utils import utils_for_glm_tests

-def test_constraints_binomial():
+def test_equality_linear_constraints_binomial():
     '''
     This test checks and make sure the equality constraints work with binomial family.  Coefficients are initialized
     with glm coefficients built without constraints, default coefficients and random coefficients.  Note in this case,
@@ -156,18 +156,18 @@ def test_constraints_binomial():
                                                             " taken to build the model: {1}".format(default_init_logloss,
                                                             utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init)))
     print(glm.getConstraintsInfo(h2o_glm_default_init))

-    assert abs(logloss-init_logloss)<2e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
+    assert abs(logloss-init_logloss)<1e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
                                            "and initialized with optimal GLM {1} should equal but is not."
\ "".format(logloss, init_logloss) - assert logloss<=init_random_logloss, "logloss from optimal GLM {0} should be lower than GLM with constraints " \ + assert abs(logloss-init_random_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \ "and with random initial coefficients {1} but is" \ " not.".format(logloss, init_random_logloss) - assert logloss<=default_init_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \ + assert abs(logloss-default_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \ "and with default initial coefficients {1} but is" \ " not.".format(logloss, default_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_constraints_binomial) + pyunit_utils.standalone_test(test_equality_linear_constraints_binomial) else: - test_constraints_binomial() + test_equality_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py index 97ddf83bbe29..732bdda9d8e5 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_light_tight_linear_constraints_binomial(): ''' Test constrained GLM with beta, equality and less than and equal to constraints. The constraints are not very tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints. @@ -207,6 +207,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_light_tight_linear_constraints_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_light_tight_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py index 96146d98234a..8e50603e9f15 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_light_tight_linear_constraints_only_binomial(): ''' Test constrained GLM with equality and less than and equal to constraints. The constraints are not very tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints. 
@@ -173,7 +173,7 @@ def test_light_tight_linear_constraints_only_gaussian(): "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) - assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ + assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \ " constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss) assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ @@ -185,6 +185,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_light_tight_linear_constraints_only_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py index 103086f4ca6f..d59c80fd99a5 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_light_tight_linear_constraints_only_binomial(): ''' Test constrained GLM with less than and equal to constraints. The constraints are not very tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints. 
@@ -189,7 +189,7 @@ def test_light_tight_linear_constraints_only_gaussian(): print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) - assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ + assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \ " constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss) assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ @@ -201,6 +201,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_light_tight_linear_constraints_only_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py index 3a75c301cc65..08778437db07 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_constraints_binomial(): +def test_loose_beta_linear_constraints_binomial(): ''' check and make sure coefficients close to GLM built without constraints are generated with loose constraints that are satisfied with coefficients from GLM without constraints. Only beta and less than and equal to @@ -157,6 +157,6 @@ def test_constraints_binomial(): if __name__ == "__main__": - pyunit_utils.standalone_test(test_constraints_binomial) + pyunit_utils.standalone_test(test_loose_beta_linear_constraints_binomial) else: - test_constraints_binomial() + test_loose_beta_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py index af40e6503b38..f9b0d17f976d 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_constraints_binomial(): +def test_loose_linear_constraints_binomial(): ''' check and make sure coefficients close to GLM built without constraints are generated with loose constraints that are satisfied with coefficients from GLM without constraints. 
Only less than and equal to @@ -135,6 +135,6 @@ def test_constraints_binomial(): " but is not.".format(logloss, default_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_constraints_binomial) + pyunit_utils.standalone_test(test_loose_linear_constraints_binomial) else: - test_constraints_binomial() + test_loose_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py index 56a9c052625b..81fd336e2603 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py @@ -198,7 +198,7 @@ def test_redundant_constraints(): except Exception as ex: print(ex) temp = str(ex) - assert ("redundant and possibly conflicting linear constraints" in temp), "Wrong exception was received." + assert ("redundant linear constraints:" in temp), "Wrong exception was received." print("redundant constraint test passed!") if __name__ == "__main__": diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py index d7befd7da108..f40887389093 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_tight_beta_linear_constraints_binomial(): ''' Test constrained GLM with beta, equality and less than and equal to constraints. The constraints are very tight and coefficients from GLM built without constraints won't be able to satisfied the constraints. 
@@ -174,42 +174,6 @@ def test_light_tight_linear_constraints_only_gaussian(): types = "Equal" contraint_numbers = 5 tight_constraints.append([name, values, types, contraint_numbers]) - - name = "C19" - values = 0.5 - types = "Equal" - contraint_numbers = 4 - tight_constraints.append([name, values, types, contraint_numbers]) - - name = "C10.1" - values = -0.3 - types = "Equal" - contraint_numbers = 4 - tight_constraints.append([name, values, types, contraint_numbers]) - - name = "constant" - values = -0.5 - types = "Equal" - contraint_numbers = 4 - tight_constraints.append([name, values, types, contraint_numbers]) - - name = "C18" - values = 0.75 - types = "Equal" - contraint_numbers = 5 - tight_constraints.append([name, values, types, contraint_numbers]) - - name = "C20" - values = -0.13 - types = "Equal" - contraint_numbers = 5 - tight_constraints.append([name, values, types, contraint_numbers]) - - name = "constant" - values = -3 - types = "Equal" - contraint_numbers = 5 - tight_constraints.append([name, values, types, contraint_numbers]) linear_constraints2 = h2o.H2OFrame(tight_constraints) linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"]) @@ -317,6 +281,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_tight_beta_linear_constraints_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_tight_beta_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py index 1f4888b195a3..94ac1155c494 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_tight_equality_linear_constraints_binomial(): ''' Test constrained GLM with equality and less than and equal to constraints. The constraints are very tight and coefficients from GLM built without constraints won't be able to satisfied the constraints. 
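For reference when reading the rows removed above: a linear constraint is encoded as one H2OFrame row per term, and rows sharing a `constraint_numbers` value form a single constraint. The duplicate block deleted in that hunk encoded 0.5*C19 - 0.3*C10.1 - 0.5 == 0, i.e. (constraint renumbered to 0 for a standalone sketch):

```python
import h2o

h2o.init()

rows = [["C19",       0.5, "Equal", 0],
        ["C10.1",    -0.3, "Equal", 0],
        ["constant", -0.5, "Equal", 0]]
lc = h2o.H2OFrame(rows)
lc.set_names(["names", "values", "types", "constraint_numbers"])
```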
@@ -225,7 +225,7 @@ def test_light_tight_linear_constraints_only_gaussian(): print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) - assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ + assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \ " constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss) assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ @@ -237,6 +237,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_tight_equality_linear_constraints_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_tight_equality_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py index d8668bad9776..cc5b5385c8d3 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py @@ -3,7 +3,7 @@ from tests import pyunit_utils from tests.pyunit_utils import utils_for_glm_tests -def test_light_tight_linear_constraints_only_gaussian(): +def test_tight_linear_constraints_binomial(): ''' Test constrained GLM with less than and equal to constraints. The constraints are very tight and coefficients from GLM built without constraints won't be able to satisfied the constraints. 
@@ -189,7 +189,7 @@ def test_light_tight_linear_constraints_only_gaussian(): print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) - assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ + assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \ " constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss) assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ @@ -201,6 +201,6 @@ def test_light_tight_linear_constraints_only_gaussian(): "not.".format(logloss, random_init_logloss) if __name__ == "__main__": - pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian) + pyunit_utils.standalone_test(test_tight_linear_constraints_binomial) else: - test_light_tight_linear_constraints_only_gaussian() + test_tight_linear_constraints_binomial() diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py b/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py new file mode 100644 index 000000000000..86ecfb4236f0 --- /dev/null +++ b/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py @@ -0,0 +1,206 @@ +import h2o +from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm +from tests import pyunit_utils + +def test_constrained_glm_example(): + ''' + Simple example to showcase how to call constrained GLM. + ''' + #train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/binomial_20_cols_10KRows.csv") + train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv")) + for ind in range(10): + train[ind] = train[ind].asfactor() + train["C21"] = train["C21"].asfactor() + response = "C21" + predictors = list(range(0,20)) + # add beta constraints + bc = [] + name = "C11" + lower_bound = -3.5 + upper_bound = 0 + bc.append([name, lower_bound, upper_bound]) + + name = "C18" + lower_bound = 6 + upper_bound = 7 + bc.append([name, lower_bound, upper_bound]) + + name = "C15" + lower_bound = -9 + upper_bound = -6 + bc.append([name, lower_bound, upper_bound]) + + name = "C16" + lower_bound = -20 + upper_bound = -10 + bc.append([name, lower_bound, upper_bound]) + + beta_constraints = h2o.H2OFrame(bc) + beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"]) + + tight_constraints = [] # this constraint is satisfied by default coefficient initialization + + # add tight constraints + name = "C1.1" + values = 0.5 + types = "LessThanEqual" + contraint_numbers = 0 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C2.1" + values = -0.25 + types = "LessThanEqual" + contraint_numbers = 0 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = -1 + types = "LessThanEqual" + contraint_numbers = 0 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C4.1" + values = 1.5 + types = "LessThanEqual" + contraint_numbers = 1 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C17" + values = 3 + types = "LessThanEqual" + contraint_numbers = 1 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C15" + values = -2 + types = "LessThanEqual" + 
contraint_numbers = 1 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = -5 + types = "LessThanEqual" + contraint_numbers = 1 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C12" + values = -0.5 + types = "LessThanEqual" + contraint_numbers = 2 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C13" + values = -1.5 + types = "LessThanEqual" + contraint_numbers = 2 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C14" + values = 2 + types = "LessThanEqual" + contraint_numbers = 2 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = -3 + types = "LessThanEqual" + contraint_numbers = 2 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C11" + values = 0.25 + types = "LessThanEqual" + contraint_numbers = 3 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C18" + values = -0.5 + types = "LessThanEqual" + contraint_numbers = 3 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C19" + values = 0.75 + types = "LessThanEqual" + contraint_numbers = 3 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = 5 + types = "LessThanEqual" + contraint_numbers = 3 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C19" + values = 0.5 + types = "Equal" + contraint_numbers = 4 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C10.1" + values = -0.3 + types = "Equal" + contraint_numbers = 4 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = -0.25 + types = "Equal" + contraint_numbers = 4 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C18" + values = 0.75 + types = "Equal" + contraint_numbers = 5 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "C20" + values = -0.13 + types = "Equal" + contraint_numbers = 5 + tight_constraints.append([name, values, types, contraint_numbers]) + + name = "constant" + values = -1.5 + types = "Equal" + contraint_numbers = 5 + tight_constraints.append([name, values, types, contraint_numbers]) + + linear_constraints2 = h2o.H2OFrame(tight_constraints) + linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"]) + + random_coef = [0.9740393731418461, 0.9021970400494406, 0.8337282995102272, 0.20588758679724872, 0.12522385214612453, + 0.6390730524643073, 0.7055779213989253, 0.9004255614099713, 0.4075431157767999, 0.161093231584713, + 0.15250197544465616, 0.7172682822215489, 0.60836236371404, 0.07086628306822396, 0.263719138602719, + 0.16102036359390437, 0.0065987448849305075, 0.5881312311814277, 0.7836567678399617, 0.9104401158881326, + 0.8432891635016235, 0.033440093086177236, 0.8514611306363931, 0.2855332934628241, 0.36525972112514427, + 0.7526593301495519, 0.9963694184200753, 0.5614168317678196, 0.7950126291921057, 0.6212978800904426, + 0.176936615687169, 0.8817788599562331, 0.13699370230879637, 0.5754950980437555, 0.1507294463182668, + 0.23409699287029495, 0.6949148063429461, 0.47140569181488556, 0.1470896240551064, 0.8475557222612405, + 0.05957485472498203, 0.07490903723892406, 0.8412381196460251, 0.26874846387453943, 0.13669341206289243, + 0.8525684329438777, 0.46716360402752777, 0.8522055745422484, 0.3129394551398561, 0.908966336417204, + 
0.26259461196353984, 0.07245314277889847, 0.41429401839807156, 0.22772860293274222, 0.26662443208488784, + 0.9875655504027848, 0.5832266083052889, 0.24205847206862052, 0.9843760682096272, 0.16269008279311103, + 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, + 0.224690851359456, 0.5809304720756304, 0.36863807988348585] + params = {"family":"binomial", "lambda_":0.0, "seed":12345, "remove_collinear_columns":True, "solver":"IRLSM", + "linear_constraints":linear_constraints2, "beta_constraints":beta_constraints} + + # build constrained GLM with default coefficient initialization, all coefficients zero except intercept + constrained_glm_default_init = glm(**params) + constrained_glm_default_init.train(x=predictors, y=response, training_frame=train) + logloss_default = constrained_glm_default_init.model_performance()._metric_json['logloss'] + # build constrained GLM model with random coefficient initialization + params["startval"] = random_coef + constrained_glm_random_init = glm(**params) + constrained_glm_random_init.train(x=predictors, y=response, training_frame=train) + logloss = constrained_glm_random_init.model_performance()._metric_json['logloss'] + + assert logloss_default >= logloss or abs(logloss_default-logloss) < 1e-2 + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_constrained_glm_example) +else: + test_constrained_glm_example() diff --git a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py index 7540abf1dc5c..186062fba82f 100644 --- a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py +++ b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py @@ -8,7 +8,8 @@ from h2o.estimators.glm import H2OGeneralizedLinearEstimator # Megan Kurka found that categorical columns do not work with modelselection backward mode. I fixed the bug and -# extended her test to check that each time a predictor is dropped, it must has the smallest z-value magnitude. +# extended her test to check that each time a predictor is dropped, the best performing level is compared to other +# predictors. If the best level is not good enough, the whole enum predictor is dropped. 
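The test below exercises this behavior through ModelSelection's backward mode; a minimal sketch of the setup it depends on (assuming the `H2OModelSelectionEstimator` API with `mode="backward"`; the dataset URL and `get_best_model_predictors()` call are the ones the test uses):

```python
import h2o
from h2o.estimators import H2OModelSelectionEstimator

h2o.init()
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv")
y = "y"
x = [c for c in df.columns if c != y]

backward_model = H2OModelSelectionEstimator(mode="backward", seed=1234)
backward_model.train(x=x, y=y, training_frame=df)
# predictor subsets chosen at each step of backward elimination
print(backward_model.get_best_model_predictors())
```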
def test_megan_failure(): df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv") y = "y" @@ -28,7 +29,6 @@ def test_megan_failure(): best_predictor_subset = backward_model.get_best_model_predictors() counter = 0 - back_coef = backward_model.coef() for ind in list(range(num_models-1, 0, -1)): pred_large = coefficient_orders[ind] pred_small = coefficient_orders[ind-1] @@ -40,11 +40,11 @@ def test_megan_failure(): # assert z-values removed has smallest magnitude x = best_predictor_subset[ind] - assert_smallest_z_removed(back_coef[ind], z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df) + assert_correct_z_removed(z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df) counter += 1 -def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df): +def assert_correct_z_removed(z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df): glm_model = H2OGeneralizedLinearEstimator(seed=1234, remove_collinear_columns=True, lambda_=0.0, compute_p_values=True) glm_model.train(x=x, y=y, training_frame=df) cat_predictors = extractCatCols(df, x) @@ -53,11 +53,21 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co model_z_values = glm_model._model_json["output"]["coefficients_table"]["z_value"] model_coeffs = glm_model._model_json["output"]["coefficients_table"]["names"] - assert_equal_z_values(back_coef, glm_model.coef(), z_values_backward, coeff_backward, model_z_values, model_coeffs) - min_z_value = min(z_values_removed) + assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, model_coeffs) + + num_predictor_removed = False + for one_value in predictor_removed: + if one_value in num_predictors: + num_predictor_removed = True + break + if num_predictor_removed: + min_z_value = min(z_values_removed) + else: + min_z_value = max(z_values_removed) + # check that predictor with smallest z-value magnitude is removed - assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values) - assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values) + assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values) + assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values) for name in cat_predictors: for coeff_name in predictor_removed: @@ -66,7 +76,7 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co return x.remove(predictor_removed[0]) # numerical predictor is removed -def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values): +def assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values): for name in cat_predictors: model_z = [] for coeff_name in model_coeffs: @@ -80,7 +90,7 @@ def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeff "than mininum_z_values {2}".format(name, model_z, min_z_value) -def assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values): +def assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values): for name in num_predictors: pred_ind = model_coeffs.index(name) val = model_z_values[pred_ind] @@ -96,7 +106,7 @@ def extractCatCols(df, x): cat_pred.append(name) return cat_pred -def assert_equal_z_values(back_coef, curr_coef, z_values_backward, 
coeff_backward, model_z_values, glm_coeff): +def assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, glm_coeff): for coeff in glm_coeff: backward_z_value = z_values_backward[coeff_backward.index(coeff)] model_z_value = model_z_values[glm_coeff.index(coeff)] diff --git a/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py b/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py index 3d030e349a1e..de79c21c063d 100644 --- a/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py +++ b/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py @@ -14,11 +14,16 @@ def word2vec(): w2v_model = H2OWord2vecEstimator(epochs=1, word_model=word_model) w2v_model.train(training_frame=train) - synonyms = w2v_model.find_synonyms("horse", 3) + cnt = 10 + synonyms = w2v_model.find_synonyms("horse", cnt) print(synonyms) - - assert len(synonyms) == 3, "there should be three synonmys" - + assert len(synonyms) == cnt, "There should be ten synonyms." + + # GH-16192 find_synonyms returns an empty dataset if there are no synonyms to find + synonyms = w2v_model.find_synonyms("hhorse", cnt) + print(synonyms) + assert len(synonyms) == 0, "There should be zero synonyms." + if __name__ == "__main__": pyunit_utils.standalone_test(word2vec) diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py index b9f101dda2a7..b284f6e53f39 100644 --- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py +++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py @@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame(): assert dupl4.columns == ["n1", "s1"] -def H2OFrame_skipped_columns_is_BUGGY(): - try: - h2o.H2OFrame(data, skipped_columns=[1]) - assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 - except ValueError as e: - assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e) +def H2OFrame_skipped_columns_BUG_fixed(): + f1 = h2o.H2OFrame(data, skipped_columns=[1]) + f2 = h2o.H2OFrame(data) + assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual number of columns: {1}".format(f2.ncol-1, f1.ncol) pu.run_tests([ @@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY(): H2OFrame_from_pandas, H2OFrame_from_scipy, H2OFrame_from_H2OFrame, - H2OFrame_skipped_columns_is_BUGGY + H2OFrame_skipped_columns_BUG_fixed ]) diff --git a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py index ae4c5e76eeba..ecd499ee01a7 100644 --- a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py +++ b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py @@ -19,7 +19,7 @@ def h2oget_timezone(): timezones = h2o.list_timezones() assert_is_type(timezones, H2OFrame) - assert timezones.nrow == 467, "h2o.get_timezone() returns frame with wrong row number." + assert timezones.nrow == 459, "h2o.get_timezone() returns frame with wrong row number." assert timezones.ncol == 1, "h2o.get_timezone() returns frame with wrong column number."
if __name__ == "__main__": diff --git a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py index c6ae86359813..d5330ff20cd3 100644 --- a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py +++ b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py @@ -14,7 +14,7 @@ def h2olist_timezones(): timezones = h2o.list_timezones() assert_is_type(timezones, H2OFrame) - assert timezones.nrow == 467, "h2o.get_timezone() returns frame with wrong row number." + assert timezones.nrow == 459, "h2o.get_timezone() returns frame with wrong row number." assert timezones.ncol == 1, "h2o.get_timezone() returns frame with wrong column number." diff --git a/h2o-py/tests/testdir_misc/pyunit_cluster.py b/h2o-py/tests/testdir_misc/pyunit_cluster.py index 2424170ef1d6..9c367484a27f 100644 --- a/h2o-py/tests/testdir_misc/pyunit_cluster.py +++ b/h2o-py/tests/testdir_misc/pyunit_cluster.py @@ -11,9 +11,9 @@ def test_cluster_status(): def test_cluster_properties(): cl = h2o.cluster() - assert len(cl._schema_attrs_) == 24 + assert len(cl._schema_attrs_) == 25 for k in cl._schema_attrs_.keys(): - assert getattr(cl, k) is not None + assert getattr(cl, k) is not None or k == "web_ip" def test_exception_on_unknown_cluster_property(): diff --git a/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py b/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py new file mode 100644 index 000000000000..1a8c8fb306e7 --- /dev/null +++ b/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py @@ -0,0 +1,23 @@ +import sys +import tempfile + +sys.path.insert(1, "../../../") +import h2o +from tests import pyunit_utils + + +def test_export_file_npe_gh_16161(): + with tempfile.TemporaryDirectory() as dir: + df = h2o.create_frame(rows=100, cols=10, string_fraction=0.1, seed=5, seed_for_column_types=25) + h2o.export_file(df, path=dir, format="parquet", write_checksum=False) + df2 = h2o.import_file(dir) + assert pyunit_utils.compare_frames(df, df2, tol_numeric=1e-10, numElements=0) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_export_file_npe_gh_16161) +else: + test_export_file_npe_gh_16161() + + + diff --git a/h2o-py/tests/testdir_misc/pyunit_export_zstd.py b/h2o-py/tests/testdir_misc/pyunit_export_zstd.py new file mode 100644 index 000000000000..e3704c2cb11a --- /dev/null +++ b/h2o-py/tests/testdir_misc/pyunit_export_zstd.py @@ -0,0 +1,37 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from os import path +import struct + +''' +Export file with h2o.export_file compressed with 'zstd' +''' + + +def is_zstd_file(path): + with open(path, 'rb') as f: + magic_bytes = f.read(4) + return struct.unpack('<I', magic_bytes)[0] == 0xFD2FB528 - securityWarnings <- grep("SECURITY_WARNING", readLines(stdout), value=TRUE) - } - if (length(securityWarnings) > 0) { - msg = paste( - "Server process startup raise a security warning:", - paste(securityWarnings, collapse = "\n"), sep = "\n") - warning(msg) - } } else stop("Can only start H2O launcher if IP address is localhost.") } diff --git a/h2o-r/h2o-package/R/explain.R b/h2o-r/h2o-package/R/explain.R index 2243388bbdb7..67220f3a323e 100644 --- a/h2o-r/h2o-package/R/explain.R +++ b/h2o-r/h2o-package/R/explain.R @@ -242,10 +242,10 @@ case_insensitive_match_arg <- function(arg, choices) { .self }, get_model = function(model_id) { - model <- memoised_models$get_model(model_id) - if (!is.null(model@allparameters$treatment_column))
+ m <- memoised_models$get_model(model_id) + if (!is.null(m@allparameters$treatment_column)) stop("Uplift models are not supported in explain yet.") - return(model) + return(m) } ) ) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index c3a2fcf74925..2e7b9f23ad29 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -4109,6 +4109,7 @@ use.package <- function(package, #' #' @param x An \code{R} object. #' @param destination_frame A string with the desired name for the H2OFrame +#' @param skipped_columns A list of integer indices giving the columns to be skipped and not parsed into the final frame #' @param use_datatable allow usage of data.table #' @param \dots arguments passed to method arguments. #' @export @@ -4135,15 +4136,19 @@ use.package <- function(package, #' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m)) #' } #' } -as.h2o <- function(x, destination_frame="", ...) { +as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) { .key.validate(destination_frame) - UseMethod("as.h2o") + if (is.null(skipped_columns)) { + UseMethod("as.h2o") + } else { + as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns) + } } #' @rdname as.h2o #' @method as.h2o default #' @export -as.h2o.default <- function(x, destination_frame="", ...) { +as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = "")) @@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) { data.frame(C1=x) else as.data.frame(x, ...) - as.h2o.data.frame(x, destination_frame=destination_frame) + as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns) } #' @rdname as.h2o #' @method as.h2o H2OFrame #' @export -as.h2o.H2OFrame <- function(x, destination_frame="", ...) { +as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy") @@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) { #' @seealso \code{\link{use.package}} #' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/} #' @export -as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) { +as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame") @@ -4203,7 +4208,8 @@
if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt)) #if (verbose) pt <- proc.time()[[3]] # timings inside h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types, - col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x))) + col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)), + skipped_columns=skipped_columns) #if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt)) file.remove(tmpf) h2f @@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) #' To speed up execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages. #' Turn on h2o datatable by options("h2o.use.data.table"=TRUE) #' @export -as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) { +as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) { if( destination_frame=="") { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix") diff --git a/h2o-r/h2o-package/R/parse.R b/h2o-r/h2o-package/R/parse.R index d49b8db5564c..0ba9ed0afbdf 100755 --- a/h2o-r/h2o-package/R/parse.R +++ b/h2o-r/h2o-package/R/parse.R @@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA else col.names if (!is.null(parseSetup$column_names) && - (length(parseSetup$column_names) != parsedColLength)) { - stop("length of col.names must equal to the number of columns in dataset") + (length(parseSetup$column_names) != parsedColLength)) { # lengths should be equal; if not, account for skipped_columns + if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength)) + || is.null(skipped_columns)) # if there are no skipped columns, this is an error;
if skipped columns are present, check the adjusted length + stop("length of col.names (minus length of skipped_columns if present) must equal the number of columns in the dataset") } # change column names to what the user specified if (!is.null(skipped_columns)) { diff --git a/h2o-r/h2o-package/R/stackedensemble.R b/h2o-r/h2o-package/R/stackedensemble.R index 6cc96bc9d000..ba11e0c82a9a 100644 --- a/h2o-r/h2o-package/R/stackedensemble.R +++ b/h2o-r/h2o-package/R/stackedensemble.R @@ -59,8 +59,12 @@ #' h2o.init() #' #' # Import a sample binary outcome train/test set -#' train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") -#' test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") +#' train <- h2o.importFile( +#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv" +#' ) +#' test <- h2o.importFile( +#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv" +#' ) #' #' # Identify predictors and response #' y <- "response" diff --git a/h2o-r/h2o-package/R/w2vutils.R b/h2o-r/h2o-package/R/w2vutils.R index d7d163e52801..c1e60d0334fe 100644 --- a/h2o-r/h2o-package/R/w2vutils.R +++ b/h2o-r/h2o-package/R/w2vutils.R @@ -27,7 +27,10 @@ h2o.findSynonyms <- function(word2vec, word, count = 20) { res <- .h2o.__remoteSend(method="GET", "Word2VecSynonyms", model = word2vec@model_id, word = word, count = count) fr <- data.frame(synonym = res$synonyms, score = res$scores) - fr[with(fr, order(score, decreasing = TRUE)),] + if (length(fr) > 0) { + fr <- fr[with(fr, order(score, decreasing = TRUE)),] + } + fr } #' diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R index 6bd0e25d851f..9fbcecca52a9 100755 --- a/h2o-r/scripts/h2o-r-test-setup.R +++ b/h2o-r/scripts/h2o-r-test-setup.R @@ -2,7 +2,6 @@ .origEchoValue <- getOption("echo") options(echo=FALSE) options(scipen=999) -options(stringsAsFactors=T) #' #' diff --git a/h2o-r/tests/runitUtils/shared_javapredict.R b/h2o-r/tests/runitUtils/shared_javapredict.R index 661cbbef6bc8..54ba6a17cc72 100644 --- a/h2o-r/tests/runitUtils/shared_javapredict.R +++ b/h2o-r/tests/runitUtils/shared_javapredict.R @@ -68,7 +68,7 @@ doJavapredictTest <- function(model,test_file,test_frame,params, separator=",", safeSystem(cmd) print("Comparing predictions between H2O and Java POJO") - prediction2 <- read.csv(sprintf("%s/out_pojo.csv", tmpdir_name), header=T) + prediction2 <- read.csv(sprintf("%s/out_pojo.csv", tmpdir_name), header=T, stringsAsFactors=TRUE) if (nrow(prediction1) != nrow(prediction2)) { warning("Prediction mismatch") print(paste("Rows from H2O", nrow(prediction1))) diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R index 5ce5f56ebada..dfc606f794d3 100644 --- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R +++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R @@ -15,11 +15,12 @@ test.CoxPH.concordance <- function() { rModel <- coxph(Surv(time, status) ~ age + sex + meal.cal + age:meal.cal, data = tstdata, ties = "efron") rPredictor <- rModel$linear.predictors + hexModel <- h2o.coxph(x = c("age", "sex", "meal.cal"), interaction_pairs = list(c("age", "meal.cal")), event_column = "status", stop_column = "time", ties = "efron", training_frame = tstdataHex) hexPredictor <- pred(hexModel, tstdataHex) - - expect_equal(rPredictor, hexPredictor, scale = 1, tolerance = 1e-3) + + expect_equal(rPredictor - mean(rPredictor), hexPredictor, scale = 1, tolerance =
1e-3) rConcordance <- unname(summary(rModel)$concordance)[1] hexConcordance <- h2o.performance(hexModel, data=tstdataHex)@metrics$concordance diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R index ce2f0fcb5918..cce60312a173 100644 --- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R +++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R @@ -26,7 +26,7 @@ test.CoxPH.predict <- function() { check.pred <- function(r.model, hex.model, r.tstdata, hex.tstdata) { fit.pred <- pred.r(r.model, r.tstdata) hex.lp <- pred.h2o(hex.model, hex.tstdata) - expect_equal(fit.pred, hex.lp, tolerance = 1e-5, scale = 1) + expect_equal(fit.pred - mean(fit.pred), hex.lp, tolerance = 1e-5, scale = 1) } check.concordance <- function (rModel, hexModel, data, tolerance = 1e-3) { diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R index ced349a5ef72..64edb3092f6d 100644 --- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R +++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R @@ -16,7 +16,10 @@ pred.h2o <- function(model, data) { compare.results <- function(fit, hex.fit, tstdata, tstdata.hex) { fit.pred <- pred.r(fit, tstdata) hex.lp <- pred.h2o(hex.fit, tstdata.hex) - expect_equal(fit.pred, hex.lp, tolerance = 1e-7, scale = 1) + w <- tstdata$weights + if (is.null(w)) + w <- rep_len(1, length(fit.pred)) + expect_equal(fit.pred - weighted.mean(fit.pred, w, na.rm=TRUE), hex.lp, tolerance = 1e-7, scale = 1) } cancer.with.sex <- function () { diff --git a/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R b/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R index cbd36d6f3259..0982925868c3 100644 --- a/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R +++ b/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R @@ -10,7 +10,6 @@ test.CoxPH.mojo_interactions_impl <- function(stratify_by = NULL) { interaction_pairs = list(c("C1", "C3"), c("C1", "C2"), c("C3", "C4"), c("C4", "C2"), c("C1", "age"), c("surgery", "C3")), training_frame = training_frame) - browser() predict_h2o <- h2o.predict(coxph_h2o, training_frame) print(predict_h2o) @@ -21,7 +20,6 @@ test.CoxPH.mojo_interactions_impl <- function(stratify_by = NULL) { predict_mojo <- h2o.predict(coxph_mojo, training_frame) print(predict_mojo) - browser() expect_equal(as.data.frame(predict_h2o), as.data.frame(predict_mojo)) } diff --git a/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R b/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R index b34586373496..accf63f09824 100644 --- a/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R +++ b/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R @@ -40,7 +40,7 @@ test.GBM.ecology <- function() { print(ecology.sum) #import csv data for R to use - ecology.data <- read.csv(locate("smalldata/gbm_test/ecology_model.csv"), header = TRUE) + ecology.data <- read.csv(locate("smalldata/gbm_test/ecology_model.csv"), header = TRUE, stringsAsFactors=TRUE) ecology.data <- na.omit(ecology.data) #this omits NAs... does GBM do this? Perhaps better to model w/o doing this? 
Log.info("H2O GBM with parameters:\nntrees = 100, max_depth = 5, min_rows = 10, learn_rate = 0.1\n") diff --git a/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R b/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R index 29077cae8f5a..14101d3b5c3a 100644 --- a/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R +++ b/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R @@ -16,16 +16,15 @@ test <- function() { #htable= as.h2o(table.1.2,destination_frame = "htable") hh = h2o.gbm(x = 1:3,y = "medskad",training_frame = htable,distribution = "gamma",weights_column = "antskad", ntrees = 20,max_depth = 1,min_rows = 1,learn_rate = 1) - ph = as.vector(as.data.frame(h2o.predict(hh,newdata = htable))) - + ph = as.data.frame(h2o.predict(hh,newdata = htable))$predict #expect_equal(gg$initF,hh@model$init_f,tolerance = 1e-6) - #expect_equal(min(pr),min(ph[,1]),tolerance = 1e-6) - #expect_equal(max(pr),max(ph[,1]),tolerance = 1e-6) - #expect_equal(mean(pr),mean(ph[,1]),tolerance = 1e-6) + #expect_equal(min(pr),min(ph),tolerance = 1e-6) + #expect_equal(max(pr),max(ph),tolerance = 1e-6) + #expect_equal(mean(pr),mean(ph),tolerance = 1e-6) expect_equal(8.804447,hh@model$init_f,tolerance = 1e-6) - expect_equal(3751.01,min(ph[,1]),tolerance = 1e-4) - expect_equal(15291,max(ph[,1]),tolerance = 1e-4) - expect_equal(8119,mean(ph[,1]),tolerance = 1e-4) + expect_equal(3751.01,min(ph),tolerance = 1e-4) + expect_equal(15291,max(ph),tolerance = 1e-4) + expect_equal(8119,mean(ph),tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R b/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R index d41fd01ded8c..59d0ca896cbe 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R +++ b/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R @@ -24,7 +24,7 @@ test_constraints_redundant <- function() { }, error = function(e) { print("***") print(e) - expect_true(grepl("redundant and possibly conflicting linear constraints:", e)) + expect_true(grepl("redundant linear constraints:", e)) }) } diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R index ecfe7314453e..ad4ec4b0cf32 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R +++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R @@ -13,6 +13,8 @@ test.LiblineaR.airlines <- function() { Log.info("epsilon = 1E-4: Tolerance of termination criterion\n") Log.info(" cross = 0: No kfold cross-validation\n") + dimnames(test) <- dimnames(train) + LibR.m <- LiblineaR(train, trainLabels,type=0, epsilon=1E-4, cost=100) LibRpreds <- predict(LibR.m, test, proba=1, decisionValues=TRUE) LibRCM <- table(testLabels, LibRpreds$predictions) diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R index a40c585049ac..32d877981868 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R +++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R @@ -26,7 +26,7 @@ test <- function() { hh = h2o.glm(x = 2:31,y = 1,training_frame = frm,family = "binomial",offset_column = "off",lambda = 0) gr = glm(formula = y~X1+X2 + X3 +X4 +X5+X6+X7+X8+X9+X10+ X11+X12+X13+X14+X15+X16+X17+X18+X19+ X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30, family = "binomial",data = rfm,offset= rfm[,32]) - gg = glmnet(x = as.matrix(rfm[,-c(1,32)]),y = as.factor(rfm[,1]),family = "binomial",lambda =0,offse = rfm[,32]) + gg = glmnet(x = as.matrix(rfm[,-c(1,32)]),y = 
as.factor(rfm[,1]),family = "binomial",lambda =0, offset = rfm[,32]) print("compare results") expect_equal(gr$null.deviance, hh@model$training_metrics@metrics$null_deviance) expect_equal(gr$aic, hh@model$training_metrics@metrics$AIC,tolerance = 0.00001) @@ -34,7 +34,7 @@ test <- function() { expect_equal(gr$df.residual,hh@model$training_metrics@metrics$residual_degrees_of_freedom) #predictions ph = h2o.predict(object = hh,newdata = val) - pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]),offset = as.matrix(valid[,32]),type = "response") + pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]), newoffset = as.matrix(valid[,32]), offset = as.matrix(valid[,32]),type = "response") print("compare predictions") expect_equal(min(pr),min(ph$p1),tolerance = 0.0001) expect_equal(max(pr),max(ph$p1),tolerance = 0.0001) @@ -49,7 +49,7 @@ test <- function() { expect_equal(deviance(gg),hh@model$training_metrics@metrics$residual_deviance,tolerance = 0.00001) #predictions ph = h2o.predict(object = hh,newdata = val) - pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]),offset = as.matrix(valid[,32]),type = "response") + pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]), newoffset = as.matrix(valid[,32]), offset = as.matrix(valid[,32]),type = "response") print("compare predictions") expect_equal(min(pr),min(ph$p1),tolerance = 0.0001) expect_equal(max(pr),max(ph$p1),tolerance = 0.0001) diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R index 7c5d9517f050..a01cad170895 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R +++ b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R @@ -9,14 +9,13 @@ test.fractionalbinomial <- # Run the test #---------------------------------------------------------------------- - browser() params_prob_data <- setParmsData() # generate model parameters, random dataset modelAndDir<-buildModelSaveMojoGLM(params_prob_data$params) # build the model and save mojo filename = sprintf("%s/in.csv", modelAndDir$dirName) # save the test dataset into a in.csv file. 
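The buildModelSaveMojoGLM/mojoH2Opredict helpers shared by this and the neighboring runits implement the usual MOJO round trip. A minimal Python sketch of the same flow, with an illustrative dataset and GLM setup rather than the helpers' actual parameters:

    import tempfile
    import h2o
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator

    h2o.init()
    train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=["AGE", "PSA", "VOL"], y="CAPSULE", training_frame=train)
    # Export the model as a MOJO, load it back, and compare predictions.
    mojo_path = glm.download_mojo(path=tempfile.mkdtemp())
    mojo = h2o.import_mojo(mojo_path)
    in_h2o = glm.predict(train).as_data_frame()
    in_mojo = mojo.predict(train).as_data_frame()
    assert (in_h2o["p1"] - in_mojo["p1"]).abs().max() < 1e-4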
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R index b851c081610c..24cdef51e822 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R +++ b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R @@ -9,7 +9,6 @@ source("../../../scripts/h2o-r-test-setup.R") test_RID_binomial_compareR <- function() { fat <- h2o.importFile(locate("smalldata/glm_test/bodyfat.csv")) bodyfat <- as.data.frame(fat) - browser() rGlmBinomial <- glm(bmi ~ neck+density+hip, data=bodyfat, family=binomial()) dfbetasGlmB <- dfbetas(rGlmBinomial) hGlmBinomial <- h2o.glm(x=c("neck", "density", "hip"), y="bmi", lambda=0, family="binomial", standardize=FALSE, influence="dfbetas", training_frame=fat) diff --git a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R index 914c524d5c61..a92b1acfe66f 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R +++ b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R @@ -3,8 +3,8 @@ source("../../../scripts/h2o-r-test-setup.R") # add test from Erin Ledell glmBetaConstraints <- function() { - df <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + df <- h2o.importFile(locate("smalldata/higgs/higgs_train_10k.csv")) + test <- h2o.importFile(locate("smalldata/higgs/higgs_test_5k.csv")) y <- "response" x <- setdiff(names(df), y) diff --git a/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R b/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R index 158d91fb0d2c..f6aba8b57eec 100644 --- a/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R +++ b/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R @@ -7,12 +7,11 @@ test.glrm.pubdev.3788 <- function() { # Create data frame with a constant column data <- data.frame('NumericCol' = runif(50), 'ConstantCol' = rep(1, 50), - 'CategoricalCol' = sample(c("A", "B", "C", "D"), size = 50, replace = T)) + 'CategoricalCol' = sample(c("A", "B", "C", "D"), size = 50, replace = T), + stringsAsFactors = TRUE) data <- as.h2o(data) - browser() - # Specify loss by column and set ignore_const_cols to TRUE glrm_model <- h2o.glrm(data, k = 2, model_id = "glrm_test.hex", loss_by_col = c("Quadratic", "Categorical", "Categorical"), diff --git a/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R b/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R index de11ada0ccea..dceff0de9ca1 100644 --- a/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R +++ b/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R @@ -4,15 
+4,15 @@ source("../../../scripts/h2o-r-test-setup.R") test.IsolationForest.accuracy <- function() { - set.seed(1234) + set.seed(12345) N = 1e6 random_data <- data.frame( x = c(rnorm(N, 0, 0.5), rnorm(N*0.05, -2, 1)), y = c(rnorm(N, 0, 0.5), rnorm(N*0.05, 2, 1)), - outlier = c(rep("NO", N), rep("YES", (0.05*N))) + outlier = c(rep("NO", N), rep("YES", (0.05*N))), + stringsAsFactors = TRUE ) random_data.hex <- as.h2o(random_data) - # different approach than in the paper - build a smaller number of deeper trees trained on a much larger sample h2o_isolation_forest <- h2o.isolationForest(x = c("x", "y"), training_frame = random_data.hex[, c("x", "y")], ntrees = 25, seed = 1234, sample_rate = 0.7, min_rows = 1000, max_depth = 16) diff --git a/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R b/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R index b480a1d3551f..ae25a0f41b9e 100644 --- a/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R +++ b/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R @@ -6,7 +6,7 @@ testModelSelectionV <- function() { bhexFV2 <- h2o.uploadFile(locate("smalldata/logreg/prostate.csv")) Y <- "GLEASON" X <- c("AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS") - browser() + Log.info("Build the MaxRGLM model") allsubsetsModel <- h2o.modelSelection(y=Y, x=X, seed=12345, training_frame = bhexFV, max_predictor_number=2, mode="allsubsets") diff --git a/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R b/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R index 893f1ccc31a5..856cb38eeb63 100644 --- a/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R +++ b/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R @@ -3,7 +3,6 @@ source("../../../scripts/h2o-r-test-setup.R") # Make sure we can run with airline data test.pca.airline<- function() { - browser() dimD = 234 pp = h2o.uploadFile(locate("smalldata/airlines/AirlinesTest.csv.zip")) aa = h2o.prcomp(pp, k=dimD, transform="STANDARDIZE") diff --git a/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R b/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R index 41f6e6ca6cca..b151260dba8a 100644 --- a/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R +++ b/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R @@ -9,7 +9,6 @@ test.pca.la1s <- function() { run_time_c <- c() num_run <- 1 - browser() dataR <- h2o.importFile(locate("bigdata/laptop/jira/la1s.wc.arff.txt.zip"), sep = ',', destination_frame = "data", header = T, parse = FALSE) data <- h2o.parseRaw(dataR, destination_frame = "bigParse", parse_type = "CSV", header = T) # chunk_size = 124022500 size will make one chunk. 
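Stepping back to the isolation forest accuracy runit above: it draws a two-dimensional inlier cloud plus a shifted 5% outlier blob in R. A Python sketch of the same setup (sizes and seeds here are illustrative, not the runit's exact values):

    import numpy as np
    import h2o
    from h2o.estimators import H2OIsolationForestEstimator

    h2o.init()
    rng = np.random.default_rng(1234)
    n = 100000
    inliers = rng.normal([0.0, 0.0], 0.5, size=(n, 2))
    outliers = rng.normal([-2.0, 2.0], 1.0, size=(n // 20, 2))
    data = h2o.H2OFrame(np.vstack([inliers, outliers]).tolist(), column_names=["x", "y"])
    # Same shape as the runit: a few deep trees trained on large samples.
    isofor = H2OIsolationForestEstimator(ntrees=25, seed=1234, sample_rate=0.7,
                                         min_rows=1000, max_depth=16)
    isofor.train(x=["x", "y"], training_frame=data)
    # Higher anomaly scores should concentrate on the shifted outlier blob.
    scores = isofor.predict(data)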
diff --git a/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R b/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R index e4774d4048d8..fac014a57764 100644 --- a/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R +++ b/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R @@ -6,7 +6,6 @@ test.pca.arrests <- function() { Log.info("Importing USArrests.csv data...\n") arrests.hex <- h2o.uploadFile(locate("smalldata/pca_test/USArrests.csv")) arrests.pca.h2o <- h2o.prcomp(training_frame = arrests.hex, k = 1, seed=12345) - browser() pca_noK <- h2o.prcomp(training_frame = arrests.hex, seed=12345) pred1 <- h2o.predict(arrests.pca.h2o, arrests.hex) diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R index cf42dfd27aea..a60e44485568 100644 --- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R +++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R @@ -2,8 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) source("../../../scripts/h2o-r-test-setup.R") -library(randomForest) - test.DRF.bigcat <- function() { # Training set has 100 categories from cat001 to cat100 # Categories cat001, cat003, ... are perfect predictors of y = 1 @@ -25,10 +23,10 @@ test.DRF.bigcat <- function() { drfperf <- h2o.performance(drfmodel) expect_equal(h2o.auc(drfperf), 1) # No errors off the diagonal - default_cm <- h2o.confusionMatrix(drfmodel,bigcat.hex)[[1]] -# expect_equal(default_cm[1,2], 0) -# expect_equal(default_cm[2,1], 0) - + default_cm <- h2o.confusionMatrix(drfmodel,bigcat.hex) + print(default_cm) + expect_equal(default_cm[[1,2]], 0) + expect_equal(default_cm[[2,1]], 0) } doTest("DRF Test: Classification with 100 categorical level predictor", test.DRF.bigcat) diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R index 395ffa564b51..5b8546921a99 100644 --- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R +++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R @@ -2,8 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) source("../../../scripts/h2o-r-test-setup.R") -library(randomForest) - test.DRF.smallcat <- function() { # Training set has 26 categories from A to Z # Categories A, C, E, G, ... 
are perfect predictors of y = 1 @@ -33,25 +31,9 @@ test.DRF.smallcat <- function() { print(h2o.confusionMatrix(drfmodel,alphabet.hex)) expect_equal(h2o.auc(drfperf), 1) # No errors off the diagonal - default_cm <- h2o.confusionMatrix(drfmodel,alphabet.hex)[[1]] - #iexpect_equal(default_cm[1,2], 0) - #expect_equal(default_cm[2,1], 0) - - # Train R DRF Model: - # Log.info("R DRF with same parameters:") - # drfmodel.r <- randomForest(y ~ ., data = alphabet.data, ntree = 1, nodesize = 1) - # drfmodel.r.pred <- predict(drfmodel.r, alphabet.data, type = "response") - - # Compute confusion matrices - # Log.info("R Confusion Matrix:"); print(drfmodel.r$confusion) - # Log.info("H2O (Group Split) Confusion Matrix:"); print(drfmodel.grpsplit@model$confusion) - - # Compute the AUC - need to convert factors back to numeric - # actual <- ifelse(alphabet.data$y == "0", 0, 1) - # pred <- ifelse(drfmodel.r.pred == "0", 0, 1) - # R.auc = gbm.roc.area(actual, pred) - # Log.info(paste("R AUC:", R.auc, "\tH2O (Group Split) AUC:", drfmodel.grpsplit@model$AUC)) - + default_cm <- h2o.confusionMatrix(drfmodel,alphabet.hex) + expect_equal(default_cm[1,2], 0) + expect_equal(default_cm[2,1], 0) } doTest("DRF Test: Classification with 26 categorical level predictor", test.DRF.smallcat) diff --git a/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R b/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R new file mode 100644 index 000000000000..85aa6810eb7b --- /dev/null +++ b/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R @@ -0,0 +1,20 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") + + +test.word2vec.findSynonyms <- function() { + job_titles <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv", col.names = c("category", "jobtitle"), col.types = c("String", "String"), header = TRUE) + + words <- h2o.tokenize(job_titles, " ") + vec <- h2o.word2vec(training_frame = words) + + cnt <- 10 + syn <- h2o.findSynonyms(vec, "teacher", count = cnt) + expect_equal(length(syn$score), cnt) + + # GH-16192 h2o.findSynonyms returns an empty dataset if there are no synonyms to find + syn2 <- h2o.findSynonyms(vec, "Tteacher", count = cnt) + expect_equal(length(syn2$score), 0) +} + +doTest("Test findSynonyms function", test.word2vec.findSynonyms) diff --git a/h2o-r/tests/testdir_golden/runit_pca_5_golden.R b/h2o-r/tests/testdir_golden/runit_pca_5_golden.R index b312391c63b1..d0fa3e841aa3 100644 --- a/h2o-r/tests/testdir_golden/runit_pca_5_golden.R +++ b/h2o-r/tests/testdir_golden/runit_pca_5_golden.R @@ -5,7 +5,7 @@ source("../../scripts/h2o-r-test-setup.R") test.poison.golden <- function() { Log.info("Importing poison.csv data...") - poisonR <- read.csv(locate("smalldata/pca_test/poison.csv"), header = TRUE) + poisonR <- read.csv(locate("smalldata/pca_test/poison.csv"), header = TRUE, stringsAsFactors = TRUE) poisonH2O <- h2o.uploadFile(locate("smalldata/pca_test/poison.csv"), destination_frame = "poisonH2O") k_test <- sort(sample(1:8,3)) diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R index e6088d867185..92ced338b1d2 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R @@ -15,7 +15,7 @@ test.ordinalGlm.mojo <-
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R index 548675cbf166..affa516fb7f8 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R @@ -15,7 +15,7 @@ test.multinomialGlm.mojo <- h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R index 15f58343b0f4..f6695a8c9fad 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R @@ -15,7 +15,7 @@ test.ordinalGlm.mojo <- h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R index 64fb8ed18062..d899777e056e 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R @@ -16,7 +16,6 @@ test.gbm.leaf.assignment.mojo <- h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, get_leaf_node_assignment=TRUE) # perform H2O and mojo prediction and return frames print("Finished mojo. 
Going to compare two frames") - browser() print(twoFrames) compareStringFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1) }, error = function(x) x) diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R index 3f950ad67e2f..aee634d0ac2a 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R @@ -20,7 +20,7 @@ test.binomial.gam.mojo <- h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) h2o.downloadCSV(twoFrames$mojoPredict, - sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames( twoFrames$h2oPredict, twoFrames$mojoPredict, diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R index 8a6fd74918c1..93b1b0e02748 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R @@ -41,7 +41,7 @@ test.GAM.quasibinomial <- function() { h2o.downloadCSV(htest[1:100, x], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("enum", "numeric", "numeric")) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) twoFrames$h2oPredict[,1] <- h2o.asfactor(twoFrames$h2oPredict[,1]) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R index 8d2045e9a0cd..4de31d3874c4 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R @@ -26,7 +26,7 @@ test.GAM.binomial <- function() { h2o.downloadCSV(test, filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("enum", "numeric", "numeric")) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) twoFrames$h2oPredict[,1] <- h2o.asfactor(twoFrames$h2oPredict[,1]) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R index 1e5d5b7151e4..507b769fe917 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R @@ -23,7 +23,7 @@ test.GLM.offset.fractionalbinomial <- function() { h2o.downloadCSV(hf[1:100, xOffset], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, 
filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6) } diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R index 065aa7f83af5..65bced0aca32 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R @@ -37,7 +37,7 @@ test.GLM.offset.quasibinomial <- function() { h2o.downloadCSV(hf[1:100, xOffset], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) predictFrame <- twoFrames$h2oPredict[, 2:3] mojoFrame <- twoFrames$mojoPredict[, 2:3] compareFrames(predictFrame, mojoFrame, prob=1, tolerance = 1e-6) diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R index 7d3483d199be..3c1699f448f7 100644 --- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R +++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R @@ -24,7 +24,7 @@ test.GLM.offset.tweedie <- function() { h2o.downloadCSV(hf[1:100, xOffset], filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6) } diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R index b0d9f584e784..08d4040d16a9 100644 --- a/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R @@ -19,7 +19,6 @@ test <- test_file <- locate("smalldata/logreg/prostate_train_null_column_name.csv") test_frame <- h2o.importFile(test_file) params = prepTest() - browser() doJavapredictTest("gbm",test_file,test_frame,params) # make sure original code run # check a separator that is not a special character @@ -53,4 +52,4 @@ prepTest <- function() { return(params) } -doTest("pubdev-4531: PredictCsv test", test) \ No newline at end of file +doTest("pubdev-4531: PredictCsv test", test) diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R index 0cfe66c57166..2d13ea3eeb89 100644 --- 
a/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R +++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R @@ -16,7 +16,7 @@ test.PCA.mojo <- h2o.downloadCSV(params_prob_data$tDataset, filename) twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName)) - h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname)) + h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName)) compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4) } diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R index 7121f7694eae..df83640a3f4d 100644 --- a/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R +++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R @@ -18,7 +18,6 @@ test.mojo.setInvNumNA <- params$y <- "C2" params$family <- "gaussian" modelAndDir<-buildModelSaveMojoGLM(params) # build the model and save mojo - browser() modelPred <- h2o.predict(modelAndDir$model, testModel) # predict with invalid row value replaced with mean value # get genmodel.jar pathname a = strsplit(modelAndDir$dirName, '/') diff --git a/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R b/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R index 0c6b0b0ee094..4d3af484b75b 100644 --- a/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R +++ b/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R @@ -4,8 +4,8 @@ source("../../scripts/h2o-r-test-setup.R") # R behavior: Reports an error but keeps the frame as is test.pubdev.2800 <- function(conn){ - df <- h2o.importFile("http://h2o-smalldata.s3.amazonaws.com/jira/test_string_missing.csv") + df <- h2o.importFile("smalldata/jira/test_string_missing.csv") expect_false(is.na(df[3,2])) } -doTest("'0' Parsed incorrectly", test.pubdev.2800) \ No newline at end of file +doTest("'0' Parsed incorrectly", test.pubdev.2800) diff --git a/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R b/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R index 3a5c19945d4c..49005fb01a1f 100644 --- a/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R +++ b/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R @@ -4,38 +4,40 @@ source("../../scripts/h2o-r-test-setup.R") # problem with merge. 
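The merge runit below exercises all.x/all.y combinations with duplicate and non-overlapping keys; the stringsAsFactors = TRUE additions keep the key and result columns categorical under R >= 4.0 defaults. A Python analogue of the basic left-outer case it starts from (frame contents mirror the test, and method="radix" matches the R calls):

    import h2o

    h2o.init()
    left = h2o.H2OFrame({"topic": ["A", "B", "C", "D"], "value": [12, 13, 14, 15]})
    right = h2o.H2OFrame({"topic": ["Y", "B", "X", "D"], "bigValue": [10000, 20000, 30000, 40000]})
    # all_x=True keeps every row of `left`: topics A and C get NA for bigValue,
    # while B and D pick up 20000 and 40000 from `right`.
    merged = left.merge(right, all_x=True, method="radix")
    print(merged)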
test <- function() { # code from Kuba - left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15))) # [A, 12][B, 13][C, 14][D, 15] - right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000))) #[Y, 10000][B, 20000][X, 30000][D, 40000] + left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15), stringsAsFactors = TRUE)) # [A, 12][B, 13][C, 14][D, 15] + right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000), stringsAsFactors = TRUE)) #[Y, 10000][B, 20000][X, 30000][D, 40000] merged <- h2o.merge(right, left, all.x = TRUE, method="radix") - resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), bigvalue=c(20000, 40000, 30000, 10000), value = c(13, 15, NA, NA))) + resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), bigvalue=c(20000, 40000, 30000, 10000), value = c(13, 15, NA, NA), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic")) merged <- h2o.merge(left, right, all.y = TRUE, method="radix") - resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000))) + resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic")) merged <- h2o.merge(left, right, all.x = FALSE, all.y = FALSE, method="radix") - resultF <- as.h2o(data.frame(topic=c("B","D"), value = c(13, 15), bigvalue=c(20000, 40000))) + resultF <- as.h2o(data.frame(topic=c("B","D"), value = c(13, 15), bigvalue=c(20000, 40000), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic")) merged <- h2o.merge(right, left, all.x = FALSE, all.y = FALSE, method="radix") - resultF <- as.h2o(data.frame(topic=c("B","D"), bigvalue=c(20000, 40000), value = c(13, 15))) + resultF <- as.h2o(data.frame(topic=c("B","D"), bigvalue=c(20000, 40000), value = c(13, 15), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic")) # customer code left_hf <- as.h2o(data.frame(fruit = c(-177000000, -4000000, 100000000000, 200000000000, 1000000000000), - color <- c('red', 'orange', 'yellow', 'red', 'blue'))) - right_hf <- as.h2o(data.frame(fruit = c(-177000000), citrus <- c(FALSE))) + color = c('red', 'orange', 'yellow', 'red', 'blue'), stringsAsFactors = TRUE)) + right_hf <- as.h2o(data.frame(fruit = c(-177000000), citrus = c(FALSE))) merged <- h2o.merge(left_hf, right_hf, all.x = TRUE) resultF <- as.h2o(data.frame(fruit = c(100000000000,200000000000,1000000000000,-177000000,-4000000), - color=c('yellow','red','blue','red','orange'), citrus=c(NA, NA, NA, FALSE, NA))) + color=c('yellow','red','blue','red','orange'), citrus=c(NA, NA, NA, FALSE, NA), + stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) # left frame starts lower - left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256), color <- c('red', 'orange', 'yellow', 'red', 'blue'))) + left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256), color = c('red', 'orange', 'yellow', 'red', 'blue'), + stringsAsFactors = TRUE)) right_hf <- as.h2o( data.frame(fruit = c(258,518,517,1030,1028,1028,1030,2049), - citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE))) + citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE))) merged2 <- h2o.merge(left_hf, right_hf, all.y = TRUE) # H2O give wrong 
answer print(merged2) resultF <- as.h2o(data.frame(fruit=c(258,517,518,1028,1028,1030,1030,2049), @@ -45,13 +47,15 @@ test <- function() { merged <- h2o.merge(left_hf, right_hf, all.x = TRUE) print(merged) - resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257), color=c('yellow','red','orange','blue','red'), citrus=c(NA,NA,NA,NA,NA))) + resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257), color=c('yellow','red','orange','blue','red'), + citrus=c(NA,NA,NA,NA,NA), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) # both frame more or less overlapped left_hf <- as.h2o(data.frame(fruit = c(2,3,3,3,0,4,7,9,257,256,518,518,1028), color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', - 'cyan','red', 'orange', 'yellow', 'red', 'blue','negra'))) + 'cyan','red', 'orange', 'yellow', 'red', 'blue','negra'), + stringsAsFactors = TRUE)) right_hf <- as.h2o(data.frame(fruit = c(3,3,3,3,6,8,12,14,258,518,518,517,1030,1028,1028,1030,2049), citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE))) @@ -61,7 +65,8 @@ test <- function() { 'yellow','red','red','red','red','purple','cyan','red','yellow','orange', 'red','red','blue','blue','negra','negra'), citrus=c(NA,NA,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE, - NA,NA,NA,NA,NA,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE))) + NA,NA,NA,NA,NA,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE), + stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) merged <- h2o.merge(left_hf, right_hf, all.x=FALSE, all.y=FALSE) @@ -69,86 +74,87 @@ test <- function() { color=c('orange','orange','orange','orange','yellow','yellow','yellow','yellow', 'red','red','red','red','red','red','blue','blue','negra','negra'), citrus=c(TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE, - FALSE,FALSE,FALSE,FALSE,FALSE,FALSE))) + FALSE,FALSE,FALSE,FALSE,FALSE,FALSE), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged, "fruit"), h2o.arrange(resultF,"fruit")) # both frame with duplicate keys # left frame starts higher and with overlap - left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028), color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'))) - right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035), citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE))) + left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028), color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), stringsAsFactors = TRUE)) + right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035), citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE), stringsAsFactors = TRUE)) merged <- h2o.merge(left_hf, right_hf, all.x = FALSE, all.y=FALSE) - resultF <- as.h2o(data.frame(fruit=c(518, 1028), color=c('purple', 'cyan'), citrus=c(TRUE, TRUE))) + resultF <- as.h2o(data.frame(fruit=c(518, 1028), color=c('purple', 'cyan'), citrus=c(TRUE, TRUE), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) # left frame starts higher and no overlap left_hf <- as.h2o(data.frame(fruit = c(2,3,0,14,15,16,17), - color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'))) + color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), + stringsAsFactors = TRUE)) right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035), - citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE))) + 
citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE))) merged <- h2o.merge(left_hf, right_hf, all.x = FALSE, all.y=FALSE) print(merged) expect_true((nrow(merged) == 0 && ncol(merged) == 3), info="Merged frame and expected result are different in size.") merged <- h2o.merge(left_hf, right_hf, all.x = TRUE) resultF <-as.h2o(data.frame(fruit=c(0,2,3,14,15,16,17), color=c('yellow','red','orange','red', 'blue', 'purple', 'cyan'), - citrus=c(NA,NA,NA,NA,NA,NA,NA))) + citrus=c(NA,NA,NA,NA,NA,NA,NA), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) # code from Kuba - left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15))) # [A, 12][B, 13][C, 14][D, 15] - right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000))) #[Y, 10000][B, 20000][X, 30000][D, 40000] + left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15), stringsAsFactors = TRUE)) # [A, 12][B, 13][C, 14][D, 15] + right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000), stringsAsFactors = TRUE)) #[Y, 10000][B, 20000][X, 30000][D, 40000] merged <- h2o.merge(left, right, all.y = TRUE, method="radix") - resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000))) + resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic")) # example from Neema left_hf <- as.h2o(data.frame(fruit = c(-177000000, -4000000, 100000000000, 200000000000, 1000000000000), - color <- c('red', 'orange', 'yellow', 'red', 'blue'))) + color = c('red', 'orange', 'yellow', 'red', 'blue'), stringsAsFactors = TRUE)) right_hf <- as.h2o(data.frame(fruit = c(-177000000, -177000000), - citrus <- c(FALSE))) + citrus = c(FALSE))) merged <- h2o.merge(left_hf, right_hf, all.x = TRUE) resultF <- as.h2o(data.frame(fruit = c(100000000000,200000000000,1000000000000,-177000000,-177000000,-4000000), - color=c('yellow','red','blue','red','red','orange'), citrus=c(NA, NA, NA, FALSE, FALSE, NA))) + color=c('yellow','red','blue','red','red','orange'), citrus=c(NA, NA, NA, FALSE, FALSE, NA), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) merged <- h2o.merge(left_hf, right_hf, all.y = TRUE) resultF <- as.h2o(data.frame(fruit = c(-177000000,-177000000), - color=c('red','red'), citrus=c(FALSE, FALSE))) + color=c('red','red'), citrus=c(FALSE, FALSE), stringsAsFactors = TRUE)) assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit")) # more or less overlapped left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028), - color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'))) + color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), stringsAsFactors = TRUE)) right_hf <- as.h2o(data.frame(fruit = c(2,1,3,258,518,517,1030,1028,1030,1035,0), - citrus <- c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE))) + citrus = c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE))) merged <- h2o.merge(left_hf, right_hf, all.x = TRUE) resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257,518,1028), color=c('yellow','red','orange','blue','red','purple','cyan'), - citrus=c(TRUE, FALSE, FALSE, NA, NA, TRUE, TRUE))) + citrus=c(TRUE, FALSE, FALSE, NA, NA, TRUE, TRUE), stringsAsFactors = 
diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
index 70cd5b20f450..a3199523e3dd 100644
--- a/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
+++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
@@ -12,7 +12,8 @@ test.continuous.or.categorical <- function() {
   aa <- data.frame(
     h1 = c( 1, 8, 4, 3, 6),
     h2 = c('fish', 'cat', 'fish', 'dog', 'bird'),
-    h3 = c( 0, 1, 0, 0, 1)
+    h3 = c( 0, 1, 0, 0, 1),
+    stringsAsFactors = TRUE
   )
 
   df.hex <- as.h2o(aa)
diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
index f26476d67026..5b37d499a6c5 100644
--- a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
+++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
@@ -36,7 +36,7 @@ test.continuous.or.categorical <- function() {
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"),
                 col.names=c("C1","C2","C3","C4","C5","C6"),
                 col.types=list(by.col.name=c("C4"),types=c("Enum"))),
                 error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex2 <- h2o.importFile(locate("smalldata/iris/iris.csv"), col.types=c("Numeric","Numeric","Enum","Numeric","Enum"))
@@ -66,7 +66,7 @@ test.continuous.or.categorical <- function() {
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"),
                 col.names=c("C1","C2","C3","C4","C5","C6"),
                 col.types=list(by.col.name=c("C4"),types=c("Enum"))),
                 error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
@@ -98,7 +98,7 @@ test.continuous.or.categorical <- function() {
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"),
                 col.names=c("C1","C2","C3","C4","C5","C6"),
                 col.types=list(by.col.name=c("C4"),types=c("Enum"))),
                 error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"),
                             col.names=c("C1","C2","C3","C4","C5"),
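All three hunks in this file assert the same updated message; the rule it states is that length(col.names), minus length(skipped_columns) when skipped columns are given, must match the column count of the file. A sketch of the failing call (simplified from the tests above; whether the col.types argument is needed to trigger the identical message is untested here):

    # iris.csv has 5 columns, so 6 names without skipped_columns violates the rule
    e <- tryCatch(
      h2o.importFile(locate("smalldata/iris/iris.csv"),
                     col.names = c("C1", "C2", "C3", "C4", "C5", "C6")),
      error = function(x) x)
    print(e[[1]])  # expected: the "length of col.names ..." message asserted above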
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1383.R b/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
index 494c98a776ec..196b84bc2739 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
@@ -6,7 +6,7 @@ source("../../scripts/h2o-r-test-setup.R")
 test.pubdev.1383 <- function() {
   k <- 10
   Log.info("Importing fgl_tr.csv...")
-  fgl.dat <- read.csv(locate("smalldata/pca_test/fgl_tr.csv"))
+  fgl.dat <- read.csv(locate("smalldata/pca_test/fgl_tr.csv"), stringsAsFactors = TRUE)
   fgl.hex <- h2o.importFile(locate("smalldata/pca_test/fgl_tr.csv"))
 
   print(summary(fgl.hex))
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1398.R b/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
index bd72fbe2a9ad..c2be64abe8c9 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
@@ -6,7 +6,7 @@ source("../../scripts/h2o-r-test-setup.R")
 test.pubdev.1398 <- function() {
   k <- 13
   Log.info("Importing decathlon.csv...")
-  dec.dat <- read.csv(locate("smalldata/pca_test/decathlon.csv"))
+  dec.dat <- read.csv(locate("smalldata/pca_test/decathlon.csv"), stringsAsFactors = TRUE)
   dec.hex <- h2o.importFile(locate("smalldata/pca_test/decathlon.csv"))
 
   print(summary(dec.hex))
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1654.R b/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
index df947c113326..08e573da130e 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
@@ -8,7 +8,7 @@ test.pubdev.1654 <- function() {
   use_all_factor_levels <- FALSE
 
   Log.info("Importing birds.csv data...")
-  birds.dat <- read.csv(locate("smalldata/pca_test/birds.csv"), header = TRUE)
+  birds.dat <- read.csv(locate("smalldata/pca_test/birds.csv"), header = TRUE, stringsAsFactors = TRUE)
   birds.hex <- h2o.importFile(locate("smalldata/pca_test/birds.csv"))
 
   print(summary(birds.hex))
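The next several diffs also delete leftover browser() calls. browser() is an interactive debugging aid that suspends execution at the call site, so a call forgotten in a runit test stalls or derails an unattended run; the sketch below shows the only place it belongs (test name is illustrative):

    test.something <- function() {
      # browser()  # uncomment only when stepping through the test locally
      expect_true(1 + 1 == 2)
    }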
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R b/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
index e12f7694ffcc..4cca01509f0c 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
@@ -3,7 +3,6 @@ source("../../scripts/h2o-r-test-setup.R")
 # Test derived from Nidhi Mehta. Thanks.
 
 test.pubdev.5518 <- function() {
-  browser()
   N=1000
   set.seed(5)
   color = sample(c("D","E","I","F","M"),size=N,replace=TRUE)
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_8218.R b/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
index 357a1fc12a45..7b764ba52413 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
@@ -6,7 +6,8 @@ test.pubdev_8218 = function(){
   df = data.frame(
     v1 = c('Y', 'Y', 'Y', 'N', 'N'),
     v2 = c('S', 'S', 'S', 'A', 'A'),
-    v3 = c('E1', 'E1', 'E1', 'B1', 'B1')
+    v3 = c('E1', 'E1', 'E1', 'B1', 'B1'),
+    stringsAsFactors = TRUE
   )
   df.hex = as.h2o(df, 'dfhex')
   interaction = h2o.interaction(
@@ -30,7 +31,8 @@ test.pubdev_8218 = function(){
   df = data.frame(
     v1 = c('Y', 'Y', 'Y', 'N', 'N', 'Y'),
     v2 = c('S', 'S', 'S', 'A', 'A', 'N'),
-    v3 = c('E1', 'E1', 'E1', 'B1', 'B1', 'B1')
+    v3 = c('E1', 'E1', 'E1', 'B1', 'B1', 'B1'),
+    stringsAsFactors = TRUE
   )
   df.hex = as.h2o(df, 'dfhex')
   interaction = h2o.interaction(
diff --git a/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R b/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
index 0bd6e3507df1..8579e17b124c 100644
--- a/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
+++ b/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
@@ -7,7 +7,6 @@ test <- function() {
 
   ## Change CAPSULE to Enum
   prostate_hex[, "CAPSULE"] = as.factor(prostate_hex[, "CAPSULE"])
-  browser()
   ## Run Random Forest in H2O
   temp_filename_no_extension <- tempfile(pattern = "pdp", tmpdir = tempdir(), fileext = "")
   ## Calculate partial dependence using h2o.partialPlot for columns "AGE" and "RACE"
diff --git a/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R b/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
index 437a51ebf532..6dcc0dd6a3a4 100644
--- a/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
+++ b/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
@@ -22,7 +22,8 @@ test.as.h2o.sparse <- function() {
   j <- c(2, 9, 6:10, 46343)
   x <- pi * (1:8)
   m.large <- Matrix::sparseMatrix(i, j, x = x)
-  expect_error(as.matrix(m.large), "Cholmod error 'problem too large'")
+  # When we have enough memory, R 4.4 can create the matrix without failing
+  # expect_error(as.matrix(m.large), "Cholmod error 'problem too large'|vector memory limit of .* reached")
 
   Log.info("Loading a large sparse matrix into H2O")
   h2o.large <- as.h2o(m.large, "large_matrix")
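Why the expectation above had to be relaxed: densifying a sparse matrix materialises every cell at once. A sketch with illustrative dimensions (not the test's exact matrix):

    library(Matrix)
    # 3 x 50,000,000 with three stored values is cheap as a sparse object...
    m <- sparseMatrix(i = 1:3, j = c(1, 2, 50000000), x = c(1, 2, 3))
    dim(m)  # 3 50000000
    # ...but as.matrix(m) must allocate 1.5e8 doubles (~1.2 GB) in one vector,
    # which fails with a Cholmod or vector-memory error on small hosts while
    # succeeding outright on a machine with enough RAM under R 4.4.
    # as.matrix(m)  # uncomment to try; the outcome depends on available memory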
diff --git a/h2o-r/tests/testdir_misc/runit_h2oconfig.R b/h2o-r/tests/testdir_misc/runit_h2oconfig.R
index 0b52c13eb439..7afada31b7b0 100644
--- a/h2o-r/tests/testdir_misc/runit_h2oconfig.R
+++ b/h2o-r/tests/testdir_misc/runit_h2oconfig.R
@@ -88,7 +88,7 @@ test.config <- function() {
     password = password"),fileConn)
   #Parse config and check if correct
   config = .parse.h2oconfig(h2oconfig_filename)
-  expect_equal(config,data.frame(init.username = "name" ,init.password = "password"))
+  expect_equal(config,data.frame(init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
 
   #Create tmp config
   writeLines(c("[general]
@@ -99,7 +99,7 @@ test.config <- function() {
     password = password"),fileConn)
   #Parse config and check if correct
   config = .parse.h2oconfig(h2oconfig_filename)
-  expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password"))
+  expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
 
   #Create tmp config
   writeLines(c("
@@ -108,10 +108,10 @@ test.config <- function() {
     init.password = password"),fileConn)
   #Parse config and check if correct
   config = .parse.h2oconfig(h2oconfig_filename)
-  expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password"))
+  expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
 
   #Delete tmp directory
   on.exit(unlink(dir,recursive=TRUE))
 }
 
-doTest("Test h2o config parsing", test.config)
\ No newline at end of file
+doTest("Test h2o config parsing", test.config)
diff --git a/h2o-r/tests/testdir_misc/runit_ifelse.R b/h2o-r/tests/testdir_misc/runit_ifelse.R
index e7e06c3114d3..8733385ef00f 100644
--- a/h2o-r/tests/testdir_misc/runit_ifelse.R
+++ b/h2o-r/tests/testdir_misc/runit_ifelse.R
@@ -10,7 +10,7 @@ test.ifelse <- function() {
 
   Log.info("Find Setosa species H2O's ifelse...")
   setosa.hex <- ifelse(iris.hex$Species == "setosa", "N", "Y")
-  expect_equal(as.data.frame(setosa.hex), data.frame(C1 = setosa))
+  expect_equal(as.data.frame(setosa.hex), data.frame(C1 = setosa, stringsAsFactors = TRUE))
 }
 
 doTest("R and H2O ifelse Function", test.ifelse)
diff --git a/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R b/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
index 0400d1b1660b..b1b36cfebe2b 100644
--- a/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
+++ b/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
@@ -13,7 +13,7 @@ test.import_single_quoted <- function() {
   expect_true(h2o.ncol(hdf) == 20)
   expect_true(h2o.nrow(hdf) == 7)
 
-  df <- read.csv(path, quote="'")
+  df <- read.csv(path, quote="'", stringsAsFactors = TRUE)
   hddf <- as.data.frame(hdf)
   # comparing last column only as it's difficult to compare dataframes in R (always cryptic errors on some column):
   # if parsing was ok, last column should be identical, otherwise it should be shifted
@@ -29,7 +29,7 @@ test.upload_single_quoted <- function() {
   expect_true(h2o.ncol(hdf) == 20)
   expect_true(h2o.nrow(hdf) == 7)
 
-  df <- read.csv(path, quote="'")
+  df <- read.csv(path, quote="'", stringsAsFactors = TRUE)
   hddf <- as.data.frame(hdf)
   expect_equal(df['status'], hddf['status'])
 }
diff --git a/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R b/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
index ebc210f9c5e1..8254ea838708 100644
--- a/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
+++ b/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
@@ -14,7 +14,6 @@ testPartialPlots <- function() {
   assert_twoDTable_equal(h2o_pp_weight[[1]], h2o_pp_weight_NA[[1]]) # compare Input_miss pdp
   assert_twoDTable_equal(h2o_pp_weight[[2]], h2o_pp_weight_NA[[2]]) # compare fDayOfWeek pdp
-  browser()
   manual_weighted_stats_im <- manual_partial_dependency(airlines_gbm, airlines_hex, h2o_pp_weight_NA[[1]][[1]], "Input_miss", as.data.frame(airlines_hex["Weight"]), 3)
   assert_twoDTable_array_equal(h2o_pp_weight_NA[[1]], manual_weighted_stats_im[1,], manual_weighted_stats_im[2,], manual_weighted_stats_im[3,])
   manual_weighted_stats_day <- manual_partial_dependency(airlines_gbm, airlines_hex, h2o_pp_weight_NA[[2]][[1]], "fDayOfWeek", as.data.frame(airlines_hex["Weight"]), 3)
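The read.csv() change below matters because stats::relevel() accepts only factors: with the R >= 4.0 default, DPROS comes back as character and the comparison GLM cannot be re-leveled. What relevel() itself does, on illustrative data:

    f <- factor(c("Left", "None", "Both"))
    levels(f)                   # "Both" "Left" "None"  (alphabetical default)
    f2 <- relevel(f, ref = "None")
    levels(f2)                  # "None" "Both" "Left"  ("None" becomes the reference)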
diff --git a/h2o-r/tests/testdir_misc/runit_relevel.R b/h2o-r/tests/testdir_misc/runit_relevel.R
index 48b57b61620e..87d95edfc7ad 100644
--- a/h2o-r/tests/testdir_misc/runit_relevel.R
+++ b/h2o-r/tests/testdir_misc/runit_relevel.R
@@ -28,7 +28,7 @@ test.relevel <- function() {
   expect_true(("DPROS.Both" %in% ns2), "Both level IS NOT expected to be skipped in re-leveled column")
 
   # compare against R
-  dr <- read.csv(locate("smalldata/prostate/prostate_cat.csv"))
+  dr <- read.csv(locate("smalldata/prostate/prostate_cat.csv"), stringsAsFactors=TRUE)
   dr$DPROS <- relevel(dr$DPROS,"None")
   mr <- glm(data=dr,CAPSULE ~ ., family=binomial)
   print(mr)
diff --git a/h2o-r/tests/testdir_munging/unop/runit_head_tail.R b/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
index e9aa742a9a57..ee29a2fa23f7 100644
--- a/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
+++ b/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
@@ -6,7 +6,7 @@ source("../../../scripts/h2o-r-test-setup.R")
 test.head_tail <- function() {
   Log.info("Uploading iris/iris_wheader.csv")
   iris.hex <- h2o.importFile(locate("smalldata/iris/iris_wheader.csv"), "iris_wheader.hex")
-  iris.dat <- read.csv(locate("smalldata/iris/iris_wheader.csv"))
+  iris.dat <- read.csv(locate("smalldata/iris/iris_wheader.csv"), stringsAsFactors=TRUE)
 
   nrows <- nrow(iris.dat)
   ncols <- ncol(iris.dat)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R b/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
index 25fc429a970f..43d5f4a1d726 100644
--- a/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
+++ b/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
@@ -2,7 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source("../../scripts/h2o-r-test-setup.R")
 
 test.force_col_types <- function() {
-  browser()
   originalTypes <- c("real", "int", "int", "int", "int", "string", "real", "string", "real", "real", "enum", "int", "int", "int", "int", "enum", 'real', 'real', "enum", "enum", "enum", 'real', "int", "int", "enum", "enum", "string", "int", "int", "int", "int", "int", "int", "int", "enum", "int", "string", "int", "string", "int", "string", "string", 'real', "int", "string", "int", 'real', 'real', "int", "int")
   h2odata <- h2o.importFile(path = locate("smalldata/parser/synthetic_dataset.csv"))
   checkTypes(originalTypes, h2odata)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R b/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
index 86a4fa29c5e0..10925a38e511 100644
--- a/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
+++ b/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
@@ -2,7 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source("../../scripts/h2o-r-test-setup.R")
 
 test.force_col_types <- function() {
-  browser()
   originalTypes <- c("real", "int") # old H2O parse column types
   h2odata <- h2o.importFile(path = locate("smalldata/parser/parquet/df.parquet"))
   checkTypes(originalTypes, h2odata)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
new file mode 100644
index 000000000000..94c7e016ef9a
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
@@ -0,0 +1,10 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../scripts/h2o-r-test-setup.R")
+
+test.skipped_columns <- function() {
+  iris_hf <- as.h2o(iris, skipped_columns=c(1,2))
+  expect_true(ncol(iris_hf) == (ncol(iris)-2))
+  print("Columns are skipped!!!")
+}
+
+doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)
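This new test pins the GH-15947 fix, where as.h2o() errored because the skipped_columns argument was not reconciled with the parse_setup request it generates. The fixed behaviour, sketched on the same data the test uses (indices are 1-based):

    iris_hf <- as.h2o(iris, skipped_columns = c(1, 2))
    ncol(iris)     # 5
    ncol(iris_hf)  # 3 -- Sepal.Length and Sepal.Width are never parsed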
diff --git a/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R b/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R
new file mode 100644
index 000000000000..0ee55e1b639c
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R
@@ -0,0 +1,19 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f")))
+source("../../scripts/h2o-r-test-setup.R")
+
+test.parseParquetString <- function() {
+  df <- h2o.createFrame(rows = 100,
+                        cols = 10,
+                        string_fraction = 0.1, # create one string column
+                        seed = 5,
+                        seed_for_column_types = 25)
+  target <- file.path(sandbox(), "createdFrame.parquet")
+  h2o.exportFile(data = df,
+                 path = target,
+                 format = "parquet",
+                 write_checksum = FALSE)
+  df2 <- h2o.importFile(target)
+  compareFrames(df, df2)
+}
+
+doTest("Test Parquet String export error.", test.parseParquetString)
diff --git a/h2o-r/tests/testdir_parser/runit_parse_zstd.R b/h2o-r/tests/testdir_parser/runit_parse_zstd.R
new file mode 100644
index 000000000000..2db078cf9e18
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_parse_zstd.R
@@ -0,0 +1,14 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../scripts/h2o-r-test-setup.R")
+
+test.parseExportZSTD <- function() {
+  f1 <- h2o.importFile(locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
+
+  target <- file.path(sandbox(), "gaussian_20cols_10000Rows.csv.zst")
+  h2o.exportFile(f1, target)
+
+  f2 <- h2o.importFile(target)
+  compareFrames(f1, f2, prob=1)
+}
+
+doTest("Test ZSTD parser and export", test.parseExportZSTD)
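The two new parser tests select the output format differently; a sketch of both spellings (that the .zst suffix drives codec selection is inferred from the test above rather than verified against the h2o.exportFile() documentation):

    df <- h2o.createFrame(rows = 10, cols = 3, seed = 42)
    # Explicit format, as in the Parquet test; write_checksum = FALSE presumably
    # keeps checksum side-files out of the export directory
    h2o.exportFile(df, file.path(tempdir(), "frame.parquet"),
                   format = "parquet", write_checksum = FALSE)
    # Extension-driven compression, as in the ZSTD test
    h2o.exportFile(df, file.path(tempdir(), "frame.csv.zst"))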
diff --git a/scripts/jenkins/groovy/buildConfig.groovy b/scripts/jenkins/groovy/buildConfig.groovy
index d3ea58ba1fd4..a212fe3b2150 100644
--- a/scripts/jenkins/groovy/buildConfig.groovy
+++ b/scripts/jenkins/groovy/buildConfig.groovy
@@ -14,7 +14,7 @@ class BuildConfig {
   private static final String DEFAULT_HADOOP_IMAGE_NAME = 'dev-build-hadoop'
   private static final String DEFAULT_RELEASE_IMAGE_NAME = 'dev-release'
 
-  public static final int DEFAULT_IMAGE_VERSION_TAG = 44
+  public static final int DEFAULT_IMAGE_VERSION_TAG = 45
 
   public static final String AWSCLI_IMAGE = DOCKER_REGISTRY + '/opsh2oai/awscli'
   public static final String S3CMD_IMAGE = DOCKER_REGISTRY + '/opsh2oai/s3cmd'
@@ -99,7 +99,7 @@ class BuildConfig {
     changesMap[COMPONENT_HADOOP] = buildHadoop
     changedPythonTests = detectPythonTestChanges(changes)
 
-    nodeLabels = NodeLabels.findByBuildURL(context.env.BUILD_URL)
+    nodeLabels = NodeLabels.LABELS_C1
     supportedXGBEnvironments = [
       'centos7.3': [
         [name: 'CentOS 7.3 Minimal', dockerfile: 'xgb/centos/Dockerfile-centos-minimal', fromImage: 'centos:7.3.1611', targetName: XGB_TARGET_MINIMAL, nodeLabel: getDefaultNodeLabel()],
@@ -369,13 +369,13 @@ class BuildConfig {
   }
 
   static enum NodeLabels {
-    LABELS_C1('docker && !mr-0xc8', 'mr-0xc9', 'gpu && !2gpu', 'mr-0xk10'), //master or nightly build
-    LABELS_B4('docker', 'docker', 'gpu && !2gpu', 'docker') //PR build
+    LABELS_C1('docker && !mr-0xc8', 'mr-0xc9', 'gpu && !2gpu', 'mr-0xk10'), //master or nightly build - use only this one
+    LABELS_B4('docker', 'docker', 'gpu && !2gpu', 'docker') //PR build - not used
 
     static Map LABELS_MAP = [
       "c1": LABELS_C1,
       "g1": LABELS_C1, //mr-0xg1 was set as alias to mr-0xc1
-      "b4": LABELS_B4
+      "b4": LABELS_B4 // not used
     ]
 
     private final String defaultNodeLabel
@@ -405,16 +405,6 @@ class BuildConfig {
     String getGPUBenchmarkNodeLabel() {
       return gpuBenchmarkNodeLabel
     }
-
-    private static NodeLabels findByBuildURL(final String buildURL) {
-      final String name = buildURL.replaceAll('http://mr-0x', '').replaceAll(':8080.*', '')
-
-      if (LABELS_MAP.containsKey(name)) {
-        return LABELS_MAP.get(name)
-      } else {
-        throw new IllegalArgumentException(String.format("Master %s (%s) is unknown", name, buildURL))
-      }
-    }
   }
 }
diff --git a/scripts/jenkins/groovy/defineTestStages.groovy b/scripts/jenkins/groovy/defineTestStages.groovy
index e0a0dd38e8cd..d009bdb6905a 100644
--- a/scripts/jenkins/groovy/defineTestStages.groovy
+++ b/scripts/jenkins/groovy/defineTestStages.groovy
@@ -53,6 +53,10 @@ def call(final pipelineContext) {
       stageName: 'R4.0 Smoke', target: 'test-r-smoke', rVersion: '4.0.2',timeoutValue: 8,
       component: pipelineContext.getBuildConfig().COMPONENT_R
     ],
+    [
+      stageName: 'R4.4 Smoke', target: 'test-r-smoke', rVersion: '4.4.0',timeoutValue: 8,
+      component: pipelineContext.getBuildConfig().COMPONENT_R
+    ],
     [
       stageName: 'Flow Headless Smoke', target: 'test-flow-headless-smoke',timeoutValue: 36,
       component: pipelineContext.getBuildConfig().COMPONENT_JS
@@ -173,6 +177,14 @@ def call(final pipelineContext) {
       stageName: 'R4.0 CMD Check as CRAN', target: 'test-r-cmd-check-as-cran', rVersion: '4.0.2',
      timeoutValue: 20, hasJUnit: false, component: pipelineContext.getBuildConfig().COMPONENT_R
     ],
+    [
+      stageName: 'R4.4 Small', target: 'test-r-small', rVersion: '4.4.0',
+      timeoutValue: 190, component: pipelineContext.getBuildConfig().COMPONENT_R
+    ],
+    [
+      stageName: 'R4.4 CMD Check as CRAN', target: 'test-r-cmd-check-as-cran', rVersion: '4.4.0',
+      timeoutValue: 20, hasJUnit: false, component: pipelineContext.getBuildConfig().COMPONENT_R
+    ],
     [
       stageName: 'R3.5 Booklets', target: 'test-r-booklets', rVersion: '3.5.3',
       timeoutValue: 60, component: pipelineContext.getBuildConfig().COMPONENT_R
@@ -559,6 +571,10 @@ def call(final pipelineContext) {
       stageName: 'R4.0 Explain', target: 'test-r-explain', rVersion: '4.0.2',
       timeoutValue: 180, component: pipelineContext.getBuildConfig().COMPONENT_R
     ],
+    [
+      stageName: 'R4.4 Explain', target: 'test-r-explain', rVersion: '4.4.0',
+      timeoutValue: 180, component: pipelineContext.getBuildConfig().COMPONENT_R
+    ],
     [
       stageName: 'LOGGER initialization test', target: 'test-logger-initialize-properly', javaVersion: 8,
       timeoutValue: 10, component: pipelineContext.getBuildConfig().COMPONENT_JAVA
diff --git a/scripts/jenkins/jenkinsfiles/Jenkinsfile b/scripts/jenkins/jenkinsfiles/Jenkinsfile
index 7a2bf78c4050..1a6d9dfcb1c9 100644
--- a/scripts/jenkins/jenkinsfiles/Jenkinsfile
+++ b/scripts/jenkins/jenkinsfiles/Jenkinsfile
@@ -2,7 +2,7 @@ final String MODE_PR = 'MODE_PR'
 final String MODE_MASTER = 'MODE_MASTER'
 
-final String DEFAULT_NODE_LABEL = 'h2o-3 && docker && !mr-0xc8 && (!micro || micro_21)'
+final String DEFAULT_NODE_LABEL = 'h2o-3'
 
 final int HEALTH_CHECK_RETRIES = 5
 
 def defineTestStages = null
diff --git a/scripts/jenkins/jenkinsfiles/Jenkinsfile-PrismaScan b/scripts/jenkins/jenkinsfiles/Jenkinsfile-PrismaScan
index 263875e91ab0..0b36d48fb85f 100644
--- a/scripts/jenkins/jenkinsfiles/Jenkinsfile-PrismaScan
+++ b/scripts/jenkins/jenkinsfiles/Jenkinsfile-PrismaScan
@@ -3,8 +3,9 @@
 @Library('test-shared-library') _
 
 def dockerImage
+def trivyVersion = "0.54.1"
 
-def setPrismaScanningStages(assemblyType, stageIndex) {
+def setScanningStages(assemblyType, stageIndex) {
   branchName = "${env.BRANCH_NAME}".replace('/', '-')
   assemblyImage = "h2o-assemblies/${assemblyType}:${BUILD_NUMBER}-${branchName}"
 
@@ -13,13 +14,25 @@ def setScanningStages(assemblyType, stageIndex) {
       sh "docker build . -t ${assemblyImage} -f ./docker/prisma/Dockerfile.${assemblyType}jars"
     }
   }
-  stage ("${stageIndex}.B. Scan ${assemblyType} jar using Snyk") {
-    withCredentials([string(credentialsId: 'H2O_3_SNYK_TOKEN_JENKINS_TEXT', variable: 'SNYK_TOKEN')]) {
-      script {
-        sh "./snyk container test ${assemblyImage} --file=./docker/prisma/Dockerfile.${assemblyType}jars --severity-threshold=medium --app-vulns --nested-jars-depth=4 | tee ${assemblyImage}-snyk.out || true"
-      }
-      archiveArtifacts artifacts: "${assemblyImage}-snyk.out"
+  stage ("${stageIndex}.B. Scan ${assemblyType} jar using Trivy") {
+    script {
+      sh "./trivy image ${assemblyImage} --output ${assemblyImage}-trivy.out"
+      // Replace special characters with * in order to show it directly in browser
+      sh """
+        sed -i 's/─/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/│/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┤/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/├/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┼/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┐/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┌/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/└/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┘/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┬/*/g' ${assemblyImage}-trivy.out
+        sed -i 's/┴/*/g' ${assemblyImage}-trivy.out
+      """
     }
+    archiveArtifacts artifacts: "${assemblyImage}-trivy.out"
   }
   stage("${stageIndex}.C. Scan ${assemblyType} jar using Prisma") {
     script {
@@ -62,8 +75,9 @@ pipeline {
       dir("docker/prisma"){
        dockerImage = docker.build("node-java","-f Dockerfile .")
       }
-      sh "curl --compressed https://static.snyk.io/cli/latest/snyk-linux -o snyk"
-      sh "chmod +x ./snyk"
+      sh "wget https://github.com/aquasecurity/trivy/releases/download/v${trivyVersion}/trivy_${trivyVersion}_Linux-64bit.tar.gz"
+      sh "tar -zxvf trivy_${trivyVersion}_Linux-64bit.tar.gz"
+      sh "chmod +x ./trivy"
     }
   }
 
@@ -80,14 +94,14 @@ pipeline {
       }
     }
   }
-  stage('2. Steam assembly jar (Prisma)') {
+  stage('2. Steam assembly jar') {
     steps {
-      setPrismaScanningStages("steam", 2)
+      setScanningStages("steam", 2)
     }
   }
-  stage('3. Main assembly jar (Prisma)') {
+  stage('3. Main assembly jar') {
     steps {
-      setPrismaScanningStages("main", 3)
+      setScanningStages("main", 3)
     }
   }
 }
diff --git a/scripts/validate_r_cmd_check_output.py b/scripts/validate_r_cmd_check_output.py
index b9748ffda763..465fc2be6873 100644
--- a/scripts/validate_r_cmd_check_output.py
+++ b/scripts/validate_r_cmd_check_output.py
@@ -35,10 +35,17 @@ def process(self):
             r"^\* using log directory",
             r"^\* using R version",
             r"^\* using R Under development",
+            r"^\* R was compiled by",
+            r"^ Apple clang.*",
+            r"^ gcc.*",
+            r"^ GNU Fortran.*",
             r"^\* using platform",
+            r"^\* using platform:.*",
+            r"^\* running under:.*",
             r"^\* using session charset",
             r"^\* using option .*",
             r"^\* checking .* \.\.\. OK",
+            r"^\* checking .* \.\.\. \[\d+s/\d+s\] OK",
             r"^\* checking extension type ... Package",
             r"^\* this is package",