+ * This is required, as default {@link WebSocketServletFactory} uses {@link org.eclipse.jetty.util.DecoratedObjectFactory}
+ * to instantiate {@link WebSocketListener} classes. This class is only able to instantiate static classes with 0-arg constructor,
+ * which inner non-static class {@link Jetty9WebsocketHandler} is NOT.
+ */
+ public class H2OWebSocketCreator implements WebSocketCreator {
+
+ @Override
+ public Object createWebSocket(ServletUpgradeRequest req, ServletUpgradeResponse resp) {
+ return new Jetty9WebsocketHandler();
+ }
+ }
+}
diff --git a/h2o-parsers/h2o-avro-parser/build.gradle b/h2o-parsers/h2o-avro-parser/build.gradle
index be498665d8ed..d61f58043d90 100644
--- a/h2o-parsers/h2o-avro-parser/build.gradle
+++ b/h2o-parsers/h2o-avro-parser/build.gradle
@@ -6,7 +6,7 @@ description = "H2O Avro Parser"
dependencies {
api project(":h2o-core")
// Avro support
- api 'org.apache.avro:avro:1.11.3'
+ api 'org.apache.avro:avro:1.11.4'
testImplementation project(":h2o-test-support")
testRuntimeOnly project(":${defaultWebserverModule}")
diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java
index 8c35cc72c8b8..e1fc1f8a85ed 100644
--- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java
+++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java
@@ -112,7 +112,9 @@ public void map(Chunk[] cs) {
group = group.append(currColName, cs[j].at8(i));
break;
case (T_STR):
- group = group.append(currColName, cs[j].atStr(new BufferedString(), i).toString());
+ if (!cs[j].isNA(i)) {
+ group = group.append(currColName, cs[j].atStr(new BufferedString(), i).toString());
+ }
break;
case (T_CAT):
if (cs[j].isNA(i)) {
diff --git a/h2o-persist-gcs/build.gradle b/h2o-persist-gcs/build.gradle
index 5d1ba374ad2b..d07448510c39 100644
--- a/h2o-persist-gcs/build.gradle
+++ b/h2o-persist-gcs/build.gradle
@@ -4,10 +4,18 @@ description = "H2O Persist GCS"
dependencies {
api project(":h2o-core")
- api 'com.google.cloud:google-cloud-storage:2.13.1'
+ api ('com.google.cloud:google-cloud-storage:2.13.1')
testImplementation project(":h2o-test-support")
testRuntimeOnly project(":${defaultWebserverModule}")
+
+ constraints {
+ api('com.google.protobuf:protobuf-java:3.25.5') {
+ because 'Fixes CVE-2024-7254, ' +
+ 'SNYK-JAVA-COMGOOGLEPROTOBUF-8055227 and ' +
+ 'SNYK-JAVA-COMGOOGLEPROTOBUF-8055228'
+ }
+ }
}
apply from: "${rootDir}/gradle/dataCheck.gradle"
diff --git a/h2o-py/h2o/display.py b/h2o-py/h2o/display.py
index 6815c19a1f79..5b382433605c 100644
--- a/h2o-py/h2o/display.py
+++ b/h2o-py/h2o/display.py
@@ -5,6 +5,7 @@
:copyright: (c) 2016 H2O.ai
:license: Apache License Version 2.0 (see LICENSE for details)
"""
+# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
from contextlib import contextmanager
import os
import sys
diff --git a/h2o-py/h2o/estimators/decision_tree.py b/h2o-py/h2o/estimators/decision_tree.py
index e598396b2a82..0e7f391b515f 100644
--- a/h2o-py/h2o/estimators/decision_tree.py
+++ b/h2o-py/h2o/estimators/decision_tree.py
@@ -107,6 +107,22 @@ def ignore_const_cols(self):
Ignore constant columns.
Type: ``bool``, defaults to ``True``.
+
+ :examples:
+
+ >>> import h2o
+ >>> from h2o.estimators import H2ODecisionTreeEstimator
+ >>> h2o.init()
+ >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+ >>> target_variable = 'CAPSULE'
+ >>> prostate[target_variable] = prostate[target_variable].asfactor()
+ >>> prostate["const_1"] = 6
+ >>> train, test = prostate.split_frame(ratios=[0.7])
+ >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex",
+ ... max_depth=5,
+ ... ignore_const_cols=True)
+ >>> sdt_h2o.train(y=target_variable, training_frame=train)
+ >>> pred_test = sdt_h2o.predict(test)
"""
return self._parms.get("ignore_const_cols")
@@ -122,6 +138,22 @@ def categorical_encoding(self):
Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
"sort_by_response", "enum_limited"]``, defaults to ``"auto"``.
+
+ :examples:
+
+ >>> import h2o
+ >>> from h2o.estimators import H2ODecisionTreeEstimator
+ >>> h2o.init()
+ >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+ >>> target_variable = 'CAPSULE'
+ >>> prostate["RACE"] = prostate["RACE"].asfactor()
+ >>> prostate[target_variable] = prostate[target_variable].asfactor()
+ >>> train, test = prostate.split_frame(ratios=[0.7])
+ >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex",
+ ... max_depth=5,
+ ... categorical_encoding="binary")
+ >>> sdt_h2o.train(y=target_variable, training_frame=train)
+ >>> pred_test = sdt_h2o.predict(test)
"""
return self._parms.get("categorical_encoding")
@@ -164,6 +196,20 @@ def max_depth(self):
Max depth of tree.
Type: ``int``, defaults to ``20``.
+
+ :examples:
+
+ >>> import h2o
+ >>> from h2o.estimators import H2ODecisionTreeEstimator
+ >>> h2o.init()
+ >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+ >>> target_variable = 'CAPSULE'
+ >>> prostate[target_variable] = prostate[target_variable].asfactor()
+ >>> train, test = prostate.split_frame(ratios=[0.7])
+ >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex",
+ ... max_depth=5)
+ >>> sdt_h2o.train(y=target_variable, training_frame=train)
+ >>> pred_test = sdt_h2o.predict(test)
"""
return self._parms.get("max_depth")
@@ -178,6 +224,21 @@ def min_rows(self):
Fewest allowed (weighted) observations in a leaf.
Type: ``int``, defaults to ``10``.
+
+ :examples:
+
+ >>> import h2o
+ >>> from h2o.estimators import H2ODecisionTreeEstimator
+ >>> h2o.init()
+ >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
+ >>> target_variable = 'CAPSULE'
+ >>> prostate[target_variable] = prostate[target_variable].asfactor()
+ >>> train, test = prostate.split_frame(ratios=[0.7])
+ >>> sdt_h2o = H2ODecisionTreeEstimator(model_id="decision_tree.hex",
+ ... max_depth=5,
+ ... min_rows=20)
+ >>> sdt_h2o.train(y=target_variable, training_frame=train)
+ >>> pred_test = sdt_h2o.predict(test)
"""
return self._parms.get("min_rows")
diff --git a/h2o-py/h2o/estimators/rulefit.py b/h2o-py/h2o/estimators/rulefit.py
index 529b371780ea..be80309794b0 100644
--- a/h2o-py/h2o/estimators/rulefit.py
+++ b/h2o-py/h2o/estimators/rulefit.py
@@ -206,6 +206,22 @@ def algorithm(self):
The algorithm to use to generate rules.
Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... algorithm="gbm",
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("algorithm")
@@ -220,6 +236,22 @@ def min_rule_length(self):
Minimum length of rules. Defaults to 3.
Type: ``int``, defaults to ``3``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... min_rule_length=4,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("min_rule_length")
@@ -234,6 +266,22 @@ def max_rule_length(self):
Maximum length of rules. Defaults to 3.
Type: ``int``, defaults to ``3``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... min_rule_length=3,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("max_rule_length")
@@ -249,6 +297,21 @@ def max_num_rules(self):
by diminishing returns in model deviance.
Type: ``int``, defaults to ``-1``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=3,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("max_num_rules")
@@ -263,6 +326,22 @@ def model_type(self):
Specifies type of base learners in the ensemble.
Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... model_type="rules",
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("model_type")
@@ -298,6 +377,22 @@ def distribution(self):
Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
"quantile", "huber"]``, defaults to ``"auto"``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... distribution="bernoulli",
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("distribution")
@@ -312,6 +407,22 @@ def rule_generation_ntrees(self):
Specifies the number of trees to build in the tree model. Defaults to 50.
Type: ``int``, defaults to ``50``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... rule_generation_ntrees=60,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("rule_generation_ntrees")
@@ -370,6 +481,22 @@ def max_categorical_levels(self):
for categorical_encoding == EnumLimited.
Type: ``int``, defaults to ``10``.
+
+ :examples:
+
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... max_categorical_levels=11,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> print(rfit.rule_importance())
"""
return self._parms.get("max_categorical_levels")
@@ -385,6 +512,21 @@ def rule_importance(self):
Retrieve rule importances for a Rulefit model
:return: H2OTwoDimTable
+
+ :examples:
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+ >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+ >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+ >>> y = "survived"
+ >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+ ... max_num_rules=100,
+ ... seed=1)
+ >>> rfit.train(training_frame=df, x=x, y=y)
+ >>> rule_importance = rfit.rule_importance()
+ >>> print(rule_importance)
"""
if self._model_json["algo"] != "rulefit":
raise H2OValueError("This function is available for Rulefit models only")
@@ -397,11 +539,29 @@ def rule_importance(self):
def predict_rules(self, frame, rule_ids):
"""
- Evaluates validity of the given rules on the given data.
+ Evaluates validity of the given rules on the given data.
:param frame: H2OFrame on which rule validity is to be evaluated
:param rule_ids: string array of rule ids to be evaluated against the frame
:return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not.
+
+ :examples:
+ >>> import h2o
+ >>> h2o.init()
+ >>> from h2o.estimators import H2ORuleFitEstimator
+ >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv"
+ >>> df = h2o.import_file(path=f, col_types={'species': "enum"})
+ >>> x = df.columns
+ >>> y = "species"
+ >>> x.remove(y)
+ >>> train, test = df.split_frame(ratios=[.8], seed=1234)
+ >>> rfit = H2ORuleFitEstimator(min_rule_length=4,
+ ... max_rule_length=5,
+ ... max_num_rules=3,
+ ... seed=1234,
+ ... model_type="rules")
+ >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
+ >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica']))
"""
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
index 42726858deba..432e197016cb 100644
--- a/h2o-py/h2o/frame.py
+++ b/h2o-py/h2o/frame.py
@@ -28,7 +28,7 @@
from h2o.utils.metaclass import deprecated_fn
from h2o.utils.shared_utils import(gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
- check_frame_id, can_use_datatable, can_use_polars, can_use_pyarrow)
+ check_frame_id, can_use_polars, can_use_pyarrow)
from h2o.utils.threading import local_context, local_env
from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type, numeric, numpy_ndarray,
numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -1942,17 +1942,16 @@ def structure(self):
else:
print("num {}".format(" ".join(it[0] if it else "nan" for it in h2o.as_list(self[:10, i], False)[1:])))
- def as_data_frame(self, use_pandas=True, header=True):
+ def as_data_frame(self, use_pandas=True, header=True, use_multi_thread=False):
"""
Obtain the dataset as a python-local object.
:param bool use_pandas: If True (default) then return the H2OFrame as a pandas DataFrame (requires that the
``pandas`` library was installed). If False, then return the contents of the H2OFrame as plain nested
- list, in a row-wise order. The conversion to pandas frame will use multi-thread whenever
- possible with the right python modules (datatable or polars and pyarrow) installed. Otherwise, single
- thread operation will be used in the conversion.
+ list, in a row-wise order.
:param bool header: If True (default), then column names will be appended as the first row in list
-
+ :param bool use_multi_thread: If True (default False), use polars and pyarrow to perform a faster,
+ multi-threaded conversion; if they cannot be used, a single-thread conversion is done instead.
:returns: A python object (a list of lists of strings, each list is a row, if ``use_pandas=False``, otherwise
a pandas DataFrame) containing this H2OFrame instance's data.
@@ -1969,22 +1968,19 @@ def as_data_frame(self, use_pandas=True, header=True):
"""
if can_use_pandas() and use_pandas:
import pandas
- if (can_use_datatable()) or (can_use_polars() and can_use_pyarrow()): # can use multi-thread
- exportFile = tempfile.NamedTemporaryFile(suffix=".h2oframe2Convert.csv", delete=False)
- try:
- exportFile.close() # needed for Windows
- h2o.export_file(self, exportFile.name, force=True)
- if can_use_datatable(): # use datatable for multi-thread by default
- return self.convert_with_datatable(exportFile.name)
- elif can_use_polars() and can_use_pyarrow(): # polar/pyarrow if datatable is not available
- return self.convert_with_polars(exportFile.name)
- finally:
- os.unlink(exportFile.name)
+ if use_multi_thread:
+ with local_context(polars_enabled=True): # turn on multi-thread toolboxes
+ if can_use_polars() and can_use_pyarrow(): # can use multi-thread
+ exportFile = tempfile.NamedTemporaryFile(suffix=".h2oframe2Convert.csv", delete=False)
+ try:
+ exportFile.close() # needed for Windows
+ h2o.export_file(self, exportFile.name, force=True)
+ return self.convert_with_polars(exportFile.name)
+ finally:
+ os.unlink(exportFile.name)
warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
- " multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow "
- "(for Python 3.10 or above) and activate it using:\n\n"+
- "with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):\n"
- " pandas_df = h2o_df.as_data_frame()\n", H2ODependencyWarning)
+ " multi-thread, install polars and pyarrow and use it as "
+ "pandas_df = h2o_df.as_data_frame(use_multi_thread=True)\n", H2ODependencyWarning)
return pandas.read_csv(StringIO(self.get_frame_data()), low_memory=False, skip_blank_lines=False)
from h2o.utils.csv.readers import reader
@@ -1998,18 +1994,6 @@ def convert_with_polars(self, fileName):
import polars as pl
dt_frame = pl.read_csv(fileName, null_values = "")
return dt_frame.to_pandas()
-
- def convert_with_datatable(self, fileName):
- import datatable as dt
- frameTypes = self.types
- validFrameTypes = {}
- for key, value in frameTypes.items():
- if value.startswith('int'):
- validFrameTypes[key] = dt.int64
- elif value.startswith("real"):
- validFrameTypes[key] = dt.float64
- dt_frame = dt.fread(fileName, na_strings=[""], columns=validFrameTypes)
- return dt_frame.to_pandas()
def save_to_hive(self, jdbc_url, table_name, format="csv", table_path=None, tmp_path=None):
"""
diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py
index 542cb117d872..db67d4fb93be 100644
--- a/h2o-py/h2o/h2o.py
+++ b/h2o-py/h2o/h2o.py
@@ -868,14 +868,22 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
if ind in skipped_columns:
use_type[ind]=False
- if column_names is not None:
+ if column_names is not None:
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
if (skipped_columns is not None) and len(skipped_columns)>0:
- if (len(column_names)) != parse_column_len:
+ # When a python object is converted to an H2OFrame, column_names includes all columns even if
+ # skipped columns are specified. In that case we need
+ # len(column_names)-len(skipped_columns)==parse_column_len.
+ # When a file is imported with skipped columns specified, column_names only contains the columns that
+ # are not skipped. In that case we need len(column_names)==parse_column_len.
+ # Combining the two, parsing is correct when either
+ # len(column_names)-len(skipped_columns)==parse_column_len or len(column_names)==parse_column_len holds,
+ # so we raise an error only when neither condition is satisfied.
+ if not((len(column_names) == parse_column_len) or ((len(column_names)-len(skipped_columns))==parse_column_len)):
raise ValueError(
- "length of col_names should be equal to the number of columns parsed: %d vs %d"
- % (len(column_names), parse_column_len))
- else:
+ "length of col_names minus length of skipped_columns should equal the number of columns parsed: "
+ "%d vs %d" % (len(column_names), parse_column_len))
+ else: # no skipped columns here
if len(column_names) != len(j["column_types"]): raise ValueError(
"length of col_names should be equal to the number of columns: %d vs %d"
% (len(column_names), len(j["column_types"])))
diff --git a/h2o-py/h2o/plot/_matplotlib.py b/h2o-py/h2o/plot/_matplotlib.py
index fa0b4212041f..d7e97caf541c 100644
--- a/h2o-py/h2o/plot/_matplotlib.py
+++ b/h2o-py/h2o/plot/_matplotlib.py
@@ -1,9 +1,11 @@
def get_matplotlib_pyplot(server, raise_if_not_available=False):
+ # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
try:
# noinspection PyUnresolvedReferences
import matplotlib
- matplotlib.use("Agg")
+ if server:
+ matplotlib.use("Agg")
try:
# noinspection PyUnresolvedReferences
import matplotlib.pyplot as plt
@@ -25,6 +27,7 @@ def get_matplotlib_pyplot(server, raise_if_not_available=False):
def get_polycollection(server, raise_if_not_available=False):
+ # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
try:
from matplotlib.collections import PolyCollection as polycoll
return polycoll
@@ -36,6 +39,7 @@ def get_polycollection(server, raise_if_not_available=False):
def get_matplotlib_cm(function_name):
+ # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
try:
from matplotlib import cm
return cm
@@ -45,6 +49,7 @@ def get_matplotlib_cm(function_name):
def get_mplot3d_axes(function_name):
+ # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
try:
# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
diff --git a/h2o-py/h2o/plot/_plot_result.py b/h2o-py/h2o/plot/_plot_result.py
index ad8b3bab2d87..25e6642957af 100644
--- a/h2o-py/h2o/plot/_plot_result.py
+++ b/h2o-py/h2o/plot/_plot_result.py
@@ -1,5 +1,6 @@
# -*- encoding: utf-8 -*-
# mutable versions of py immutable types
+# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
from h2o.exceptions import H2OError
__no_export = set(dir()) # all variables defined above this are not exported
diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
index 67b622fd62c6..89356c1144ed 100644
--- a/h2o-py/h2o/utils/shared_utils.py
+++ b/h2o-py/h2o/utils/shared_utils.py
@@ -137,28 +137,19 @@ def is_module_enabled(mod):
def can_use_pandas():
return is_module_available('pandas')
-
-def can_use_datatable():
- return is_module_enabled('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
-
-
-def can_install_datatable():
- return sys.version_info.major == 3 and sys.version_info.minor <= 9
-
-
def can_install_polars():
- return sys.version_info.major == 3 and sys.version_info.minor > 9
+ return sys.version_info.major == 3 and sys.version_info.minor >= 6
def can_use_polars():
- return is_module_enabled('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
+ return is_module_enabled('polars') and sys.version_info.major == 3 and sys.version_info.minor >= 6
def can_use_pyarrow():
- if can_use_pandas() and sys.version_info.minor > 9:
+ if can_use_pandas() and sys.version_info.minor >= 6:
import pandas
- return is_module_available('pyarrow') and sys.version_info.major == 3 and sys.version_info.minor > 9 and \
- sys.version_info.major == 3 and float(pandas.__version__[0]) >= 1
+ return is_module_available('pyarrow') and sys.version_info.major == 3 and sys.version_info.minor >= 6 and \
+ float(pandas.__version__[0]) >= 1
else:
return False
diff --git a/h2o-py/tests/pyunit_utils/__init__.py b/h2o-py/tests/pyunit_utils/__init__.py
index b2987ec0859d..a8415192911b 100644
--- a/h2o-py/tests/pyunit_utils/__init__.py
+++ b/h2o-py/tests/pyunit_utils/__init__.py
@@ -3,3 +3,4 @@
from .utils_model_custom_distribution import *
from .utils_for_glm_tests import *
from .sklearn_multinomial_auc_method import roc_auc_score
+from .utils_parser_tests import *
diff --git a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py b/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
index 010845eb56e0..f993375e9578 100644
--- a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
+++ b/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py
@@ -1,4 +1,3 @@
-import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator as glm
from h2o.exceptions import H2OValueError
from h2o.grid.grid_search import H2OGridSearch
diff --git a/h2o-py/tests/pyunit_utils/utils_parser_tests.py b/h2o-py/tests/pyunit_utils/utils_parser_tests.py
new file mode 100644
index 000000000000..4993887032eb
--- /dev/null
+++ b/h2o-py/tests/pyunit_utils/utils_parser_tests.py
@@ -0,0 +1,45 @@
+from tests import pyunit_utils
+import h2o
+import time
+import pandas as pd
+
+def test_frame_conversion(dataset, original_pandas_frame):
+ # convert frame using polars/pyarrow
+ h2oframe = h2o.import_file(pyunit_utils.locate(dataset))
+ test_frames_conversion(h2oframe, original_pandas_frame)
+
+def test_frames_conversion(h2oframe, original_pandas_frame):
+ start_time = time.time()
+ new_pandas_frame = h2oframe.as_data_frame(use_multi_thread=True)
+ new_time = time.time()-start_time
+ print("H2O frame to Pandas frame conversion time with multi-thread using module polars/pyarrow: {0}".format(new_time))
+ # compare two frames column types
+ new_types = new_pandas_frame.dtypes
+ old_types = original_pandas_frame.dtypes
+ ncol = h2oframe.ncol
+ col_names = new_pandas_frame.columns
+
+ for ind in list(range(ncol)):
+ assert new_types[col_names[ind]] == old_types[col_names[ind]], "Expected column types: {0}, actual column types: " \
+ "{1}".format(old_types[col_names[ind]], new_types[col_names[ind]])
+ if new_types[col_names[ind]] == "object":
+ diff = new_pandas_frame[col_names[ind]] == original_pandas_frame[col_names[ind]]
+ if not diff.all(): # difference caused by the presence of NAs
+ new_series = pd.Series(new_pandas_frame[col_names[ind]])
+ new_NA = new_series.isna()
+ old_series = pd.Series(original_pandas_frame[col_names[ind]])
+ old_NA = old_series.isna()
+ assert (new_NA==old_NA).all()
+ else:
+ diff = (new_pandas_frame[col_names[ind]] - original_pandas_frame[col_names[ind]]).abs()
+ assert diff.max() < 1e-10
+
+
+def single_thread_pandas_conversion(dataset):
+ print("converting h2o frame to pandas frame using single thread")
+ h2oframe = h2o.import_file(pyunit_utils.locate(dataset))
+ start_time = time.time()
+ h2oframe_panda = h2oframe.as_data_frame()
+ new_time = time.time()-start_time
+ print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(new_time, dataset))
+ return h2oframe_panda
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py
new file mode 100644
index 000000000000..c9a81a79a75f
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_bad_constraints_large.py
@@ -0,0 +1,156 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+import numpy as np
+import pandas as pd
+from tests import pyunit_utils
+
+# this test needs to run to completion with duplicate/conflicting constraints
+def data_prep(seed):
+ np.random.seed(seed)
+ x1 = np.random.normal(0, 10, 100000)
+ x2 = np.random.normal(10, 100 , 100000)
+ x3 = np.random.normal(20, 200, 100000)
+ x4 = np.random.normal(30, 3000, 100000)
+ x5 = np.random.normal(400, 4000, 100000)
+
+ y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+ y = np.random.normal(y_raw, 20)
+
+ data = {
+ 'x1': x1,
+ 'x2': x2,
+ 'x3': x3,
+ 'x4': x4,
+ 'x5': x5,
+ 'y': y,
+ }
+ return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_duplicate_conflicting_constraints():
+ train_data = data_prep(123)
+ family = 'gaussian'
+ link = 'identity'
+ nfolds = 0
+ lambda_ = 0.0
+ seed = 1234
+ calc_like = True
+ compute_p_values = True
+ solver = 'irlsm'
+ predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+ response = "y"
+
+ linear_constraints2 = []
+
+ name = "x2"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = -1
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x4"
+ values = -1
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x2"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x4"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ linear_constraints = h2o.H2OFrame(linear_constraints2)
+ linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+
+ params = {
+ "family" : family,
+ "link": link,
+ "lambda_" : lambda_,
+ "seed" : seed,
+ "nfolds" : nfolds,
+ "compute_p_values" : compute_p_values,
+ "calc_like" : calc_like,
+ "solver" : solver,
+ "linear_constraints": linear_constraints
+ }
+
+ model = glm(**params)
+ model.train(x = predictors, y = response, training_frame = train_data)
+ print(model.coef())
+ coef_constrained = model.coef()
+ print(glm.getConstraintsInfo(model))
+
+ params = {
+ "family" : family,
+ "link": link,
+ "lambda_" : lambda_,
+ "seed" : seed,
+ "nfolds" : nfolds,
+ "compute_p_values" : compute_p_values,
+ "calc_like" : calc_like,
+ "solver" : solver,
+ }
+
+ model_no_constraints = glm(**params)
+ model_no_constraints.train(x = predictors, y = response, training_frame = train_data)
+ coef_no_constraints = model_no_constraints.coef()
+ print("model built without constraints")
+ print(coef_no_constraints)
+ print("x2-x3: {0}".format(coef_no_constraints['x2']-coef_no_constraints['x3']))
+ print("x3-x4: {0}".format(coef_no_constraints['x3']-coef_no_constraints['x4']))
+ print("x2+x3+x4: {0}".format(coef_no_constraints['x2']+coef_no_constraints['x3']+coef_no_constraints['x4']))
+ # assert that model with linear constraints does a better job than model without constraints
+ assert (coef_constrained['x2']-coef_constrained['x3']) < (coef_no_constraints['x2']-coef_no_constraints['x3']), \
+ "Model built with constraints should be closer to the constraint x2-x3 <= 0"
+ assert (coef_constrained['x3']-coef_constrained['x4']) < (coef_no_constraints['x3']-coef_no_constraints['x4']), \
+ "Model built with constraints should be closer to the constraint x3-x4 <= 0"
+ assert (coef_constrained['x2']+coef_constrained['x3']+coef_constrained['x4']) < \
+ (coef_no_constraints['x2']+coef_no_constraints['x3']+coef_no_constraints['x4']), \
+ "Model built with constraints should be closer to the constraint x2+x3+x4 <= 0"
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(test_duplicate_conflicting_constraints)
+else:
+ test_duplicate_conflicting_constraints()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py
new file mode 100644
index 000000000000..7c7b18ef72c7
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_beta_constraint_NPE_large.py
@@ -0,0 +1,163 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+from tests import pyunit_utils
+import numpy as np
+import pandas as pd
+
+# For beta constraints, if only upper_bounds were specified, an NPE was thrown because the code expected both upper
+# and lower bounds to be specified. This has been fixed; the test trains with lower_bounds only, then upper_bounds only.
+def data_prep(seed):
+ np.random.seed(seed)
+ x1 = np.random.normal(0, 10, 100000)
+ x2 = np.random.normal(10, 100 , 100000)
+ x3 = np.random.normal(20, 200, 100000)
+ x4 = np.random.normal(30, 3000, 100000)
+ x5 = np.random.normal(400, 4000, 100000)
+
+ y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+ y = np.random.normal(y_raw, 20)
+
+ data = {
+ 'x1': x1,
+ 'x2': x2,
+ 'x3': x3,
+ 'x4': x4,
+ 'x5': x5,
+ 'y': y,
+ }
+ return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_bad_lambda_specification():
+ train_data = data_prep(123)
+ family = 'gaussian'
+ link = 'identity'
+ nfolds = 0
+ lambda_ = 0.0
+ seed = 1234
+ calc_like = True
+ compute_p_values = True
+ solver = 'irlsm'
+ predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+ response = "y"
+
+ # beta constraints
+ bc = []
+ name = 'x1'
+ lower_bound = 0.03
+ bc.append([name, lower_bound])
+
+ beta_constraints = h2o.H2OFrame(bc)
+ beta_constraints.set_names(["names", "lower_bounds"])
+
+ linear_constraints2 = []
+
+ name = "x2"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = -1
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x4"
+ values = -1
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x2"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x4"
+ values = 1
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+
+ linear_constraints = h2o.H2OFrame(linear_constraints2)
+ linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+
+ linear_constraints = h2o.H2OFrame(linear_constraints2)
+ linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+ # check lower bound of beta constraint will not generate error but lambda will.
+ params = {
+ "family" : family,
+ "link": link,
+ "lambda_" : lambda_,
+ "seed" : seed,
+ "nfolds" : nfolds,
+ "compute_p_values" : compute_p_values,
+ "calc_like" : calc_like,
+ "solver" : solver,
+ "linear_constraints": linear_constraints,
+ "beta_constraints": beta_constraints
+ }
+
+ model = glm(**params)
+ model.train(x = predictors, y = response, training_frame = train_data)
+ coefs = model.coef()
+ print(coefs)
+ print(glm.getConstraintsInfo(model))
+ # beta constraints should be satisfied
+ assert coefs["x1"] >= 0.03 or abs(coefs["x1"]-0.03) < 1e-6, "beta constraint x1 ({0}) >= 0.03 is violated!".format(coefs["x1"])
+
+ # beta constraints
+ bc = []
+ name = 'x1'
+ upper_bound = 1.5
+ bc.append([name, upper_bound])
+
+ beta_constraints2 = h2o.H2OFrame(bc)
+ beta_constraints2.set_names(["names", "upper_bounds"])
+
+ params['beta_constraints'] = beta_constraints2
+ model = glm(**params)
+ model.train(x = predictors, y = response, training_frame = train_data)
+ coefs = model.coef()
+ print(coefs)
+ print(glm.getConstraintsInfo(model))
+ # beta constraints should always be satisfied
+ assert coefs["x1"] <= 1.5 or abs(1.5-coefs["x1"])<1e-6, "beta constraint x1 ({0}) <= 1.5 is violated.".format(coefs["x1"])
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(test_bad_lambda_specification)
+else:
+ test_bad_lambda_specification()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py
new file mode 100644
index 000000000000..dc337ae33cf6
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16312_contrained_GLM_test_large.py
@@ -0,0 +1,108 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+from tests import pyunit_utils
+import numpy as np
+import pandas as pd
+
+def data_prep(seed):
+ np.random.seed(seed)
+ x1 = np.random.normal(0, 10, 100000)
+ x2 = np.random.normal(10, 100 , 100000)
+ x3 = np.random.normal(20, 200, 100000)
+ x4 = np.random.normal(30, 3000, 100000)
+ x5 = np.random.normal(400, 4000, 100000)
+
+ y_raw = np.sin(x1)*100 + np.sin(x2)*100 + x3/20 + x3/30 + x5/400
+ y = np.random.normal(y_raw, 20)
+
+ data = {
+ 'x1': x1,
+ 'x2': x2,
+ 'x3': x3,
+ 'x4': x4,
+ 'x5': x5,
+ 'y': y,
+ }
+ return h2o.H2OFrame(pd.DataFrame(data))
+
+def test_bad_linear_constraints():
+ train_data = data_prep(123)
+ family = 'gaussian'
+ link = 'identity'
+ nfolds = 0
+ lambda_ = 0
+ seed = 1234
+ calc_like = True
+ compute_p_values = True
+ solver = 'irlsm'
+ predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
+ response = "y"
+
+ linear_constraints2 = []
+
+ name = "x2"
+ values = 1
+ types = "Equal"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "x3"
+ values = 1
+ types = "Equal"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 0
+ types = "Equal"
+ contraint_numbers = 0
+ linear_constraints2.append([name, values, types, contraint_numbers])
+
+
+ linear_constraints = h2o.H2OFrame(linear_constraints2)
+ linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])
+
+ params3 = {
+ "family" : family,
+ "link": link,
+ "lambda_" : lambda_,
+ "seed" : seed,
+ "nfolds" : nfolds,
+ "compute_p_values" : compute_p_values,
+ "calc_like" : calc_like,
+ "solver" : solver,
+ "linear_constraints": linear_constraints,
+ "standardize": True,
+ }
+
+ glm3 = glm(**params3)
+ glm3.train(x = predictors, y = response, training_frame = train_data)
+ print(glm.getConstraintsInfo(glm3))
+ coef3 = glm3.coef()
+ print(glm3.coef())
+
+ params2 = {
+ "family" : family,
+ "link": link,
+ "lambda_" : lambda_,
+ "seed" : seed,
+ "nfolds" : nfolds,
+ "compute_p_values" : compute_p_values,
+ "calc_like" : calc_like,
+ "solver" : solver
+ }
+ glm2 = glm(**params2)
+ glm2.train(x = predictors, y = response, training_frame = train_data)
+ print("Models built without linear constraints")
+ coef2 = glm2.coef()
+ print(coef2)
+ print("x2 + x3: {0}".format(coef2["x2"]+coef2["x3"]))
+
+ # check that the model built with constraints is closer to the constraint than the model built without constraints
+ assert (coef3["x2"]+coef3["x3"])<(coef2["x2"]+coef2["x3"]), \
+ "models built with constraints should be closer to the constraints x2+x3 but is not."
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(test_bad_linear_constraints)
+else:
+ test_bad_linear_constraints()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
index 4c01ed1f67d7..569a4304268b 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py
@@ -195,13 +195,13 @@ def test_constraints_binomial():
print(glm.getConstraintsInfo(h2o_glm_default_init))
- assert abs(logloss-init_logloss)<2e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
+ assert abs(logloss-init_logloss)<1e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
"and initialized with optimal GLM {1} should equal but is not." \
"".format(logloss, init_logloss)
- assert logloss <= init_random_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \
+ assert abs(logloss-init_random_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
"and with random initial coefficients {1} but is" \
" not.".format(logloss, init_random_logloss)
- assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \
+ assert abs(logloss-default_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
"and with default initial coefficients {1} but is" \
" not.".format(logloss, default_init_logloss)
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
index 8a47d3773279..8c29822f0b87 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_constraints_binomial():
+def test_equality_constraints_only_binomial():
'''
This test checks and make sure the equality constraints work with binomial family. Coefficients are initialized
with glm coefficients built without contraints, default coefficients and random coefficients.
@@ -124,11 +124,12 @@ def test_constraints_binomial():
"".format(default_init_logloss, h2o_glm_default_init._model_json["output"]["model_summary"].cell_values[0][6]))
print(glm.getConstraintsInfo(h2o_glm_default_init))
- assert init_random_logloss >= logloss, "Random initialization logloss with constraints should be worst than GLM " \
- "without constraints but is not."
+ assert abs(init_random_logloss - logloss) < 1e-6, \
+ "Random initialization logloss {0} with constraints should be similar to that of GLM without constraints {1} but" \
+ " is not.".format(init_random_logloss, logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_constraints_binomial)
+ pyunit_utils.standalone_test(test_equality_constraints_only_binomial)
else:
- test_constraints_binomial()
+ test_equality_constraints_only_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
index acf74d648dea..dd884aaea48b 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_constraints_binomial():
+def test_equality_linear_constraints_binomial():
'''
This test checks and make sure the equality constraints work with binomial family. Coefficients are initialized
with glm coefficients built without constraints, default coefficients and random coefficients. Note in this case,
@@ -156,18 +156,18 @@ def test_constraints_binomial():
" taken to build the model: {1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init)))
print(glm.getConstraintsInfo(h2o_glm_default_init))
- assert abs(logloss-init_logloss)<2e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
+ assert abs(logloss-init_logloss)<1e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \
"and initialized with optimal GLM {1} should equal but is not." \
"".format(logloss, init_logloss)
- assert logloss<=init_random_logloss, "logloss from optimal GLM {0} should be lower than GLM with constraints " \
+ assert abs(logloss-init_random_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
"and with random initial coefficients {1} but is" \
" not.".format(logloss, init_random_logloss)
- assert logloss<=default_init_logloss, "logloss from optimal GLM {0} should be less than GLM with constraints " \
+ assert abs(logloss-default_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to GLM with constraints " \
"and with default initial coefficients {1} but is" \
" not.".format(logloss, default_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_constraints_binomial)
+ pyunit_utils.standalone_test(test_equality_linear_constraints_binomial)
else:
- test_constraints_binomial()
+ test_equality_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py
index 97ddf83bbe29..732bdda9d8e5 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_light_tight_linear_constraints_binomial():
'''
Test constrained GLM with beta, equality and less than and equal to constraints. The constraints are not very
tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -207,6 +207,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_light_tight_linear_constraints_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_light_tight_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py
index 96146d98234a..8e50603e9f15 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_light_tight_linear_constraints_only_binomial():
'''
Test constrained GLM with equality and less than and equal to constraints. The constraints are not very
tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -173,7 +173,7 @@ def test_light_tight_linear_constraints_only_gaussian():
"{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init)))
print(glm.getConstraintsInfo(h2o_glm_random_init))
- assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
+ assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \
" constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss)
assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
@@ -185,6 +185,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_light_tight_linear_constraints_only_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py
index 103086f4ca6f..d59c80fd99a5 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_light_tight_linear_constraints_only_binomial():
'''
Test constrained GLM with less than and equal to constraints. The constraints are not very
tight. However, coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -189,7 +189,7 @@ def test_light_tight_linear_constraints_only_gaussian():
print(glm.getConstraintsInfo(h2o_glm_random_init))
print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init)))
- assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
+ assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \
" constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss)
assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
@@ -201,6 +201,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_light_tight_linear_constraints_only_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py
index 3a75c301cc65..08778437db07 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_constraints_binomial():
+def test_loose_beta_linear_constraints_binomial():
'''
check and make sure coefficients close to GLM built without constraints are generated with loose constraints
that are satisfied with coefficients from GLM without constraints. Only beta and less than and equal to
@@ -157,6 +157,6 @@ def test_constraints_binomial():
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_constraints_binomial)
+ pyunit_utils.standalone_test(test_loose_beta_linear_constraints_binomial)
else:
- test_constraints_binomial()
+ test_loose_beta_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py
index af40e6503b38..f9b0d17f976d 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_constraints_binomial():
+def test_loose_linear_constraints_binomial():
'''
check and make sure coefficients close to GLM built without constraints are generated with loose constraints
that are satisfied with coefficients from GLM without constraints. Only less than and equal to
@@ -135,6 +135,6 @@ def test_constraints_binomial():
" but is not.".format(logloss, default_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_constraints_binomial)
+ pyunit_utils.standalone_test(test_loose_linear_constraints_binomial)
else:
- test_constraints_binomial()
+ test_loose_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py
index 56a9c052625b..81fd336e2603 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_redundant_constraints.py
@@ -198,7 +198,7 @@ def test_redundant_constraints():
except Exception as ex:
print(ex)
temp = str(ex)
- assert ("redundant and possibly conflicting linear constraints" in temp), "Wrong exception was received."
+ assert ("redundant linear constraints:" in temp), "Wrong exception was received."
print("redundant constraint test passed!")
if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py
index d7befd7da108..f40887389093 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_tight_beta_linear_constraints_binomial():
'''
Test constrained GLM with beta, equality and less than and equal to constraints. The constraints are very
tight and coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -174,42 +174,6 @@ def test_light_tight_linear_constraints_only_gaussian():
types = "Equal"
contraint_numbers = 5
tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "C19"
- values = 0.5
- types = "Equal"
- contraint_numbers = 4
- tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "C10.1"
- values = -0.3
- types = "Equal"
- contraint_numbers = 4
- tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "constant"
- values = -0.5
- types = "Equal"
- contraint_numbers = 4
- tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "C18"
- values = 0.75
- types = "Equal"
- contraint_numbers = 5
- tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "C20"
- values = -0.13
- types = "Equal"
- contraint_numbers = 5
- tight_constraints.append([name, values, types, contraint_numbers])
-
- name = "constant"
- values = -3
- types = "Equal"
- contraint_numbers = 5
- tight_constraints.append([name, values, types, contraint_numbers])
linear_constraints2 = h2o.H2OFrame(tight_constraints)
linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
@@ -317,6 +281,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_tight_beta_linear_constraints_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_tight_beta_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py
index 1f4888b195a3..94ac1155c494 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_tight_equality_linear_constraints_binomial():
'''
Test constrained GLM with equality and less than and equal to constraints. The constraints are very
tight and coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -225,7 +225,7 @@ def test_light_tight_linear_constraints_only_gaussian():
print(glm.getConstraintsInfo(h2o_glm_random_init))
print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init)))
- assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
+ assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \
" constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss)
assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
@@ -237,6 +237,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_tight_equality_linear_constraints_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_tight_equality_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py
index d8668bad9776..cc5b5385c8d3 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py
@@ -3,7 +3,7 @@
from tests import pyunit_utils
from tests.pyunit_utils import utils_for_glm_tests
-def test_light_tight_linear_constraints_only_gaussian():
+def test_tight_linear_constraints_binomial():
'''
Test constrained GLM with less than and equal to constraints. The constraints are very
tight and coefficients from GLM built without constraints won't be able to satisfied the constraints.
@@ -189,7 +189,7 @@ def test_light_tight_linear_constraints_only_gaussian():
print(glm.getConstraintsInfo(h2o_glm_random_init))
print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init)))
- assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
+ assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \
" constraints and initialized with optimal GLM {1} but is not.".format(logloss, optimal_init_logloss)
assert logloss <= default_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \
@@ -201,6 +201,6 @@ def test_light_tight_linear_constraints_only_gaussian():
"not.".format(logloss, random_init_logloss)
if __name__ == "__main__":
- pyunit_utils.standalone_test(test_light_tight_linear_constraints_only_gaussian)
+ pyunit_utils.standalone_test(test_tight_linear_constraints_binomial)
else:
- test_light_tight_linear_constraints_only_gaussian()
+ test_tight_linear_constraints_binomial()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py b/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py
new file mode 100644
index 000000000000..86ecfb4236f0
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_gh_16203_constrained_glm_example.py
@@ -0,0 +1,206 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
+from tests import pyunit_utils
+
+def test_constrained_glm_example():
+ '''
+ Simple example to showcase how to call constrained GLM.
+ '''
+ #train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/binomial_20_cols_10KRows.csv")
+ train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
+ for ind in range(10):
+ train[ind] = train[ind].asfactor()
+ train["C21"] = train["C21"].asfactor()
+ response = "C21"
+ predictors = list(range(0,20))
+ # add beta constraints
+ bc = []
+ name = "C11"
+ lower_bound = -3.5
+ upper_bound = 0
+ bc.append([name, lower_bound, upper_bound])
+
+ name = "C18"
+ lower_bound = 6
+ upper_bound = 7
+ bc.append([name, lower_bound, upper_bound])
+
+ name = "C15"
+ lower_bound = -9
+ upper_bound = -6
+ bc.append([name, lower_bound, upper_bound])
+
+ name = "C16"
+ lower_bound = -20
+ upper_bound = -10
+ bc.append([name, lower_bound, upper_bound])
+
+ beta_constraints = h2o.H2OFrame(bc)
+ beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"])
+
+ tight_constraints = [] # this constraint is satisfied by default coefficient initialization
+
+ # add tight constraints
+ name = "C1.1"
+ values = 0.5
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C2.1"
+ values = -0.25
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = -1
+ types = "LessThanEqual"
+ contraint_numbers = 0
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C4.1"
+ values = 1.5
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C17"
+ values = 3
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C15"
+ values = -2
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = -5
+ types = "LessThanEqual"
+ contraint_numbers = 1
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C12"
+ values = -0.5
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C13"
+ values = -1.5
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C14"
+ values = 2
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = -3
+ types = "LessThanEqual"
+ contraint_numbers = 2
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C11"
+ values = 0.25
+ types = "LessThanEqual"
+ contraint_numbers = 3
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C18"
+ values = -0.5
+ types = "LessThanEqual"
+ contraint_numbers = 3
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C19"
+ values = 0.75
+ types = "LessThanEqual"
+ contraint_numbers = 3
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = 5
+ types = "LessThanEqual"
+ contraint_numbers = 3
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C19"
+ values = 0.5
+ types = "Equal"
+ contraint_numbers = 4
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C10.1"
+ values = -0.3
+ types = "Equal"
+ contraint_numbers = 4
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = -0.25
+ types = "Equal"
+ contraint_numbers = 4
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C18"
+ values = 0.75
+ types = "Equal"
+ contraint_numbers = 5
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "C20"
+ values = -0.13
+ types = "Equal"
+ contraint_numbers = 5
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ name = "constant"
+ values = -1.5
+ types = "Equal"
+ contraint_numbers = 5
+ tight_constraints.append([name, values, types, contraint_numbers])
+
+ linear_constraints2 = h2o.H2OFrame(tight_constraints)
+ linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
+
+ random_coef = [0.9740393731418461, 0.9021970400494406, 0.8337282995102272, 0.20588758679724872, 0.12522385214612453,
+ 0.6390730524643073, 0.7055779213989253, 0.9004255614099713, 0.4075431157767999, 0.161093231584713,
+ 0.15250197544465616, 0.7172682822215489, 0.60836236371404, 0.07086628306822396, 0.263719138602719,
+ 0.16102036359390437, 0.0065987448849305075, 0.5881312311814277, 0.7836567678399617, 0.9104401158881326,
+ 0.8432891635016235, 0.033440093086177236, 0.8514611306363931, 0.2855332934628241, 0.36525972112514427,
+ 0.7526593301495519, 0.9963694184200753, 0.5614168317678196, 0.7950126291921057, 0.6212978800904426,
+ 0.176936615687169, 0.8817788599562331, 0.13699370230879637, 0.5754950980437555, 0.1507294463182668,
+ 0.23409699287029495, 0.6949148063429461, 0.47140569181488556, 0.1470896240551064, 0.8475557222612405,
+ 0.05957485472498203, 0.07490903723892406, 0.8412381196460251, 0.26874846387453943, 0.13669341206289243,
+ 0.8525684329438777, 0.46716360402752777, 0.8522055745422484, 0.3129394551398561, 0.908966336417204,
+ 0.26259461196353984, 0.07245314277889847, 0.41429401839807156, 0.22772860293274222, 0.26662443208488784,
+ 0.9875655504027848, 0.5832266083052889, 0.24205847206862052, 0.9843760682096272, 0.16269008279311103,
+ 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289,
+ 0.224690851359456, 0.5809304720756304, 0.36863807988348585]
+ params = {"family":"binomial", "lambda_":0.0, "seed":12345, "remove_collinear_columns":True, "solver":"IRLSM",
+ "linear_constraints":linear_constraints2, "beta_constraints":beta_constraints}
+
+ # build constrained GLM with default coefficient initialization, all coefficients zero except intercept
+ constrained_glm_default_init = glm(**params)
+ constrained_glm_default_init.train(x=predictors, y=response, training_frame=train)
+ logloss_default = constrained_glm_default_init.model_performance()._metric_json['logloss']
+ # build constrained GLM model with random coefficient initialization
+ params["startval"] = random_coef
+ constrained_glm_random_init = glm(**params)
+ constrained_glm_random_init.train(x=predictors, y=response, training_frame=train)
+ logloss = constrained_glm_random_init.model_performance()._metric_json['logloss']
+
+ assert logloss_default >= logloss or abs(logloss_default-logloss) < 1e-2
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(test_constrained_glm_example)
+else:
+ test_constrained_glm_example()
diff --git a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py
index 7540abf1dc5c..186062fba82f 100644
--- a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py
+++ b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py
@@ -8,7 +8,8 @@
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Megan Kurka found that categorical columns do not work with modelselection backward mode. I fixed the bug and
-# extended her test to check that each time a predictor is dropped, it must has the smallest z-value magnitude.
+# extended her test to check that each time a predictor is dropped, the best performing level is compared to other
+# predictors. If the best level is not good enough, the whole enum predictor is dropped.
def test_megan_failure():
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv")
y = "y"
@@ -28,7 +29,6 @@ def test_megan_failure():
best_predictor_subset = backward_model.get_best_model_predictors()
counter = 0
- back_coef = backward_model.coef()
for ind in list(range(num_models-1, 0, -1)):
pred_large = coefficient_orders[ind]
pred_small = coefficient_orders[ind-1]
@@ -40,11 +40,11 @@ def test_megan_failure():
# assert z-values removed has smallest magnitude
x = best_predictor_subset[ind]
- assert_smallest_z_removed(back_coef[ind], z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)
+ assert_correct_z_removed(z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)
counter += 1
-def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
+def assert_correct_z_removed(z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
glm_model = H2OGeneralizedLinearEstimator(seed=1234, remove_collinear_columns=True, lambda_=0.0, compute_p_values=True)
glm_model.train(x=x, y=y, training_frame=df)
cat_predictors = extractCatCols(df, x)
@@ -53,11 +53,21 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
model_z_values = glm_model._model_json["output"]["coefficients_table"]["z_value"]
model_coeffs = glm_model._model_json["output"]["coefficients_table"]["names"]
- assert_equal_z_values(back_coef, glm_model.coef(), z_values_backward, coeff_backward, model_z_values, model_coeffs)
- min_z_value = min(z_values_removed)
+ assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, model_coeffs)
+
+ num_predictor_removed = False
+ for one_value in predictor_removed:
+ if one_value in num_predictors:
+ num_predictor_removed = True
+ break
+ if num_predictor_removed:
+ min_z_value = min(z_values_removed)
+ else:
+ min_z_value = max(z_values_removed)
+
# check that predictor with smallest z-value magnitude is removed
- assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
- assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)
+ assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
+ assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)
for name in cat_predictors:
for coeff_name in predictor_removed:
@@ -66,7 +76,7 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
return
x.remove(predictor_removed[0]) # numerical predictor is removed
-def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
+def assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
for name in cat_predictors:
model_z = []
for coeff_name in model_coeffs:
@@ -80,7 +90,7 @@ def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeff
"than mininum_z_values {2}".format(name, model_z, min_z_value)
-def assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
+def assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
for name in num_predictors:
pred_ind = model_coeffs.index(name)
val = model_z_values[pred_ind]
@@ -96,7 +106,7 @@ def extractCatCols(df, x):
cat_pred.append(name)
return cat_pred
-def assert_equal_z_values(back_coef, curr_coef, z_values_backward, coeff_backward, model_z_values, glm_coeff):
+def assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, glm_coeff):
for coeff in glm_coeff:
backward_z_value = z_values_backward[coeff_backward.index(coeff)]
model_z_value = model_z_values[glm_coeff.index(coeff)]
diff --git a/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py b/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py
index 3d030e349a1e..de79c21c063d 100644
--- a/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py
+++ b/h2o-py/tests/testdir_algos/word2vec/pyunit_text8_word2vec_large.py
@@ -14,11 +14,16 @@ def word2vec():
w2v_model = H2OWord2vecEstimator(epochs=1, word_model=word_model)
w2v_model.train(training_frame=train)
- synonyms = w2v_model.find_synonyms("horse", 3)
+ cnt = 10
+ synonyms = w2v_model.find_synonyms("horse", cnt)
print(synonyms)
-
- assert len(synonyms) == 3, "there should be three synonmys"
-
+ assert len(synonyms) == cnt, "There should be ten synonyms."
+
+ # GH-16192 find_synonyms returns an empty dataset if there are no synonyms to find
+ synonyms = w2v_model.find_synonyms("hhorse", cnt)
+ print(synonyms)
+ assert len(synonyms) == 0, "There should be zero synonyms."
+
if __name__ == "__main__":
pyunit_utils.standalone_test(word2vec)
diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
index b9f101dda2a7..b284f6e53f39 100644
--- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
+++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
@@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame():
assert dupl4.columns == ["n1", "s1"]
-def H2OFrame_skipped_columns_is_BUGGY():
- try:
- h2o.H2OFrame(data, skipped_columns=[1])
- assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947
- except ValueError as e:
- assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
+def H2OFrame_skipped_columns_BUG_fixed():
+ f1 = h2o.H2OFrame(data, skipped_columns=[1])  # frame built with skipped_columns=[1]
+ f2 = h2o.H2OFrame(data)  # reference frame built from the same data with nothing skipped
+ assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual column numbers: {1}".format((f2.ncol-1), f1.ncol)  # expected value first, then the actual one
pu.run_tests([
@@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY():
H2OFrame_from_pandas,
H2OFrame_from_scipy,
H2OFrame_from_H2OFrame,
- H2OFrame_skipped_columns_is_BUGGY
+ H2OFrame_skipped_columns_BUG_fixed
])
diff --git a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py
index ae4c5e76eeba..ecd499ee01a7 100644
--- a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py
+++ b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2oget_timezone_DEPRECATED.py
@@ -19,7 +19,7 @@ def h2oget_timezone():
timezones = h2o.list_timezones()
assert_is_type(timezones, H2OFrame)
- assert timezones.nrow == 467, "h2o.get_timezone() returns frame with wrong row number."
+ assert timezones.nrow == 459, "h2o.get_timezone() returns frame with wrong row number."
assert timezones.ncol == 1, "h2o.get_timezone() returns frame with wrong column number."
if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py
index c6ae86359813..d5330ff20cd3 100644
--- a/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py
+++ b/h2o-py/tests/testdir_apis/H2O_Module/pyunit_h2olist_timezones_DEPRECATED.py
@@ -14,7 +14,7 @@ def h2olist_timezones():
timezones = h2o.list_timezones()
assert_is_type(timezones, H2OFrame)
- assert timezones.nrow == 467, "h2o.get_timezone() returns frame with wrong row number."
+ assert timezones.nrow == 459, "h2o.get_timezone() returns frame with wrong row number."
assert timezones.ncol == 1, "h2o.get_timezone() returns frame with wrong column number."
diff --git a/h2o-py/tests/testdir_misc/pyunit_cluster.py b/h2o-py/tests/testdir_misc/pyunit_cluster.py
index 2424170ef1d6..9c367484a27f 100644
--- a/h2o-py/tests/testdir_misc/pyunit_cluster.py
+++ b/h2o-py/tests/testdir_misc/pyunit_cluster.py
@@ -11,9 +11,9 @@ def test_cluster_status():
def test_cluster_properties():
cl = h2o.cluster()
- assert len(cl._schema_attrs_) == 24
+ assert len(cl._schema_attrs_) == 25
for k in cl._schema_attrs_.keys():
- assert getattr(cl, k) is not None
+ assert getattr(cl, k) is not None or k == "web_ip"
def test_exception_on_unknown_cluster_property():
diff --git a/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py b/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py
new file mode 100644
index 000000000000..1a8c8fb306e7
--- /dev/null
+++ b/h2o-py/tests/testdir_misc/pyunit_export_parquet_npe.py
@@ -0,0 +1,23 @@
+import sys
+import tempfile
+
+sys.path.insert(1, "../../../")
+import h2o
+from tests import pyunit_utils
+
+
+def test_export_file_npe_gh_16161():
+ with tempfile.TemporaryDirectory() as export_dir:  # temporary directory used as the export target
+ frame = h2o.create_frame(rows=100, cols=10, string_fraction=0.1, seed=5, seed_for_column_types=25)
+ h2o.export_file(frame, path=export_dir, format="parquet", write_checksum=False)
+ reloaded = h2o.import_file(export_dir)  # read the exported files back for a round-trip comparison
+ assert pyunit_utils.compare_frames(frame, reloaded, tol_numeric=1e-10, numElements=0)
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(test_export_file_npe_gh_16161)
+else:
+ test_export_file_npe_gh_16161()
+
+
+
diff --git a/h2o-py/tests/testdir_misc/pyunit_export_zstd.py b/h2o-py/tests/testdir_misc/pyunit_export_zstd.py
new file mode 100644
index 000000000000..e3704c2cb11a
--- /dev/null
+++ b/h2o-py/tests/testdir_misc/pyunit_export_zstd.py
@@ -0,0 +1,37 @@
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from os import path
+import struct
+
+'''
+Export file with h2o.export_file compressed with 'zstd'
+'''
+
+
+def is_zstd_file(path):
+ with open(path, 'rb') as f:
+ magic_bytes = f.read(4)
+ return struct.unpack(' 0) {
- securityWarnings <- grep("SECURITY_WARNING", readLines(stdout), value=TRUE)
- }
- if (length(securityWarnings) > 0) {
- msg = paste(
- "Server process startup raise a security warning:",
- paste(securityWarnings, collapse = "\n"), sep = "\n")
- warning(msg)
- }
} else
stop("Can only start H2O launcher if IP address is localhost.")
}
diff --git a/h2o-r/h2o-package/R/explain.R b/h2o-r/h2o-package/R/explain.R
index 2243388bbdb7..67220f3a323e 100644
--- a/h2o-r/h2o-package/R/explain.R
+++ b/h2o-r/h2o-package/R/explain.R
@@ -242,10 +242,10 @@ case_insensitive_match_arg <- function(arg, choices) {
.self
},
get_model = function(model_id) {
- model <- memoised_models$get_model(model_id)
- if (!is.null(model@allparameters$treatment_column))
+ m <- memoised_models$get_model(model_id)
+ if (!is.null(m@allparameters$treatment_column))
stop("Uplift models have not supported in explain yet.")
- return(model)
+ return(m)
}
)
)
diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R
index c3a2fcf74925..2e7b9f23ad29 100644
--- a/h2o-r/h2o-package/R/frame.R
+++ b/h2o-r/h2o-package/R/frame.R
@@ -4109,6 +4109,7 @@ use.package <- function(package,
#'
#' @param x An \code{R} object.
#' @param destination_frame A string with the desired name for the H2OFrame
+#' @param skipped_columns A list of integer column indices to be skipped and not parsed into the final frame
#' @param use_datatable allow usage of data.table
#' @param \dots arguments passed to method arguments.
#' @export
@@ -4135,15 +4136,19 @@ use.package <- function(package,
#' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m))
#' }
#' }
-as.h2o <- function(x, destination_frame="", ...) {
+as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) {
.key.validate(destination_frame)
- UseMethod("as.h2o")
+ if (is.null(skipped_columns)) {
+ UseMethod("as.h2o")
+ } else {
+ as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns, ...)  # forward extra arguments (e.g. use_datatable) instead of dropping them
+ }
}
#' @rdname as.h2o
#' @method as.h2o default
#' @export
-as.h2o.default <- function(x, destination_frame="", ...) {
+as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = ""))
@@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) {
data.frame(C1=x)
else
as.data.frame(x, ...)
- as.h2o.data.frame(x, destination_frame=destination_frame)
+ as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
}
#' @rdname as.h2o
#' @method as.h2o H2OFrame
#' @export
-as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
+as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy")
@@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
#' @seealso \code{\link{use.package}}
#' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}
#' @export
-as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) {
+as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame")
@@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt))
#if (verbose) pt <- proc.time()[[3]] # timings inside
h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types,
- col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)))
+ col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)),
+ skipped_columns=skipped_columns)
#if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt))
file.remove(tmpf)
h2f
@@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
#' To speedup execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages.
#' Turn on h2o datatable by options("h2o.use.data.table"=TRUE)
#' @export
-as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) {
+as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="") {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix")
diff --git a/h2o-r/h2o-package/R/parse.R b/h2o-r/h2o-package/R/parse.R
index d49b8db5564c..0ba9ed0afbdf 100755
--- a/h2o-r/h2o-package/R/parse.R
+++ b/h2o-r/h2o-package/R/parse.R
@@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA
else
col.names
if (!is.null(parseSetup$column_names) &&
- (length(parseSetup$column_names) != parsedColLength)) {
- stop("length of col.names must equal to the number of columns in dataset")
+ (length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns
+ if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength))
+ || is.null(skipped_columns)) # if no skipped column, this is an error. If skipped columns, check length
+ stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
}
# change column names to what the user specified
if (!is.null(skipped_columns)) {
diff --git a/h2o-r/h2o-package/R/stackedensemble.R b/h2o-r/h2o-package/R/stackedensemble.R
index 6cc96bc9d000..ba11e0c82a9a 100644
--- a/h2o-r/h2o-package/R/stackedensemble.R
+++ b/h2o-r/h2o-package/R/stackedensemble.R
@@ -59,8 +59,12 @@
#' h2o.init()
#'
#' # Import a sample binary outcome train/test set
-#' train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-#' test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+#' train <- h2o.importFile(
+#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv"
+#' )
+#' test <- h2o.importFile(
+#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv"
+#' )
#'
#' # Identify predictors and response
#' y <- "response"
diff --git a/h2o-r/h2o-package/R/w2vutils.R b/h2o-r/h2o-package/R/w2vutils.R
index d7d163e52801..c1e60d0334fe 100644
--- a/h2o-r/h2o-package/R/w2vutils.R
+++ b/h2o-r/h2o-package/R/w2vutils.R
@@ -27,7 +27,10 @@ h2o.findSynonyms <- function(word2vec, word, count = 20) {
res <- .h2o.__remoteSend(method="GET", "Word2VecSynonyms", model = word2vec@model_id,
word = word, count = count)
fr <- data.frame(synonym = res$synonyms, score = res$scores)
- fr[with(fr, order(score, decreasing = TRUE)),]
+ if (length(fr) > 0) {  # length() of a data.frame is its column count; 0 means there is no score column to order by
+ fr <- fr[with(fr, order(score, decreasing = TRUE)),]  # assign the result: indexing returns a sorted copy, it does not reorder fr in place
+ }
+ fr
}
#'
diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R
index 6bd0e25d851f..9fbcecca52a9 100755
--- a/h2o-r/scripts/h2o-r-test-setup.R
+++ b/h2o-r/scripts/h2o-r-test-setup.R
@@ -2,7 +2,6 @@
.origEchoValue <- getOption("echo")
options(echo=FALSE)
options(scipen=999)
-options(stringsAsFactors=T)
#'
#'
diff --git a/h2o-r/tests/runitUtils/shared_javapredict.R b/h2o-r/tests/runitUtils/shared_javapredict.R
index 661cbbef6bc8..54ba6a17cc72 100644
--- a/h2o-r/tests/runitUtils/shared_javapredict.R
+++ b/h2o-r/tests/runitUtils/shared_javapredict.R
@@ -68,7 +68,7 @@ doJavapredictTest <- function(model,test_file,test_frame,params, separator=",",
safeSystem(cmd)
print("Comparing predictions between H2O and Java POJO")
- prediction2 <- read.csv(sprintf("%s/out_pojo.csv", tmpdir_name), header=T)
+ prediction2 <- read.csv(sprintf("%s/out_pojo.csv", tmpdir_name), header=T, stringsAsFactors=TRUE)
if (nrow(prediction1) != nrow(prediction2)) {
warning("Prediction mismatch")
print(paste("Rows from H2O", nrow(prediction1)))
diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R
index 5ce5f56ebada..dfc606f794d3 100644
--- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R
+++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance.R
@@ -15,11 +15,12 @@ test.CoxPH.concordance <- function() {
rModel <- coxph(Surv(time, status) ~ age + sex + meal.cal + age:meal.cal, data = tstdata, ties = "efron")
rPredictor <- rModel$linear.predictors
+
hexModel <- h2o.coxph(x = c("age", "sex", "meal.cal"), interaction_pairs = list(c("age", "meal.cal")),
event_column = "status", stop_column = "time", ties = "efron", training_frame = tstdataHex)
hexPredictor <- pred(hexModel, tstdataHex)
-
- expect_equal(rPredictor, hexPredictor, scale = 1, tolerance = 1e-3)
+
+ expect_equal(rPredictor - mean(rPredictor), hexPredictor, scale = 1, tolerance = 1e-3)
rConcordance <- unname(summary(rModel)$concordance)[1]
hexConcordance <- h2o.performance(hexModel, data=tstdataHex)@metrics$concordance
diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R
index ce2f0fcb5918..cce60312a173 100644
--- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R
+++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_concordance_heart.R
@@ -26,7 +26,7 @@ test.CoxPH.predict <- function() {
check.pred <- function(r.model, hex.model, r.tstdata, hex.tstdata) {
fit.pred <- pred.r(r.model, r.tstdata)
hex.lp <- pred.h2o(hex.model, hex.tstdata)
- expect_equal(fit.pred, hex.lp, tolerance = 1e-5, scale = 1)
+ expect_equal(fit.pred - mean(fit.pred), hex.lp, tolerance = 1e-5, scale = 1)
}
check.concordance <- function (rModel, hexModel, data, tolerance = 1e-3) {
diff --git a/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R b/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R
index ced349a5ef72..64edb3092f6d 100644
--- a/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R
+++ b/h2o-r/tests/testdir_algos/coxph/runit_coxph_predict.R
@@ -16,7 +16,10 @@ pred.h2o <- function(model, data) {
compare.results <- function(fit, hex.fit, tstdata, tstdata.hex) {
fit.pred <- pred.r(fit, tstdata)
hex.lp <- pred.h2o(hex.fit, tstdata.hex)
- expect_equal(fit.pred, hex.lp, tolerance = 1e-7, scale = 1)
+ w <- tstdata$weights
+ if (is.null(w))
+ w <- rep_len(1, length(fit.pred))
+ expect_equal(fit.pred - weighted.mean(fit.pred, w, na.rm=TRUE), hex.lp, tolerance = 1e-7, scale = 1)
}
cancer.with.sex <- function () {
diff --git a/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R b/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R
index cbd36d6f3259..0982925868c3 100644
--- a/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R
+++ b/h2o-r/tests/testdir_algos/coxph/runit_pubdev_8945_coxph_all_interactions_mojo.R
@@ -10,7 +10,6 @@ test.CoxPH.mojo_interactions_impl <- function(stratify_by = NULL) {
interaction_pairs = list(c("C1", "C3"), c("C1", "C2"), c("C3", "C4"), c("C4", "C2"), c("C1", "age"), c("surgery", "C3")),
training_frame = training_frame)
- browser()
predict_h2o <- h2o.predict(coxph_h2o, training_frame)
print(predict_h2o)
@@ -21,7 +20,6 @@ test.CoxPH.mojo_interactions_impl <- function(stratify_by = NULL) {
predict_mojo <- h2o.predict(coxph_mojo, training_frame)
print(predict_mojo)
- browser()
expect_equal(as.data.frame(predict_h2o), as.data.frame(predict_mojo))
}
diff --git a/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R b/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R
index b34586373496..accf63f09824 100644
--- a/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R
+++ b/h2o-r/tests/testdir_algos/gbm/runit_GBM_ecology.R
@@ -40,7 +40,7 @@ test.GBM.ecology <- function() {
print(ecology.sum)
#import csv data for R to use
- ecology.data <- read.csv(locate("smalldata/gbm_test/ecology_model.csv"), header = TRUE)
+ ecology.data <- read.csv(locate("smalldata/gbm_test/ecology_model.csv"), header = TRUE, stringsAsFactors=TRUE)
ecology.data <- na.omit(ecology.data) #this omits NAs... does GBM do this? Perhaps better to model w/o doing this?
Log.info("H2O GBM with parameters:\nntrees = 100, max_depth = 5, min_rows = 10, learn_rate = 0.1\n")
diff --git a/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R b/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R
index 29077cae8f5a..14101d3b5c3a 100644
--- a/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R
+++ b/h2o-r/tests/testdir_algos/gbm/runit_GBM_weight_gamma.R
@@ -16,16 +16,15 @@ test <- function() {
#htable= as.h2o(table.1.2,destination_frame = "htable")
hh = h2o.gbm(x = 1:3,y = "medskad",training_frame = htable,distribution = "gamma",weights_column = "antskad",
ntrees = 20,max_depth = 1,min_rows = 1,learn_rate = 1)
- ph = as.vector(as.data.frame(h2o.predict(hh,newdata = htable)))
-
+ ph = as.data.frame(h2o.predict(hh,newdata = htable))$predict
#expect_equal(gg$initF,hh@model$init_f,tolerance = 1e-6)
- #expect_equal(min(pr),min(ph[,1]),tolerance = 1e-6)
- #expect_equal(max(pr),max(ph[,1]),tolerance = 1e-6)
- #expect_equal(mean(pr),mean(ph[,1]),tolerance = 1e-6)
+ #expect_equal(min(pr),min(ph),tolerance = 1e-6)
+ #expect_equal(max(pr),max(ph),tolerance = 1e-6)
+ #expect_equal(mean(pr),mean(ph),tolerance = 1e-6)
expect_equal(8.804447,hh@model$init_f,tolerance = 1e-6)
- expect_equal(3751.01,min(ph[,1]),tolerance = 1e-4)
- expect_equal(15291,max(ph[,1]),tolerance = 1e-4)
- expect_equal(8119,mean(ph[,1]),tolerance = 1e-4)
+ expect_equal(3751.01,min(ph),tolerance = 1e-4)
+ expect_equal(15291,max(ph),tolerance = 1e-4)
+ expect_equal(8119,mean(ph),tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R b/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R
index d41fd01ded8c..59d0ca896cbe 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_GH_6722_redundant_constraints.R
@@ -24,7 +24,7 @@ test_constraints_redundant <- function() {
}, error = function(e) {
print("***")
print(e)
- expect_true(grepl("redundant and possibly conflicting linear constraints:", e))
+ expect_true(grepl("redundant linear constraints:", e))
})
}
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R
index ecfe7314453e..ad4ec4b0cf32 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_libR_airlines.R
@@ -13,6 +13,8 @@ test.LiblineaR.airlines <- function() {
Log.info("epsilon = 1E-4: Tolerance of termination criterion\n")
Log.info(" cross = 0: No kfold cross-validation\n")
+ dimnames(test) <- dimnames(train)
+
LibR.m <- LiblineaR(train, trainLabels,type=0, epsilon=1E-4, cost=100)
LibRpreds <- predict(LibR.m, test, proba=1, decisionValues=TRUE)
LibRCM <- table(testLabels, LibRpreds$predictions)
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R
index a40c585049ac..32d877981868 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_offset.R
@@ -26,7 +26,7 @@ test <- function() {
hh = h2o.glm(x = 2:31,y = 1,training_frame = frm,family = "binomial",offset_column = "off",lambda = 0)
gr = glm(formula = y~X1+X2 + X3 +X4 +X5+X6+X7+X8+X9+X10+ X11+X12+X13+X14+X15+X16+X17+X18+X19+ X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30,
family = "binomial",data = rfm,offset= rfm[,32])
- gg = glmnet(x = as.matrix(rfm[,-c(1,32)]),y = as.factor(rfm[,1]),family = "binomial",lambda =0,offse = rfm[,32])
+ gg = glmnet(x = as.matrix(rfm[,-c(1,32)]),y = as.factor(rfm[,1]),family = "binomial",lambda =0, offset = rfm[,32])
print("compare results")
expect_equal(gr$null.deviance, hh@model$training_metrics@metrics$null_deviance)
expect_equal(gr$aic, hh@model$training_metrics@metrics$AIC,tolerance = 0.00001)
@@ -34,7 +34,7 @@ test <- function() {
expect_equal(gr$df.residual,hh@model$training_metrics@metrics$residual_degrees_of_freedom)
#predictions
ph = h2o.predict(object = hh,newdata = val)
- pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]),offset = as.matrix(valid[,32]),type = "response")
+ pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]), newoffset = as.matrix(valid[,32]), offset = as.matrix(valid[,32]),type = "response")
print("compare predictions")
expect_equal(min(pr),min(ph$p1),tolerance = 0.0001)
expect_equal(max(pr),max(ph$p1),tolerance = 0.0001)
@@ -49,7 +49,7 @@ test <- function() {
expect_equal(deviance(gg),hh@model$training_metrics@metrics$residual_deviance,tolerance = 0.00001)
#predictions
ph = h2o.predict(object = hh,newdata = val)
- pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]),offset = as.matrix(valid[,32]),type = "response")
+ pr = predict(object = gg,newx = as.matrix(valid[,-c(1,32)]), newoffset = as.matrix(valid[,32]), offset = as.matrix(valid[,32]),type = "response")
print("compare predictions")
expect_equal(min(pr),min(ph$p1),tolerance = 0.0001)
expect_equal(max(pr),max(ph$p1),tolerance = 0.0001)
diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R
index 7c5d9517f050..a01cad170895 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6037_fractionalbinomial_mojo.R
@@ -9,14 +9,13 @@ test.fractionalbinomial <-
# Run the test
#----------------------------------------------------------------------
- browser()
params_prob_data <- setParmsData() # generate model parameters, random dataset
modelAndDir<-buildModelSaveMojoGLM(params_prob_data$params) # build the model and save mojo
filename = sprintf("%s/in.csv", modelAndDir$dirName) # save the test dataset into a in.csv file.
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R
index b851c081610c..24cdef51e822 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8638_bodyfat_RID_Binomial_compareR_test.R
@@ -9,7 +9,6 @@ source("../../../scripts/h2o-r-test-setup.R")
test_RID_binomial_compareR <- function() {
fat <- h2o.importFile(locate("smalldata/glm_test/bodyfat.csv"))
bodyfat <- as.data.frame(fat)
- browser()
rGlmBinomial <- glm(bmi ~ neck+density+hip, data=bodyfat, family=binomial())
dfbetasGlmB <- dfbetas(rGlmBinomial)
hGlmBinomial <- h2o.glm(x=c("neck", "density", "hip"), y="bmi", lambda=0, family="binomial", standardize=FALSE, influence="dfbetas", training_frame=fat)
diff --git a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R
index 914c524d5c61..a92b1acfe66f 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R
@@ -3,8 +3,8 @@ source("../../../scripts/h2o-r-test-setup.R")
# add test from Erin Ledell
glmBetaConstraints <- function() {
- df <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
- test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+ df <- h2o.importFile(locate("smalldata/higgs/higgs_train_10k.csv"))
+ test <- h2o.importFile(locate("smalldata/higgs/higgs_test_5k.csv"))
y <- "response"
x <- setdiff(names(df), y)
diff --git a/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R b/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R
index 158d91fb0d2c..f6aba8b57eec 100644
--- a/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R
+++ b/h2o-r/tests/testdir_algos/glrm/runit_glrm_PUBDEV_3788_loss_by_col_err.R
@@ -7,12 +7,11 @@ test.glrm.pubdev.3788 <- function() {
# Create data frame with a constant column
data <- data.frame('NumericCol' = runif(50),
'ConstantCol' = rep(1, 50),
- 'CategoricalCol' = sample(c("A", "B", "C", "D"), size = 50, replace = T))
+ 'CategoricalCol' = sample(c("A", "B", "C", "D"), size = 50, replace = T),
+ stringsAsFactors = TRUE)
data <- as.h2o(data)
- browser()
-
# Specify loss by column and set ignore_const_cols to TRUE
glrm_model <- h2o.glrm(data, k = 2, model_id = "glrm_test.hex",
loss_by_col = c("Quadratic", "Categorical", "Categorical"),
diff --git a/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R b/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R
index de11ada0ccea..dceff0de9ca1 100644
--- a/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R
+++ b/h2o-r/tests/testdir_algos/isofor/runit_isofor_accuracy.R
@@ -4,15 +4,15 @@ source("../../../scripts/h2o-r-test-setup.R")
test.IsolationForest.accuracy <- function() {
- set.seed(1234)
+ set.seed(12345)
N = 1e6
random_data <- data.frame(
x = c(rnorm(N, 0, 0.5), rnorm(N*0.05, -2, 1)),
y = c(rnorm(N, 0, 0.5), rnorm(N*0.05, 2, 1)),
- outlier = c(rep("NO", N), rep("YES", (0.05*N)))
+ outlier = c(rep("NO", N), rep("YES", (0.05*N))),
+ stringsAsFactors = TRUE
)
random_data.hex <- as.h2o(random_data)
-
# different approach than in the paper - build a smaller number of deeper trees trained on a much larger sample
h2o_isolation_forest <- h2o.isolationForest(x = c("x", "y"), training_frame = random_data.hex[, c("x", "y")],
ntrees = 25, seed = 1234, sample_rate = 0.7, min_rows = 1000, max_depth = 16)
diff --git a/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R b/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R
index b480a1d3551f..ae25a0f41b9e 100644
--- a/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R
+++ b/h2o-r/tests/testdir_algos/modelselection/runit_PUBDEV_8235_modelselection_gaussian_validation.R
@@ -6,7 +6,7 @@ testModelSelectionV <- function() {
bhexFV2 <- h2o.uploadFile(locate("smalldata/logreg/prostate.csv"))
Y <- "GLEASON"
X <- c("AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS")
- browser()
+
Log.info("Build the MaxRGLM model")
allsubsetsModel <- h2o.modelSelection(y=Y, x=X, seed=12345, training_frame = bhexFV, max_predictor_number=2,
mode="allsubsets")
diff --git a/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R b/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R
index 893f1ccc31a5..856cb38eeb63 100644
--- a/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R
+++ b/h2o-r/tests/testdir_algos/pca/runit_OLD_1079_airlines.R
@@ -3,7 +3,6 @@ source("../../../scripts/h2o-r-test-setup.R")
# Make sure we can run with airline data
test.pca.airline<- function() {
- browser()
dimD = 234
pp = h2o.uploadFile(locate("smalldata/airlines/AirlinesTest.csv.zip"))
aa = h2o.prcomp(pp, k=dimD, transform="STANDARDIZE")
diff --git a/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R b/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R
index 41f6e6ca6cca..b151260dba8a 100644
--- a/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R
+++ b/h2o-r/tests/testdir_algos/pca/runit_pubdev_3502_pca_hangs_large_NOPASS.R
@@ -9,7 +9,6 @@ test.pca.la1s <- function() {
run_time_c <- c()
num_run <- 1
- browser()
dataR <- h2o.importFile(locate("bigdata/laptop/jira/la1s.wc.arff.txt.zip"), sep = ',', destination_frame = "data", header = T, parse = FALSE)
data <- h2o.parseRaw(dataR, destination_frame = "bigParse",
parse_type = "CSV", header = T) # chunk_size = 124022500 size will make one chunk.
diff --git a/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R b/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R
index e4774d4048d8..fac014a57764 100644
--- a/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R
+++ b/h2o-r/tests/testdir_algos/pca/runit_pubdev_6817_noK_PCA.R
@@ -6,7 +6,6 @@ test.pca.arrests <- function() {
Log.info("Importing USArrests.csv data...\n")
arrests.hex <- h2o.uploadFile(locate("smalldata/pca_test/USArrests.csv"))
arrests.pca.h2o <- h2o.prcomp(training_frame = arrests.hex, k = 1, seed=12345)
- browser()
pca_noK <- h2o.prcomp(training_frame = arrests.hex, seed=12345)
pred1 <- h2o.predict(arrests.pca.h2o, arrests.hex)
diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
index cf42dfd27aea..a60e44485568 100644
--- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
+++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
@@ -2,8 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
-library(randomForest)
-
test.DRF.bigcat <- function() {
# Training set has 100 categories from cat001 to cat100
# Categories cat001, cat003, ... are perfect predictors of y = 1
@@ -25,10 +23,10 @@ test.DRF.bigcat <- function() {
drfperf <- h2o.performance(drfmodel)
expect_equal(h2o.auc(drfperf), 1)
# No errors off the diagonal
- default_cm <- h2o.confusionMatrix(drfmodel,bigcat.hex)[[1]]
-# expect_equal(default_cm[1,2], 0)
-# expect_equal(default_cm[2,1], 0)
-
+ default_cm <- h2o.confusionMatrix(drfmodel,bigcat.hex)
+ print(default_cm)
+ expect_equal(default_cm[[1,2]], 0)
+ expect_equal(default_cm[[2,1]], 0)
}
doTest("DRF Test: Classification with 100 categorical level predictor", test.DRF.bigcat)
diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
index 395ffa564b51..5b8546921a99 100644
--- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
+++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
@@ -2,8 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
-library(randomForest)
-
test.DRF.smallcat <- function() {
# Training set has 26 categories from A to Z
# Categories A, C, E, G, ... are perfect predictors of y = 1
@@ -33,25 +31,9 @@ test.DRF.smallcat <- function() {
print(h2o.confusionMatrix(drfmodel,alphabet.hex))
expect_equal(h2o.auc(drfperf), 1)
# No errors off the diagonal
- default_cm <- h2o.confusionMatrix(drfmodel,alphabet.hex)[[1]]
- #iexpect_equal(default_cm[1,2], 0)
- #expect_equal(default_cm[2,1], 0)
-
- # Train R DRF Model:
- # Log.info("R DRF with same parameters:")
- # drfmodel.r <- randomForest(y ~ ., data = alphabet.data, ntree = 1, nodesize = 1)
- # drfmodel.r.pred <- predict(drfmodel.r, alphabet.data, type = "response")
-
- # Compute confusion matrices
- # Log.info("R Confusion Matrix:"); print(drfmodel.r$confusion)
- # Log.info("H2O (Group Split) Confusion Matrix:"); print(drfmodel.grpsplit@model$confusion)
-
- # Compute the AUC - need to convert factors back to numeric
- # actual <- ifelse(alphabet.data$y == "0", 0, 1)
- # pred <- ifelse(drfmodel.r.pred == "0", 0, 1)
- # R.auc = gbm.roc.area(actual, pred)
- # Log.info(paste("R AUC:", R.auc, "\tH2O (Group Split) AUC:", drfmodel.grpsplit@model$AUC))
-
+ default_cm <- h2o.confusionMatrix(drfmodel,alphabet.hex)
+ expect_equal(default_cm[1,2], 0)
+ expect_equal(default_cm[2,1], 0)
}
doTest("DRF Test: Classification with 26 categorical level predictor", test.DRF.smallcat)
diff --git a/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R b/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R
new file mode 100644
index 000000000000..85aa6810eb7b
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/word2vec/runit_word2vec_find_synonyms.R
@@ -0,0 +1,20 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+test.word2vec.findSynonyms <- function() {
+ job_titles <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv", col.names = c("category", "jobtitle"), col.types = c("String", "String"), header = TRUE)
+
+ words <- h2o.tokenize(job_titles, " ")
+ vec <- h2o.word2vec(training_frame = words)
+
+ cnt <- 10
+ syn <- h2o.findSynonyms(vec, "teacher", count = cnt)
+ expect_equal(length(syn$score), cnt)
+
+ # GH-16192: h2o.findSynonyms returns an empty dataset if there are no synonyms to find
+ syn2 <- h2o.findSynonyms(vec, "Tteacher", count = cnt)
+ expect_equal(length(syn2$score), 0)
+}
+
+doTest("Test findSynonyms function", test.word2vec.findSynonyms)
diff --git a/h2o-r/tests/testdir_golden/runit_pca_5_golden.R b/h2o-r/tests/testdir_golden/runit_pca_5_golden.R
index b312391c63b1..d0fa3e841aa3 100644
--- a/h2o-r/tests/testdir_golden/runit_pca_5_golden.R
+++ b/h2o-r/tests/testdir_golden/runit_pca_5_golden.R
@@ -5,7 +5,7 @@ source("../../scripts/h2o-r-test-setup.R")
test.poison.golden <- function() {
Log.info("Importing poison.csv data...")
- poisonR <- read.csv(locate("smalldata/pca_test/poison.csv"), header = TRUE)
+ poisonR <- read.csv(locate("smalldata/pca_test/poison.csv"), header = TRUE, stringsAsFactors = TRUE)
poisonH2O <- h2o.uploadFile(locate("smalldata/pca_test/poison.csv"), destination_frame = "poisonH2O")
k_test <- sort(sample(1:8,3))
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R
index e6088d867185..92ced338b1d2 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5008_ordinal_glm_mojo_large.R
@@ -15,7 +15,7 @@ test.ordinalGlm.mojo <-
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R
index 548675cbf166..affa516fb7f8 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5303_multinomial_glm_mojo_large.R
@@ -15,7 +15,7 @@ test.multinomialGlm.mojo <-
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R
index 15f58343b0f4..f6695a8c9fad 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5355_binomial_glm_mojo_large.R
@@ -15,7 +15,7 @@ test.ordinalGlm.mojo <-
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R
index 64fb8ed18062..d899777e056e 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_5529_leaf_node_assign_gbm_mojo.R
@@ -16,7 +16,6 @@ test.gbm.leaf.assignment.mojo <-
h2o.downloadCSV(params_prob_data$tDataset[,params_prob_data$params$x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, get_leaf_node_assignment=TRUE) # perform H2O and mojo prediction and return frames
print("Finished mojo. Going to compare two frames")
- browser()
print(twoFrames)
compareStringFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1)
}, error = function(x) x)
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R
index 3f950ad67e2f..aee634d0ac2a 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_binomial_gam_mojo_large.R
@@ -20,7 +20,7 @@ test.binomial.gam.mojo <-
h2o.downloadCSV(twoFrames$h2oPredict,
sprintf("%s/h2oPred.csv", modelAndDir$dirName))
h2o.downloadCSV(twoFrames$mojoPredict,
- sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(
twoFrames$h2oPredict,
twoFrames$mojoPredict,
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R
index 8a6fd74918c1..93b1b0e02748 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7185_quasibinomial_gam_MOJO.R
@@ -41,7 +41,7 @@ test.GAM.quasibinomial <- function() {
h2o.downloadCSV(htest[1:100, x], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("enum", "numeric", "numeric")) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
twoFrames$h2oPredict[,1] <- h2o.asfactor(twoFrames$h2oPredict[,1])
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R
index 8d2045e9a0cd..4de31d3874c4 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_7860_binomial_gam_TP_CS_MOJO.R
@@ -26,7 +26,7 @@ test.GAM.binomial <- function() {
h2o.downloadCSV(test, filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("enum", "numeric", "numeric")) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
twoFrames$h2oPredict[,1] <- h2o.asfactor(twoFrames$h2oPredict[,1])
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R
index 1e5d5b7151e4..507b769fe917 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_fractionalbinomial_glm_MOJO_offset.R
@@ -23,7 +23,7 @@ test.GLM.offset.fractionalbinomial <- function() {
h2o.downloadCSV(hf[1:100, xOffset], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R
index 065aa7f83af5..65bced0aca32 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_quasibinomial_glm_MOJO_offset.R
@@ -37,7 +37,7 @@ test.GLM.offset.quasibinomial <- function() {
h2o.downloadCSV(hf[1:100, xOffset], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
predictFrame <- twoFrames$h2oPredict[, 2:3]
mojoFrame <- twoFrames$mojoPredict[, 2:3]
compareFrames(predictFrame, mojoFrame, prob=1, tolerance = 1e-6)
diff --git a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R
index 7d3483d199be..3c1699f448f7 100644
--- a/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R
+++ b/h2o-r/tests/testdir_javapredict/runit_PUBDEV_8330_tweedie_glm_MOJO_offset.R
@@ -24,7 +24,7 @@ test.GLM.offset.tweedie <- function() {
h2o.downloadCSV(hf[1:100, xOffset], filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename, col.types=c("numeric", "numeric", "numeric")) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-6)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R
index b0d9f584e784..08d4040d16a9 100644
--- a/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_4531_large.R
@@ -19,7 +19,6 @@ test <-
test_file <- locate("smalldata/logreg/prostate_train_null_column_name.csv")
test_frame <- h2o.importFile(test_file)
params = prepTest()
- browser()
doJavapredictTest("gbm",test_file,test_frame,params) # make sure original code run
# check a separator that is not a special character
@@ -53,4 +52,4 @@ prepTest <- function() {
return(params)
}
-doTest("pubdev-4531: PredictCsv test", test)
\ No newline at end of file
+doTest("pubdev-4531: PredictCsv test", test)
diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R
index 0cfe66c57166..2d13ea3eeb89 100644
--- a/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R
+++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_5351_pca_mojo_large.R
@@ -16,7 +16,7 @@ test.PCA.mojo <-
h2o.downloadCSV(params_prob_data$tDataset, filename)
twoFrames<-mojoH2Opredict(modelAndDir$model, modelAndDir$dirName, filename) # perform H2O and mojo prediction and return frames
h2o.downloadCSV(twoFrames$h2oPredict, sprintf("%s/h2oPred.csv", modelAndDir$dirName))
- h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirname))
+ h2o.downloadCSV(twoFrames$mojoPredict, sprintf("%s/mojoOut.csv", modelAndDir$dirName))
compareFrames(twoFrames$h2oPredict,twoFrames$mojoPredict, prob=1, tolerance = 1e-4)
}
diff --git a/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R b/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R
index 7121f7694eae..df83640a3f4d 100644
--- a/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R
+++ b/h2o-r/tests/testdir_javapredict/runit_pubdev_6617_setInvNumNA.R
@@ -18,7 +18,6 @@ test.mojo.setInvNumNA <-
params$y <- "C2"
params$family <- "gaussian"
modelAndDir<-buildModelSaveMojoGLM(params) # build the model and save mojo
- browser()
modelPred <- h2o.predict(modelAndDir$model, testModel) # predict with invalid row value replaced with mean value
# get genmodel.jar pathname
a = strsplit(modelAndDir$dirName, '/')
diff --git a/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R b/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R
index 0c6b0b0ee094..4d3af484b75b 100644
--- a/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R
+++ b/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R
@@ -4,8 +4,8 @@ source("../../scripts/h2o-r-test-setup.R")
# R behavior: Reports an error but keeps the frame as is
test.pubdev.2800 <- function(conn){
- df <- h2o.importFile("http://h2o-smalldata.s3.amazonaws.com/jira/test_string_missing.csv")
+ df <- h2o.importFile(locate("smalldata/jira/test_string_missing.csv")) # resolve the local copy via locate(), like the other tests
expect_false(is.na(df[3,2]))
}
-doTest("'0' Parsed incorrectly", test.pubdev.2800)
\ No newline at end of file
+doTest("'0' Parsed incorrectly", test.pubdev.2800)
diff --git a/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R b/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R
index 3a5c19945d4c..49005fb01a1f 100644
--- a/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R
+++ b/h2o-r/tests/testdir_jira/runit_PUBDEV_7362_merge_duplicate_others.R
@@ -4,38 +4,40 @@ source("../../scripts/h2o-r-test-setup.R")
# problem with merge.
test <- function() {
# code from Kuba
- left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15))) # [A, 12][B, 13][C, 14][D, 15]
- right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000))) #[Y, 10000][B, 20000][X, 30000][D, 40000]
+ left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15), stringsAsFactors = TRUE)) # [A, 12][B, 13][C, 14][D, 15]
+ right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000), stringsAsFactors = TRUE)) #[Y, 10000][B, 20000][X, 30000][D, 40000]
merged <- h2o.merge(right, left, all.x = TRUE, method="radix")
- resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), bigvalue=c(20000, 40000, 30000, 10000), value = c(13, 15, NA, NA)))
+ resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), bigvalue=c(20000, 40000, 30000, 10000), value = c(13, 15, NA, NA), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic"))
merged <- h2o.merge(left, right, all.y = TRUE, method="radix")
- resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000)))
+ resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic"))
merged <- h2o.merge(left, right, all.x = FALSE, all.y = FALSE, method="radix")
- resultF <- as.h2o(data.frame(topic=c("B","D"), value = c(13, 15), bigvalue=c(20000, 40000)))
+ resultF <- as.h2o(data.frame(topic=c("B","D"), value = c(13, 15), bigvalue=c(20000, 40000), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic"))
merged <- h2o.merge(right, left, all.x = FALSE, all.y = FALSE, method="radix")
- resultF <- as.h2o(data.frame(topic=c("B","D"), bigvalue=c(20000, 40000), value = c(13, 15)))
+ resultF <- as.h2o(data.frame(topic=c("B","D"), bigvalue=c(20000, 40000), value = c(13, 15), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic"))
# customer code
left_hf <- as.h2o(data.frame(fruit = c(-177000000, -4000000, 100000000000, 200000000000, 1000000000000),
- color <- c('red', 'orange', 'yellow', 'red', 'blue')))
- right_hf <- as.h2o(data.frame(fruit = c(-177000000), citrus <- c(FALSE)))
+ color = c('red', 'orange', 'yellow', 'red', 'blue'), stringsAsFactors = TRUE))
+ right_hf <- as.h2o(data.frame(fruit = c(-177000000), citrus = c(FALSE)))
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <- as.h2o(data.frame(fruit = c(100000000000,200000000000,1000000000000,-177000000,-4000000),
- color=c('yellow','red','blue','red','orange'), citrus=c(NA, NA, NA, FALSE, NA)))
+ color=c('yellow','red','blue','red','orange'), citrus=c(NA, NA, NA, FALSE, NA),
+ stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# left frame starts lower
- left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256), color <- c('red', 'orange', 'yellow', 'red', 'blue')))
+ left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256), color = c('red', 'orange', 'yellow', 'red', 'blue'),
+ stringsAsFactors = TRUE))
right_hf <- as.h2o( data.frame(fruit = c(258,518,517,1030,1028,1028,1030,2049),
- citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE)))
+ citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE)))
merged2 <- h2o.merge(left_hf, right_hf, all.y = TRUE) # H2O give wrong answer
print(merged2)
resultF <- as.h2o(data.frame(fruit=c(258,517,518,1028,1028,1030,1030,2049),
@@ -45,13 +47,15 @@ test <- function() {
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
print(merged)
- resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257), color=c('yellow','red','orange','blue','red'), citrus=c(NA,NA,NA,NA,NA)))
+ resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257), color=c('yellow','red','orange','blue','red'),
+ citrus=c(NA,NA,NA,NA,NA), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# both frame more or less overlapped
left_hf <- as.h2o(data.frame(fruit = c(2,3,3,3,0,4,7,9,257,256,518,518,1028),
color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple',
- 'cyan','red', 'orange', 'yellow', 'red', 'blue','negra')))
+ 'cyan','red', 'orange', 'yellow', 'red', 'blue','negra'),
+ stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(3,3,3,3,6,8,12,14,258,518,518,517,1030,1028,1028,1030,2049),
citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE, TRUE, FALSE,
FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE)))
@@ -61,7 +65,8 @@ test <- function() {
'yellow','red','red','red','red','purple','cyan','red','yellow','orange',
'red','red','blue','blue','negra','negra'),
citrus=c(NA,NA,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,
- NA,NA,NA,NA,NA,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE)))
+ NA,NA,NA,NA,NA,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE),
+ stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
merged <- h2o.merge(left_hf, right_hf, all.x=FALSE, all.y=FALSE)
@@ -69,86 +74,87 @@ test <- function() {
color=c('orange','orange','orange','orange','yellow','yellow','yellow','yellow',
'red','red','red','red','red','red','blue','blue','negra','negra'),
citrus=c(TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,
- FALSE,FALSE,FALSE,FALSE,FALSE,FALSE)))
+ FALSE,FALSE,FALSE,FALSE,FALSE,FALSE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged, "fruit"), h2o.arrange(resultF,"fruit"))
# both frame with duplicate keys
# left frame starts higher and with overlap
- left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028), color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan')))
- right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035), citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE)))
+ left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028), color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), stringsAsFactors = TRUE))
+ right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035), citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE), stringsAsFactors = TRUE))
merged <- h2o.merge(left_hf, right_hf, all.x = FALSE, all.y=FALSE)
- resultF <- as.h2o(data.frame(fruit=c(518, 1028), color=c('purple', 'cyan'), citrus=c(TRUE, TRUE)))
+ resultF <- as.h2o(data.frame(fruit=c(518, 1028), color=c('purple', 'cyan'), citrus=c(TRUE, TRUE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# left frame starts higher and no overlap
left_hf <- as.h2o(data.frame(fruit = c(2,3,0,14,15,16,17),
- color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan')))
+ color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'),
+ stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(258,518,517,1030,1028,1030,1035),
- citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE)))
+ citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE)))
merged <- h2o.merge(left_hf, right_hf, all.x = FALSE, all.y=FALSE)
print(merged)
expect_true((nrow(merged) == 0 && ncol(merged) == 3), info="Merged frame and expected result are different in size.")
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <-as.h2o(data.frame(fruit=c(0,2,3,14,15,16,17),
color=c('yellow','red','orange','red', 'blue', 'purple', 'cyan'),
- citrus=c(NA,NA,NA,NA,NA,NA,NA)))
+ citrus=c(NA,NA,NA,NA,NA,NA,NA), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# code from Kuba
- left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15))) # [A, 12][B, 13][C, 14][D, 15]
- right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000))) #[Y, 10000][B, 20000][X, 30000][D, 40000]
+ left <- as.h2o(data.frame(topic=c("A","B","C","D"), value=c(12,13,14,15), stringsAsFactors = TRUE)) # [A, 12][B, 13][C, 14][D, 15]
+ right <- as.h2o(data.frame(topic=c("Y","B","X","D"), bigValue=c(10000, 20000, 30000, 40000), stringsAsFactors = TRUE)) #[Y, 10000][B, 20000][X, 30000][D, 40000]
merged <- h2o.merge(left, right, all.y = TRUE, method="radix")
- resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000)))
+ resultF <- as.h2o(data.frame(topic=c("B","D","X","Y"), value = c(13, 15, NA, NA), bigvalue=c(20000, 40000, 30000, 10000), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"topic"), h2o.arrange(resultF,"topic"))
# example from Neema
left_hf <- as.h2o(data.frame(fruit = c(-177000000, -4000000, 100000000000, 200000000000, 1000000000000),
- color <- c('red', 'orange', 'yellow', 'red', 'blue')))
+ color = c('red', 'orange', 'yellow', 'red', 'blue'), stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(-177000000, -177000000),
- citrus <- c(FALSE)))
+ citrus = c(FALSE)))
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <- as.h2o(data.frame(fruit = c(100000000000,200000000000,1000000000000,-177000000,-177000000,-4000000),
- color=c('yellow','red','blue','red','red','orange'), citrus=c(NA, NA, NA, FALSE, FALSE, NA)))
+ color=c('yellow','red','blue','red','red','orange'), citrus=c(NA, NA, NA, FALSE, FALSE, NA), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
merged <- h2o.merge(left_hf, right_hf, all.y = TRUE)
resultF <- as.h2o(data.frame(fruit = c(-177000000,-177000000),
- color=c('red','red'), citrus=c(FALSE, FALSE)))
+ color=c('red','red'), citrus=c(FALSE, FALSE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# more or less overlapped
left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028),
- color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan')))
+ color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(2,1,3,258,518,517,1030,1028,1030,1035,0),
- citrus <- c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)))
+ citrus = c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)))
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <- as.h2o(data.frame(fruit=c(0,2,3,256,257,518,1028),
color=c('yellow','red','orange','blue','red','purple','cyan'),
- citrus=c(TRUE, FALSE, FALSE, NA, NA, TRUE, TRUE)))
+ citrus=c(TRUE, FALSE, FALSE, NA, NA, TRUE, TRUE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# left frame with duplicate keys
left_hf <- as.h2o(data.frame(fruit = c(2,3,3,3,3,0,257,256,518,1028, 1028, 1028),
- color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan',
- 'black','red','violet','magenta','cyan')))
+ color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan',
+ 'black','red','violet','magenta','cyan'), stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(258,517,1030,1028,1030,2049),
- citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE)))
+ citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE)))
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <- as.h2o(data.frame(fruit=c(0,2,3,3,3,3,256,257,518,1028,1028,1028),
color=c('purple','red', 'orange', 'yellow', 'red', 'blue','black','cyan','red','violet','magenta','cyan'),
- citrus=c(NA,NA,NA,NA,NA,NA,NA,NA,NA,FALSE,FALSE,FALSE)))
+ citrus=c(NA,NA,NA,NA,NA,NA,NA,NA,NA,FALSE,FALSE,FALSE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
# rite frame with duplicate keys
left_hf <- as.h2o(data.frame(fruit = c(2,3,0,257,256,518,1028),
- color <- c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan')))
+ color = c('red', 'orange', 'yellow', 'red', 'blue', 'purple', 'cyan'), stringsAsFactors = TRUE))
right_hf <- as.h2o(data.frame(fruit = c(3,3,3,3,258,518,517,1030,1028,1028,1030,2049),
- citrus <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE, TRUE, FALSE, FALSE, TRUE)))
+ citrus = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE,TRUE, TRUE, FALSE, FALSE, TRUE)))
merged <- h2o.merge(left_hf, right_hf, all.x = TRUE)
resultF <- as.h2o(data.frame(fruit=c(0,2,3,3,3,3,256,257,518,1028,1028),
color=c('yellow','red','orange','orange','orange','orange','blue','red','purple','cyan','cyan'),
- citrus=c(NA, NA, TRUE, TRUE, FALSE, FALSE, NA, NA, FALSE, TRUE, FALSE)))
+ citrus=c(NA, NA, TRUE, TRUE, FALSE, FALSE, NA, NA, FALSE, TRUE, FALSE), stringsAsFactors = TRUE))
assertMergeCorrect(h2o.arrange(merged,"fruit"), h2o.arrange(resultF,"fruit"))
}
diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
index 70cd5b20f450..a3199523e3dd 100644
--- a/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
+++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_categorical_continuous.R
@@ -12,7 +12,8 @@ test.continuous.or.categorical <- function() {
aa <- data.frame(
h1 = c( 1, 8, 4, 3, 6),
h2 = c('fish', 'cat', 'fish', 'dog', 'bird'),
- h3 = c( 0, 1, 0, 0, 1)
+ h3 = c( 0, 1, 0, 0, 1),
+ stringsAsFactors = TRUE
)
df.hex <- as.h2o(aa)
diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
index f26476d67026..5b37d499a6c5 100644
--- a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
+++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
@@ -36,7 +36,7 @@ test.continuous.or.categorical <- function() {
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
- expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+ expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
# col.types as character vector
df.hex2 <- h2o.importFile(locate("smalldata/iris/iris.csv"), col.types=c("Numeric","Numeric","Enum","Numeric","Enum"))
@@ -66,7 +66,7 @@ test.continuous.or.categorical <- function() {
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
- expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+ expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
# col.types as character vector
df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
@@ -98,7 +98,7 @@ test.continuous.or.categorical <- function() {
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
- expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+ expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
# col.types as character vector
df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"),
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1383.R b/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
index 494c98a776ec..196b84bc2739 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1383.R
@@ -6,7 +6,7 @@ source("../../scripts/h2o-r-test-setup.R")
test.pubdev.1383 <- function() {
k <- 10
Log.info("Importing fgl_tr.csv...")
- fgl.dat <- read.csv(locate("smalldata/pca_test/fgl_tr.csv"))
+ fgl.dat <- read.csv(locate("smalldata/pca_test/fgl_tr.csv"), stringsAsFactors = TRUE)
fgl.hex <- h2o.importFile(locate("smalldata/pca_test/fgl_tr.csv"))
print(summary(fgl.hex))
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1398.R b/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
index bd72fbe2a9ad..c2be64abe8c9 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1398.R
@@ -6,7 +6,7 @@ source("../../scripts/h2o-r-test-setup.R")
test.pubdev.1398 <- function() {
k <- 13
Log.info("Importing decathlon.csv...")
- dec.dat <- read.csv(locate("smalldata/pca_test/decathlon.csv"))
+ dec.dat <- read.csv(locate("smalldata/pca_test/decathlon.csv"), stringsAsFactors = TRUE)
dec.hex <- h2o.importFile(locate("smalldata/pca_test/decathlon.csv"))
print(summary(dec.hex))
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_1654.R b/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
index df947c113326..08e573da130e 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_1654.R
@@ -8,7 +8,7 @@ test.pubdev.1654 <- function() {
use_all_factor_levels <- FALSE
Log.info("Importing birds.csv data...")
- birds.dat <- read.csv(locate("smalldata/pca_test/birds.csv"), header = TRUE)
+ birds.dat <- read.csv(locate("smalldata/pca_test/birds.csv"), header = TRUE, stringsAsFactors = TRUE)
birds.hex <- h2o.importFile(locate("smalldata/pca_test/birds.csv"))
print(summary(birds.hex))
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R b/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
index e12f7694ffcc..4cca01509f0c 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_5518_autoencoder_grid.R
@@ -3,7 +3,6 @@ source("../../scripts/h2o-r-test-setup.R")
# Test derived from Nidhi Mehta. Thanks.
test.pubdev.5518 <- function() {
- browser()
N=1000
set.seed(5)
color = sample(c("D","E","I","F","M"),size=N,replace=TRUE)
diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_8218.R b/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
index 357a1fc12a45..7b764ba52413 100644
--- a/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
+++ b/h2o-r/tests/testdir_jira/runit_pubdev_8218.R
@@ -6,7 +6,8 @@ test.pubdev_8218 = function(){
df = data.frame(
v1 = c('Y', 'Y', 'Y', 'N', 'N'),
v2 = c('S', 'S', 'S', 'A', 'A'),
- v3 = c('E1', 'E1', 'E1', 'B1', 'B1')
+ v3 = c('E1', 'E1', 'E1', 'B1', 'B1'),
+ stringsAsFactors = TRUE
)
df.hex = as.h2o(df, 'dfhex')
interaction = h2o.interaction(
@@ -30,7 +31,8 @@ test.pubdev_8218 = function(){
df = data.frame(
v1 = c('Y', 'Y', 'Y', 'N', 'N', 'Y'),
v2 = c('S', 'S', 'S', 'A', 'A', 'N'),
- v3 = c('E1', 'E1', 'E1', 'B1', 'B1', 'B1')
+ v3 = c('E1', 'E1', 'E1', 'B1', 'B1', 'B1'),
+ stringsAsFactors = TRUE
)
df.hex = as.h2o(df, 'dfhex')
interaction = h2o.interaction(
diff --git a/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R b/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
index 0bd6e3507df1..8579e17b124c 100644
--- a/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
+++ b/h2o-r/tests/testdir_misc/runit_PUBDEV-6775-2D-pdp.R
@@ -7,7 +7,6 @@ test <- function() {
## Change CAPSULE to Enum
prostate_hex[, "CAPSULE"] = as.factor(prostate_hex[, "CAPSULE"])
- browser()
## Run Random Forest in H2O
temp_filename_no_extension <- tempfile(pattern = "pdp", tmpdir = tempdir(), fileext = "")
## Calculate partial dependence using h2o.partialPlot for columns "AGE" and "RACE"
diff --git a/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R b/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
index 437a51ebf532..6dcc0dd6a3a4 100644
--- a/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
+++ b/h2o-r/tests/testdir_misc/runit_as.h2o_sparse.R
@@ -22,7 +22,8 @@ test.as.h2o.sparse <- function() {
j <- c(2, 9, 6:10, 46343)
x <- pi * (1:8)
m.large <- Matrix::sparseMatrix(i, j, x = x)
- expect_error(as.matrix(m.large), "Cholmod error 'problem too large'")
+ # When enough memory is available, R 4.4 can create the dense matrix without failing, so the expect_error check below is disabled.
+ # expect_error(as.matrix(m.large), "Cholmod error 'problem too large'|vector memory limit of .* reached")
Log.info("Loading a large sparse matrix into H2O")
h2o.large <- as.h2o(m.large, "large_matrix")
diff --git a/h2o-r/tests/testdir_misc/runit_h2oconfig.R b/h2o-r/tests/testdir_misc/runit_h2oconfig.R
index 0b52c13eb439..7afada31b7b0 100644
--- a/h2o-r/tests/testdir_misc/runit_h2oconfig.R
+++ b/h2o-r/tests/testdir_misc/runit_h2oconfig.R
@@ -88,7 +88,7 @@ test.config <- function() {
password = password"),fileConn)
#Parse config and check if correct
config = .parse.h2oconfig(h2oconfig_filename)
- expect_equal(config,data.frame(init.username = "name" ,init.password = "password"))
+ expect_equal(config,data.frame(init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
#Create tmp config
writeLines(c("[general]
@@ -99,7 +99,7 @@ test.config <- function() {
password = password"),fileConn)
#Parse config and check if correct
config = .parse.h2oconfig(h2oconfig_filename)
- expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password"))
+ expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
#Create tmp config
writeLines(c("
@@ -108,10 +108,10 @@ test.config <- function() {
init.password = password"),fileConn)
#Parse config and check if correct
config = .parse.h2oconfig(h2oconfig_filename)
- expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password"))
+ expect_equal(config,data.frame(general.allow_breaking_changes = as.factor("True"),init.username = "name" ,init.password = "password", stringsAsFactors = TRUE))
#Delete tmp directory
on.exit(unlink(dir,recursive=TRUE))
}
-doTest("Test h2o config parsing", test.config)
\ No newline at end of file
+doTest("Test h2o config parsing", test.config)
diff --git a/h2o-r/tests/testdir_misc/runit_ifelse.R b/h2o-r/tests/testdir_misc/runit_ifelse.R
index e7e06c3114d3..8733385ef00f 100644
--- a/h2o-r/tests/testdir_misc/runit_ifelse.R
+++ b/h2o-r/tests/testdir_misc/runit_ifelse.R
@@ -10,7 +10,7 @@ test.ifelse <- function() {
Log.info("Find Setosa species H2O's ifelse...")
setosa.hex <- ifelse(iris.hex$Species == "setosa", "N", "Y")
- expect_equal(as.data.frame(setosa.hex), data.frame(C1 = setosa))
+ expect_equal(as.data.frame(setosa.hex), data.frame(C1 = setosa, stringsAsFactors = TRUE))
}
doTest("R and H2O ifelse Function", test.ifelse)
diff --git a/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R b/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
index 0400d1b1660b..b1b36cfebe2b 100644
--- a/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
+++ b/h2o-r/tests/testdir_misc/runit_import_upload_singlequoted.R
@@ -13,7 +13,7 @@ test.import_single_quoted <- function() {
expect_true(h2o.ncol(hdf) == 20)
expect_true(h2o.nrow(hdf) == 7)
- df <- read.csv(path, quote="'")
+ df <- read.csv(path, quote="'", stringsAsFactors = TRUE)
hddf <- as.data.frame(hdf)
# comparing last column only as it's difficult to compare dataframes in R (always cryptic errors on some column):
# if parsing was ok, last column should be identical, otherwise it should be shifted
@@ -29,7 +29,7 @@ test.upload_single_quoted <- function() {
expect_true(h2o.ncol(hdf) == 20)
expect_true(h2o.nrow(hdf) == 7)
- df <- read.csv(path, quote="'")
+ df <- read.csv(path, quote="'", stringsAsFactors = TRUE)
hddf <- as.data.frame(hdf)
expect_equal(df['status'], hddf['status'])
}
diff --git a/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R b/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
index ebc210f9c5e1..8254ea838708 100644
--- a/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
+++ b/h2o-r/tests/testdir_misc/runit_pubdev_5921_na_prints_large.R
@@ -14,7 +14,6 @@ testPartialPlots <- function() {
assert_twoDTable_equal(h2o_pp_weight[[1]], h2o_pp_weight_NA[[1]]) # compare Input_miss pdp
assert_twoDTable_equal(h2o_pp_weight[[2]], h2o_pp_weight_NA[[2]]) # compare fDayOfWeek pdp
- browser()
manual_weighted_stats_im <- manual_partial_dependency(airlines_gbm, airlines_hex, h2o_pp_weight_NA[[1]][[1]], "Input_miss", as.data.frame(airlines_hex["Weight"]), 3)
assert_twoDTable_array_equal(h2o_pp_weight_NA[[1]], manual_weighted_stats_im[1,], manual_weighted_stats_im[2,], manual_weighted_stats_im[3,])
manual_weighted_stats_day <- manual_partial_dependency(airlines_gbm, airlines_hex, h2o_pp_weight_NA[[2]][[1]], "fDayOfWeek", as.data.frame(airlines_hex["Weight"]), 3)
diff --git a/h2o-r/tests/testdir_misc/runit_relevel.R b/h2o-r/tests/testdir_misc/runit_relevel.R
index 48b57b61620e..87d95edfc7ad 100644
--- a/h2o-r/tests/testdir_misc/runit_relevel.R
+++ b/h2o-r/tests/testdir_misc/runit_relevel.R
@@ -28,7 +28,7 @@ test.relevel <- function() {
expect_true(("DPROS.Both" %in% ns2), "Both level IS NOT expected to be skipped in re-leveled column")
# compare against R
- dr <- read.csv(locate("smalldata/prostate/prostate_cat.csv"))
+ dr <- read.csv(locate("smalldata/prostate/prostate_cat.csv"), stringsAsFactors=TRUE)
dr$DPROS <- relevel(dr$DPROS,"None")
mr <- glm(data=dr,CAPSULE ~ ., family=binomial)
print(mr)
diff --git a/h2o-r/tests/testdir_munging/unop/runit_head_tail.R b/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
index e9aa742a9a57..ee29a2fa23f7 100644
--- a/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
+++ b/h2o-r/tests/testdir_munging/unop/runit_head_tail.R
@@ -6,7 +6,7 @@ source("../../../scripts/h2o-r-test-setup.R")
test.head_tail <- function() {
Log.info("Uploading iris/iris_wheader.csv")
iris.hex <- h2o.importFile(locate("smalldata/iris/iris_wheader.csv"), "iris_wheader.hex")
- iris.dat <- read.csv(locate("smalldata/iris/iris_wheader.csv"))
+ iris.dat <- read.csv(locate("smalldata/iris/iris_wheader.csv"), stringsAsFactors=TRUE)
nrows <- nrow(iris.dat)
ncols <- ncol(iris.dat)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R b/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
index 25fc429a970f..43d5f4a1d726 100644
--- a/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
+++ b/h2o-r/tests/testdir_parser/runit_GH_15741_force_col_types.R
@@ -2,7 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../scripts/h2o-r-test-setup.R")
test.force_col_types <- function() {
- browser()
originalTypes <- c("real", "int", "int", "int", "int", "string", "real", "string", "real", "real", "enum", "int", "int", "int", "int", "enum", 'real', 'real', "enum", "enum", "enum", 'real', "int", "int", "enum", "enum", "string", "int", "int", "int", "int", "int", "int", "int", "enum", "int", "string", "int", "string", "int", "string", "string", 'real', "int", "string", "int", 'real', 'real', "int", "int")
h2odata <- h2o.importFile(path = locate("smalldata/parser/synthetic_dataset.csv"))
checkTypes(originalTypes, h2odata)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R b/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
index 86a4fa29c5e0..10925a38e511 100644
--- a/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
+++ b/h2o-r/tests/testdir_parser/runit_GH_15741_parquet_force_col_types.R
@@ -2,7 +2,6 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../scripts/h2o-r-test-setup.R")
test.force_col_types <- function() {
- browser()
originalTypes <- c("real", "int") # old H2O parse column tyoes
h2odata <- h2o.importFile(path = locate("smalldata/parser/parquet/df.parquet"))
checkTypes(originalTypes, h2odata)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
new file mode 100644
index 000000000000..94c7e016ef9a
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
@@ -0,0 +1,10 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) # work from the directory of the file given by the f command-line argument
+source("../../scripts/h2o-r-test-setup.R")
+
+test.skipped_columns <- function() { # checks that as.h2o() accepts skipped_columns and drops the requested columns
+ iris_hf <- as.h2o(iris, skipped_columns=c(1,2)) # convert iris while asking for two columns to be skipped
+ expect_true(ncol(iris_hf) == (ncol(iris)-2)) # the H2O frame must be exactly two columns narrower than iris
+ print("Columns are skipped!!!")
+}
+
+doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)
diff --git a/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R b/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R
new file mode 100644
index 000000000000..0ee55e1b639c
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_GH_16161_parquet_npe.R
@@ -0,0 +1,19 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f"))) # work from the directory of the file given by the f command-line argument
+source("../../scripts/h2o-r-test-setup.R")
+
+test.parseParquetString<- function() { # round-trips a generated frame with a string column through Parquet export and import
+ df <- h2o.createFrame(rows = 100,
+ cols = 10,
+ string_fraction = 0.1, # create one string column (10 percent of the 10 columns)
+ seed = 5, # fixed seeds, presumably so the generated frame is reproducible (TODO confirm)
+ seed_for_column_types = 25)
+ target <- file.path(sandbox(), "createdFrame.parquet") # export destination inside the directory returned by sandbox()
+ h2o.exportFile(data = df,
+ path = target,
+ format = "parquet",
+ write_checksum = FALSE)
+ df2 <- h2o.importFile(target) # read the exported Parquet file back in
+ compareFrames(df, df2) # the re-imported frame is compared against the original
+}
+
+doTest("Test Parquet String export error.", test.parseParquetString)
diff --git a/h2o-r/tests/testdir_parser/runit_parse_zstd.R b/h2o-r/tests/testdir_parser/runit_parse_zstd.R
new file mode 100644
index 000000000000..2db078cf9e18
--- /dev/null
+++ b/h2o-r/tests/testdir_parser/runit_parse_zstd.R
@@ -0,0 +1,14 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) # work from the directory of the file given by the f command-line argument
+source("../../scripts/h2o-r-test-setup.R")
+
+test.parseExportZSTD<- function() { # exports an imported CSV frame to a .zst path, re-imports it and compares both frames
+ f1 <- h2o.importFile(locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv")) # reference frame
+
+ target <- file.path(sandbox(), "gaussian_20cols_10000Rows.csv.zst") # NOTE(review): the .zst suffix presumably selects ZSTD compression on export (confirm)
+ h2o.exportFile(f1, target)
+
+ f2 <- h2o.importFile(target) # read the exported file back in
+ compareFrames(f1, f2, prob=1) # NOTE(review): prob=1 presumably makes the comparison check every value (confirm against compareFrames)
+}
+
+doTest("Test ZSTD parser and export", test.parseExportZSTD)
diff --git a/scripts/jenkins/groovy/buildConfig.groovy b/scripts/jenkins/groovy/buildConfig.groovy
index d3ea58ba1fd4..a212fe3b2150 100644
--- a/scripts/jenkins/groovy/buildConfig.groovy
+++ b/scripts/jenkins/groovy/buildConfig.groovy
@@ -14,7 +14,7 @@ class BuildConfig {
private static final String DEFAULT_HADOOP_IMAGE_NAME = 'dev-build-hadoop'
private static final String DEFAULT_RELEASE_IMAGE_NAME = 'dev-release'
- public static final int DEFAULT_IMAGE_VERSION_TAG = 44
+ public static final int DEFAULT_IMAGE_VERSION_TAG = 45
public static final String AWSCLI_IMAGE = DOCKER_REGISTRY + '/opsh2oai/awscli'
public static final String S3CMD_IMAGE = DOCKER_REGISTRY + '/opsh2oai/s3cmd'
@@ -99,7 +99,7 @@ class BuildConfig {
changesMap[COMPONENT_HADOOP] = buildHadoop
changedPythonTests = detectPythonTestChanges(changes)
- nodeLabels = NodeLabels.findByBuildURL(context.env.BUILD_URL)
+ nodeLabels = NodeLabels.LABELS_C1
supportedXGBEnvironments = [
'centos7.3': [
[name: 'CentOS 7.3 Minimal', dockerfile: 'xgb/centos/Dockerfile-centos-minimal', fromImage: 'centos:7.3.1611', targetName: XGB_TARGET_MINIMAL, nodeLabel: getDefaultNodeLabel()],
@@ -369,13 +369,13 @@ class BuildConfig {
}
static enum NodeLabels {
- LABELS_C1('docker && !mr-0xc8', 'mr-0xc9', 'gpu && !2gpu', 'mr-0xk10'), //master or nightly build
- LABELS_B4('docker', 'docker', 'gpu && !2gpu', 'docker') //PR build
+ LABELS_C1('docker && !mr-0xc8', 'mr-0xc9', 'gpu && !2gpu', 'mr-0xk10'), //master or nightly build - use only this one
+ LABELS_B4('docker', 'docker', 'gpu && !2gpu', 'docker') //PR build - not used
static Map