From f7b497e0d3814516ad73af12aeabce9104ae030f Mon Sep 17 00:00:00 2001 From: Mohit Chawla Date: Fri, 4 Aug 2023 00:35:44 +0530 Subject: [PATCH 1/4] added warning if User Passes Categorical Columns to h2o.cor() --- h2o-py/h2o/frame.py | 7 +++++++ h2o-r/h2o-package/R/frame.R | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 235a69b6c302..296822f3ec5f 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -3185,6 +3185,13 @@ def cor(self, y=None, na_rm=False, use=None, method="Pearson"): if y is None: y = self if use is None: use = "complete.obs" if na_rm else "everything" + + y_categorical = any(self.types[col_name] == "enum" for col_name in y) + + if y_categorical: + import warnings + warnings.warn("Categorical columns found in 'y'.") + if self.nrow == 1 or (self.ncol == 1 and y.ncol == 1): return ExprNode("cor", self, y, use, method)._eager_scalar() return H2OFrame._expr(expr=ExprNode("cor", self, y, use, method))._frame() diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index e90e1e564029..03ab5e6312d1 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -2880,25 +2880,34 @@ var <- function(x, y = NULL, na.rm = FALSE, use) { #' cor(prostate$AGE) #' } #' @export -h2o.cor <- function(x, y=NULL,na.rm = FALSE, use, method="Pearson"){ +h2o.cor <- function(x, y = NULL, na.rm = FALSE, use, method = "Pearson") { # Eager, mostly to match prior semantics but no real reason it need to be - if( is.null(y) ){ + if (is.null(y)) { y <- x } - if(missing(use)) { + if (missing(use)) { if (na.rm) use <- "complete.obs" else use <- "everything" } if (is.null(method) || is.na(method)) { stop("Correlation method must be specified.") } + + # Check for categorical columns in x and y + x_categorical <- any(h2o.isfactor(x)) + y_categorical <- any(h2o.isfactor(y)) + if (x_categorical || y_categorical) { + warning("Categorical columns found in x or y.") + } + # Eager, mostly to match prior semantics but no real reason it need to be - expr <- .newExpr("cor",x,y,.quote(use), .quote(method)) - if( (nrow(x)==1L || (ncol(x)==1L && ncol(y)==1L)) ) .eval.scalar(expr) - else .fetch.data(expr,ncol(x)) + expr <- .newExpr("cor", x, y, .quote(use), .quote(method)) + if ((nrow(x) == 1L || (ncol(x) == 1L && ncol(y) == 1L))) .eval.scalar(expr) + else .fetch.data(expr, ncol(x)) } + #' #' Compute a pairwise distance measure between all rows of two numeric H2OFrames. #' From e0535785f1d16afe6563280e4517c4b6a96e1508 Mon Sep 17 00:00:00 2001 From: Mohit Chawla Date: Fri, 4 Aug 2023 01:19:44 +0530 Subject: [PATCH 2/4] replaced warning text to NA --- h2o-py/h2o/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 296822f3ec5f..1818a3b25f50 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -3190,7 +3190,7 @@ def cor(self, y=None, na_rm=False, use=None, method="Pearson"): if y_categorical: import warnings - warnings.warn("Categorical columns found in 'y'.") + warnings.warn("NA") if self.nrow == 1 or (self.ncol == 1 and y.ncol == 1): return ExprNode("cor", self, y, use, method)._eager_scalar() return H2OFrame._expr(expr=ExprNode("cor", self, y, use, method))._frame() From 276864f855b1af3cb5f285d93cd8ea24de6825bf Mon Sep 17 00:00:00 2001 From: Mohit Chawla Date: Fri, 4 Aug 2023 01:23:49 +0530 Subject: [PATCH 3/4] replaced frame.R warning text --- h2o-r/h2o-package/R/frame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 03ab5e6312d1..1239adc01583 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -2898,7 +2898,7 @@ h2o.cor <- function(x, y = NULL, na.rm = FALSE, use, method = "Pearson") { y_categorical <- any(h2o.isfactor(y)) if (x_categorical || y_categorical) { - warning("Categorical columns found in x or y.") + warning("NA") } # Eager, mostly to match prior semantics but no real reason it need to be From f2b7654e47ed0ce6353f2c58dee055ef6323223b Mon Sep 17 00:00:00 2001 From: Mohit Chawla Date: Fri, 4 Aug 2023 01:42:21 +0530 Subject: [PATCH 4/4] no warning for binary categorical --- h2o-py/h2o/frame.py | 8 ++++---- h2o-r/h2o-package/R/frame.R | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 1818a3b25f50..e026cb5e9203 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -3182,13 +3182,13 @@ def cor(self, y=None, na_rm=False, use=None, method="Pearson"): assert_is_type(y, H2OFrame, None) assert_is_type(na_rm, bool) assert_is_type(use, None, "everything", "all.obs", "complete.obs") - if y is None: - y = self - if use is None: use = "complete.obs" if na_rm else "everything" - y_categorical = any(self.types[col_name] == "enum" for col_name in y) if y_categorical: + num_unique_levels = {col: len(self[col].levels()) for col in y} + multi_categorical = any(num_levels > 2 for num_levels in num_unique_levels.values()) + + if multi_categorical: import warnings warnings.warn("NA") diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 1239adc01583..4f627d183847 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -2897,10 +2897,11 @@ h2o.cor <- function(x, y = NULL, na.rm = FALSE, use, method = "Pearson") { x_categorical <- any(h2o.isfactor(x)) y_categorical <- any(h2o.isfactor(y)) - if (x_categorical || y_categorical) { - warning("NA") + if ((x_categorical && length(unique(h2o.levels(x))) > 2) || (y_categorical && length(unique(h2o.levels(y))) > 2)) { + warning("NA") } + # Eager, mostly to match prior semantics but no real reason it need to be expr <- .newExpr("cor", x, y, .quote(use), .quote(method)) if ((nrow(x) == 1L || (ncol(x) == 1L && ncol(y) == 1L))) .eval.scalar(expr) @@ -2908,6 +2909,7 @@ h2o.cor <- function(x, y = NULL, na.rm = FALSE, use, method = "Pearson") { } + #' #' Compute a pairwise distance measure between all rows of two numeric H2OFrames. #'