Merge remote-tracking branch 'origin/rel-3.44.0'

h2oai · Oct 30, 2023 · commit eb6e6e7
2 parents: 3d1e7b1 + 936756b

Showing 6 changed files with 102 additions and 5 deletions.
diff --git a/h2o-core/src/main/java/water/parser/ParseDataset.java b/h2o-core/src/main/java/water/parser/ParseDataset.java
@@ -20,6 +20,8 @@
 import java.io.InputStream;
 import java.text.SimpleDateFormat;
 import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
@@ -412,11 +414,19 @@ public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) {
       String parseType = setup.getParseType().name();
       String[] originalColumnTypes = "PARQUET".equals(parseType) ? setup.getParquetColumnTypes() 
               : setup.getOrigColumnTypes();
-      if (originalColumnTypes != null) {
+      final int[] skippedColumns = setup.getSkippedColumns();
+      String[] newColumnTypes;
+      if (skippedColumns != null) {  // need to remove column types of skipped columns
+        Set<Integer> skippedColIndices = Arrays.stream(skippedColumns).boxed().collect(Collectors.toSet());
+        newColumnTypes = IntStream.range(0, originalColumnTypes.length).filter(x -> !(skippedColIndices.contains(x))).mapToObj(x -> originalColumnTypes[x]).toArray(String[]::new);
+      } else {
+        newColumnTypes = originalColumnTypes;
+      }
+      if (newColumnTypes != null) {
         if ("PARQUET".equals(parseType)) // force change the column types specified by user 
-          forceChangeColumnTypesParquet(fr, originalColumnTypes);
+          forceChangeColumnTypesParquet(fr, newColumnTypes);
         else
-          forceChangeColumnTypes(fr, originalColumnTypes);
+          forceChangeColumnTypes(fr, newColumnTypes);
       }
     }
 

diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
@@ -1980,8 +1980,8 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
                         os.remove(fileName)
                         os.rmdir(tmpdir)
                 elif not(can_use_datatable()):
-                    warnings("multi_thread mode can only be used when you have datatable "
-                             "installed.")                   
+                    warnings.warn("multi_thread mode can only be used when you have datatable "
+                             "installed.  Defaults to single-thread operation.")                   
             return pandas.read_csv(StringIO(self.get_frame_data()), low_memory=False, skip_blank_lines=False)
         from h2o.utils.csv.readers import reader
         frame = [row for row in reader(StringIO(self.get_frame_data()))]

diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py
@@ -595,6 +595,9 @@ def javamunge(assembly, pojoname, test, compile_only=False):
 def install(package):
     subprocess.check_call([sys.executable, "-m", "pip", "install", package])
 
+def uninstall(package):
+    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", package])
+
 def locate(path):
     """
     Search for a relative path and turn it into an absolute path.

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15861_no_datatable.py b/h2o-py/tests/testdir_misc/pyunit_gh_15861_no_datatable.py
@@ -0,0 +1,28 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+from h2o.utils.shared_utils import (can_use_datatable)
+
+def test_datatable_without_datatable():
+    delTable = False
+    if can_use_datatable():
+        delTable = True
+        pyunit_utils.uninstall("datatable")
+
+    try:   
+        # should run to completion
+        with pyunit_utils.catch_warnings() as ws:
+            h2oFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"))
+            new_frame = h2oFrame.as_data_frame(multi_thread=True) 
+            assert "multi_thread mode can only be used when you have datatable installed.  Defaults to single-thread " \
+                "operation." in str(ws[0].message)
+    finally:
+        # re-install datatable before quitting.     
+        if delTable:
+            pyunit_utils.install("datatable")
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_datatable_without_datatable)
+else:
+    test_datatable_without_datatable()
diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py
@@ -0,0 +1,36 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+
+# Test that force_col_types still works when some columns are skipped.
+def test_force_col_types():
+    h2oOriginalTypes = {'C1': 'real', 'C2': 'int', 'C3': 'int', 'C4': 'int', 'C5': 'int', 'C6': 'string', 'C7': 'real',
+                'C8': 'string', 'C9': 'real', 'C10': 'real', 'C11': 'enum', 'C12': 'int', 'C13': 'int',
+                'C14': 'int', 'C15': 'int', 'C16': 'enum', 'C17': 'real', 'C18': 'real', 'C19': 'enum',
+                'C20': 'enum', 'C21': 'enum', 'C22': 'real', 'C23': 'int', 'C24': 'int', 'C25': 'enum',
+                'C26': 'enum', 'C27': 'string', 'C28': 'int', 'C29': 'int', 'C30': 'int', 'C31': 'int',
+                'C32': 'int', 'C33': 'int', 'C34': 'int', 'C35': 'enum', 'C36': 'int', 'C37': 'string',
+                'C38': 'int', 'C39': 'string', 'C40': 'int', 'C41': 'string', 'C42': 'string', 'C43': 'real',
+                'C44': 'int', 'C45': 'string', 'C46': 'int', 'C47': 'real', 'C48': 'real', 'C49': 'int', 'C50': 'int'}
+    # import file
+    h2oData = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv")) # no col_types specification
+    h2oTypes = h2oData.types
+    pyunit_utils.equal_two_dicts_string(h2oOriginalTypes, h2oTypes)
+
+    h2oTypes = h2oData.types # these changes needs force_col_types = True
+    h2oTypes["C2"]="real"
+    h2oTypes["C48"]="int"
+    h2oData4 = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv"), col_types=h2oTypes, 
+                               force_col_types=True, skipped_columns=[0,4,5])
+    assert h2oData4.ncol == (h2oData.ncol - 3), "Expected number of columns: {0}, Actual: {1}".format(h2oData.ncol - 3,
+                                                                                                      h2oData4.ncol)
+    pyunit_utils.compare_frames_local(h2oData["C2"], h2oData4["C2"], prob=1)  # change from int column to real columns should be fine
+    # change from real to int will generate columns with different values due to rounding
+    pyunit_utils.compare_frames_local(h2oData["C48"], h2oData4["C48"], prob=1, tol=0.5)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_force_col_types)
+else:
+    test_force_col_types()
diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py
@@ -0,0 +1,20 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+
+def test_parquet_column_types_skipped_columns():
+    parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"))
+    parquetForce = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"), 
+                                   force_col_types=True, skipped_columns=[0])
+    assert parquetForce.types["uniform_col"]=="real", "Expected type: {0}, actual: " \
+                                                      "{1}".format("real", parquetForce.types["uniform_col"])
+    assert parquet.ncol-1==parquetForce.ncol, "Expected column number: {0}, actual: " \
+                                              "{1}".format(parquet.ncol-1, parquetForce.ncol)
+    pyunit_utils.compare_frames_local(parquet[1], parquetForce[0], prob=0.1, tol=1e-12)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_parquet_column_types_skipped_columns)
+else:
+    test_parquet_column_types_skipped_columns()