GH-15860: fixed problem with skipped columns. (#15867)

* Fixed problem with skipped columns: the types of the skipped columns needed to be removed from the column types.
* Update h2o-core/src/main/java/water/parser/ParseDataset.java

Co-authored-by: Marek Novotný <[email protected]>
h2oai · Oct 30, 2023 · ac63da6 · ac63da6
1 parent 0d6ab63
commit ac63da6
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 3 deletions.
diff --git a/h2o-core/src/main/java/water/parser/ParseDataset.java b/h2o-core/src/main/java/water/parser/ParseDataset.java
@@ -20,6 +20,8 @@
 import java.io.InputStream;
 import java.text.SimpleDateFormat;
 import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
@@ -412,11 +414,19 @@ public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) {
       String parseType = setup.getParseType().name();
       String[] originalColumnTypes = "PARQUET".equals(parseType) ? setup.getParquetColumnTypes() 
               : setup.getOrigColumnTypes();
-      if (originalColumnTypes != null) {
+      final int[] skippedColumns = setup.getSkippedColumns();  // column indices excluded from the parsed frame, or null
+      String[] newColumnTypes;  // originalColumnTypes with the entries of skipped columns removed
+      if (skippedColumns != null && originalColumnTypes != null) {  // null types must reach the null check below, not be dereferenced here
+        Set<Integer> skippedColIndices = Arrays.stream(skippedColumns).boxed().collect(Collectors.toSet());
+        newColumnTypes = IntStream.range(0, originalColumnTypes.length).filter(x -> !(skippedColIndices.contains(x))).mapToObj(x -> originalColumnTypes[x]).toArray(String[]::new);
+      } else {
+        newColumnTypes = originalColumnTypes;  // nothing to filter: pass the types (possibly null) through unchanged
+      }
+      if (newColumnTypes != null) {
         if ("PARQUET".equals(parseType)) // force change the column types specified by user 
-          forceChangeColumnTypesParquet(fr, originalColumnTypes);
+          forceChangeColumnTypesParquet(fr, newColumnTypes);
         else
-          forceChangeColumnTypes(fr, originalColumnTypes);
+          forceChangeColumnTypes(fr, newColumnTypes);
       }
     }
 

diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py
@@ -0,0 +1,36 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+
+# Test that force_col_types still applies the requested column types when skipped_columns is also specified.
+def test_force_col_types():
+    h2oOriginalTypes = {'C1': 'real', 'C2': 'int', 'C3': 'int', 'C4': 'int', 'C5': 'int', 'C6': 'string', 'C7': 'real',
+                'C8': 'string', 'C9': 'real', 'C10': 'real', 'C11': 'enum', 'C12': 'int', 'C13': 'int',
+                'C14': 'int', 'C15': 'int', 'C16': 'enum', 'C17': 'real', 'C18': 'real', 'C19': 'enum',
+                'C20': 'enum', 'C21': 'enum', 'C22': 'real', 'C23': 'int', 'C24': 'int', 'C25': 'enum',
+                'C26': 'enum', 'C27': 'string', 'C28': 'int', 'C29': 'int', 'C30': 'int', 'C31': 'int',
+                'C32': 'int', 'C33': 'int', 'C34': 'int', 'C35': 'enum', 'C36': 'int', 'C37': 'string',
+                'C38': 'int', 'C39': 'string', 'C40': 'int', 'C41': 'string', 'C42': 'string', 'C43': 'real',
+                'C44': 'int', 'C45': 'string', 'C46': 'int', 'C47': 'real', 'C48': 'real', 'C49': 'int', 'C50': 'int'}
+    # baseline import: no col_types given, and the resulting types must equal h2oOriginalTypes
+    h2oData = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv")) # no col_types specification
+    h2oTypes = h2oData.types
+    pyunit_utils.equal_two_dicts_string(h2oOriginalTypes, h2oTypes)
+
+    h2oTypes = h2oData.types # the type changes below need force_col_types=True
+    h2oTypes["C2"]="real"
+    h2oTypes["C48"]="int"
+    h2oData4 = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv"), col_types=h2oTypes, 
+                               force_col_types=True, skipped_columns=[0,4,5])
+    assert h2oData4.ncol == (h2oData.ncol - 3), "Expected number of columns: {0}, Actual: {1}".format(h2oData.ncol - 3,
+                                                                                                      h2oData4.ncol)
+    pyunit_utils.compare_frames_local(h2oData["C2"], h2oData4["C2"], prob=1)  # converting an int column to real should leave the values unchanged
+    # converting a real column to int changes the values through rounding, hence the 0.5 tolerance
+    pyunit_utils.compare_frames_local(h2oData["C48"], h2oData4["C48"], prob=1, tol=0.5)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_force_col_types)
+else:
+    test_force_col_types()
diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py
@@ -0,0 +1,20 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+
+def test_parquet_column_types_skipped_columns():
+    parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"))  # baseline import with no columns skipped
+    parquetForce = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"), 
+                                   force_col_types=True, skipped_columns=[0])  # same file, forcing column types while skipping column index 0
+    assert parquetForce.types["uniform_col"]=="real", "Expected type: {0}, actual: " \
+                                                      "{1}".format("real", parquetForce.types["uniform_col"])  # uniform_col must still be reported as real
+    assert parquet.ncol-1==parquetForce.ncol, "Expected column number: {0}, actual: " \
+                                              "{1}".format(parquet.ncol-1, parquetForce.ncol)  # exactly one column fewer than the baseline
+    pyunit_utils.compare_frames_local(parquet[1], parquetForce[0], prob=0.1, tol=1e-12)  # baseline column 1 is compared against column 0 of the skipped import
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_parquet_column_types_skipped_columns)
+else:
+    test_parquet_column_types_skipped_columns()