diff --git a/h2o-core/src/main/java/water/parser/ParseDataset.java b/h2o-core/src/main/java/water/parser/ParseDataset.java index e43f0bd84040..c2f06a4398e5 100644 --- a/h2o-core/src/main/java/water/parser/ParseDataset.java +++ b/h2o-core/src/main/java/water/parser/ParseDataset.java @@ -20,6 +20,8 @@ import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.zip.GZIPInputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -412,11 +414,19 @@ public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) { String parseType = setup.getParseType().name(); String[] originalColumnTypes = "PARQUET".equals(parseType) ? setup.getParquetColumnTypes() : setup.getOrigColumnTypes(); - if (originalColumnTypes != null) { + final int[] skippedColumns = setup.getSkippedColumns(); + String[] newColumnTypes; + if (skippedColumns != null) { // need to remove column types of skipped columns + Set skippedColIndices = Arrays.stream(skippedColumns).boxed().collect(Collectors.toSet()); + newColumnTypes = IntStream.range(0, originalColumnTypes.length).filter(x -> !(skippedColIndices.contains(x))).mapToObj(x -> originalColumnTypes[x]).toArray(String[]::new); + } else { + newColumnTypes = originalColumnTypes; + } + if (newColumnTypes != null) { if ("PARQUET".equals(parseType)) // force change the column types specified by user - forceChangeColumnTypesParquet(fr, originalColumnTypes); + forceChangeColumnTypesParquet(fr, newColumnTypes); else - forceChangeColumnTypes(fr, originalColumnTypes); + forceChangeColumnTypes(fr, newColumnTypes); } } diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py new file mode 100644 index 000000000000..4cf6bd244132 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_GH_15860_force_col_types_skipped_columns.py @@ -0,0 +1,36 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + +# test and make sure that the force_col_types works +def test_force_col_types(): + h2oOriginalTypes = {'C1': 'real', 'C2': 'int', 'C3': 'int', 'C4': 'int', 'C5': 'int', 'C6': 'string', 'C7': 'real', + 'C8': 'string', 'C9': 'real', 'C10': 'real', 'C11': 'enum', 'C12': 'int', 'C13': 'int', + 'C14': 'int', 'C15': 'int', 'C16': 'enum', 'C17': 'real', 'C18': 'real', 'C19': 'enum', + 'C20': 'enum', 'C21': 'enum', 'C22': 'real', 'C23': 'int', 'C24': 'int', 'C25': 'enum', + 'C26': 'enum', 'C27': 'string', 'C28': 'int', 'C29': 'int', 'C30': 'int', 'C31': 'int', + 'C32': 'int', 'C33': 'int', 'C34': 'int', 'C35': 'enum', 'C36': 'int', 'C37': 'string', + 'C38': 'int', 'C39': 'string', 'C40': 'int', 'C41': 'string', 'C42': 'string', 'C43': 'real', + 'C44': 'int', 'C45': 'string', 'C46': 'int', 'C47': 'real', 'C48': 'real', 'C49': 'int', 'C50': 'int'} + # import file + h2oData = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv")) # no col_types specification + h2oTypes = h2oData.types + pyunit_utils.equal_two_dicts_string(h2oOriginalTypes, h2oTypes) + + h2oTypes = h2oData.types # these changes needs force_col_types = True + h2oTypes["C2"]="real" + h2oTypes["C48"]="int" + h2oData4 = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv"), col_types=h2oTypes, + force_col_types=True, skipped_columns=[0,4,5]) + assert h2oData4.ncol == (h2oData.ncol - 3), "Expected number of columns: {0}, Actual: {1}".format(h2oData.ncol - 3, + h2oData4.ncol) + pyunit_utils.compare_frames_local(h2oData["C2"], h2oData4["C2"], prob=1) # change from int column to real columns should be fine + # change from real to int will generate columns with different values due to rounding + pyunit_utils.compare_frames_local(h2oData["C48"], h2oData4["C48"], prob=1, tol=0.5) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_force_col_types) +else: + test_force_col_types() diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py b/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py new file mode 100644 index 000000000000..0e0037f4bf4c --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_GH_15860_parquet_skipped_columns.py @@ -0,0 +1,20 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + +def test_parquet_column_types_skipped_columns(): + parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet")) + parquetForce = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"), + force_col_types=True, skipped_columns=[0]) + assert parquetForce.types["uniform_col"]=="real", "Expected type: {0}, actual: " \ + "{1}".format("real", parquetForce.types["uniform_col"]) + assert parquet.ncol-1==parquetForce.ncol, "Expected column number: {0}, actual: " \ + "{1}".format(parquet.ncol-1, parquetForce.ncol) + pyunit_utils.compare_frames_local(parquet[1], parquetForce[0], prob=0.1, tol=1e-12) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_parquet_column_types_skipped_columns) +else: + test_parquet_column_types_skipped_columns()