Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/rel-3.44.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
h2o-ops committed Oct 30, 2023
2 parents 3d1e7b1 + 936756b commit eb6e6e7
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 5 deletions.
16 changes: 13 additions & 3 deletions h2o-core/src/main/java/water/parser/ParseDataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
Expand Down Expand Up @@ -412,11 +414,19 @@ public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) {
String parseType = setup.getParseType().name();
String[] originalColumnTypes = "PARQUET".equals(parseType) ? setup.getParquetColumnTypes()
: setup.getOrigColumnTypes();
if (originalColumnTypes != null) {
final int[] skippedColumns = setup.getSkippedColumns();
String[] newColumnTypes;
if (skippedColumns != null) { // need to remove column types of skipped columns
Set<Integer> skippedColIndices = Arrays.stream(skippedColumns).boxed().collect(Collectors.toSet());
newColumnTypes = IntStream.range(0, originalColumnTypes.length).filter(x -> !(skippedColIndices.contains(x))).mapToObj(x -> originalColumnTypes[x]).toArray(String[]::new);
} else {
newColumnTypes = originalColumnTypes;
}
if (newColumnTypes != null) {
if ("PARQUET".equals(parseType)) // force change the column types specified by user
forceChangeColumnTypesParquet(fr, originalColumnTypes);
forceChangeColumnTypesParquet(fr, newColumnTypes);
else
forceChangeColumnTypes(fr, originalColumnTypes);
forceChangeColumnTypes(fr, newColumnTypes);
}
}

Expand Down
4 changes: 2 additions & 2 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,8 +1980,8 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
os.remove(fileName)
os.rmdir(tmpdir)
elif not(can_use_datatable()):
warnings("multi_thread mode can only be used when you have datatable "
"installed.")
warnings.warn("multi_thread mode can only be used when you have datatable "
"installed. Defaults to single-thread operation.")
return pandas.read_csv(StringIO(self.get_frame_data()), low_memory=False, skip_blank_lines=False)
from h2o.utils.csv.readers import reader
frame = [row for row in reader(StringIO(self.get_frame_data()))]
Expand Down
3 changes: 3 additions & 0 deletions h2o-py/tests/pyunit_utils/utilsPY.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,9 @@ def javamunge(assembly, pojoname, test, compile_only=False):
def install(package):
    """Install ``package`` with pip, run through the current Python interpreter.

    :param package: name of the package handed to ``pip install``.
    :raises subprocess.CalledProcessError: if the pip process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

def uninstall(package):
    """Uninstall ``package`` with pip, run through the current Python interpreter.

    The ``-y`` flag is passed so pip does not stop to ask for confirmation.

    :param package: name of the package handed to ``pip uninstall``.
    :raises subprocess.CalledProcessError: if the pip process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "uninstall", "-y", package]
    subprocess.check_call(pip_command)

def locate(path):
"""
Search for a relative path and turn it into an absolute path.
Expand Down
28 changes: 28 additions & 0 deletions h2o-py/tests/testdir_misc/pyunit_gh_15861_no_datatable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_datatable)

def test_datatable_without_datatable():
    """Check that ``as_data_frame(multi_thread=True)`` warns and completes without datatable.

    If ``can_use_datatable()`` reports that datatable is usable, the package is
    uninstalled before the conversion and re-installed afterwards, even when the
    test body fails.
    """
    expected_warning = ("multi_thread mode can only be used when you have datatable installed. "
                        "Defaults to single-thread operation.")
    delTable = False
    if can_use_datatable():
        delTable = True
        pyunit_utils.uninstall("datatable")

    try:
        # should run to completion
        with pyunit_utils.catch_warnings() as ws:
            h2oFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"))
            h2oFrame.as_data_frame(multi_thread=True)
            # Look through every captured warning instead of only the first one:
            # an unrelated warning emitted earlier must not fail the test, and an
            # empty capture should fail with a readable message, not an IndexError.
            captured = [str(w.message) for w in ws]
            assert any(expected_warning in message for message in captured), \
                "Expected warning not found. Captured warnings: {0}".format(captured)
    finally:
        # re-install datatable before quitting.
        if delTable:
            pyunit_utils.install("datatable")

# On import the test is called directly; when executed as a script it is
# handed to pyunit_utils.standalone_test instead.
if __name__ != "__main__":
    test_datatable_without_datatable()
else:
    pyunit_utils.standalone_test(test_datatable_without_datatable)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils

# test and make sure that the force_col_types works
def test_force_col_types():
    """Exercise ``force_col_types=True`` together with ``skipped_columns`` on a CSV import.

    The synthetic dataset is first imported without column types and the
    inferred types are compared with the expected ones. It is then imported
    again with C2 requested as real and C48 requested as int while skipping
    columns 0, 4 and 5; the result must have three fewer columns and its C2 and
    C48 columns are compared against the first import.
    """
    expected_types = {
        'C1': 'real', 'C2': 'int', 'C3': 'int', 'C4': 'int', 'C5': 'int',
        'C6': 'string', 'C7': 'real', 'C8': 'string', 'C9': 'real', 'C10': 'real',
        'C11': 'enum', 'C12': 'int', 'C13': 'int', 'C14': 'int', 'C15': 'int',
        'C16': 'enum', 'C17': 'real', 'C18': 'real', 'C19': 'enum', 'C20': 'enum',
        'C21': 'enum', 'C22': 'real', 'C23': 'int', 'C24': 'int', 'C25': 'enum',
        'C26': 'enum', 'C27': 'string', 'C28': 'int', 'C29': 'int', 'C30': 'int',
        'C31': 'int', 'C32': 'int', 'C33': 'int', 'C34': 'int', 'C35': 'enum',
        'C36': 'int', 'C37': 'string', 'C38': 'int', 'C39': 'string', 'C40': 'int',
        'C41': 'string', 'C42': 'string', 'C43': 'real', 'C44': 'int', 'C45': 'string',
        'C46': 'int', 'C47': 'real', 'C48': 'real', 'C49': 'int', 'C50': 'int',
    }

    # First import: no col_types specification.
    baseline = h2o.import_file(pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv"))
    inferred_types = baseline.types
    pyunit_utils.equal_two_dicts_string(expected_types, inferred_types)

    # Second import: these type changes need force_col_types=True.
    requested_types = baseline.types
    requested_types["C2"] = "real"
    requested_types["C48"] = "int"
    forced = h2o.import_file(
        pyunit_utils.locate("smalldata/parser/synthetic_dataset.csv"),
        col_types=requested_types,
        force_col_types=True,
        skipped_columns=[0, 4, 5],
    )

    assert forced.ncol == (baseline.ncol - 3), \
        "Expected number of columns: {0}, Actual: {1}".format(baseline.ncol - 3, forced.ncol)

    # Changing an int column to a real column should leave the values intact.
    pyunit_utils.compare_frames_local(baseline["C2"], forced["C2"], prob=1)
    # Changing real to int yields different values due to rounding, hence the tolerance.
    pyunit_utils.compare_frames_local(baseline["C48"], forced["C48"], prob=1, tol=0.5)


# On import the test is called directly; when executed as a script it is
# handed to pyunit_utils.standalone_test instead.
if __name__ != "__main__":
    test_force_col_types()
else:
    pyunit_utils.standalone_test(test_force_col_types)
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils

def test_parquet_column_types_skipped_columns():
    """Import a parquet file with ``force_col_types=True`` while skipping column 0.

    The forced import must report ``uniform_col`` as ``real``, have exactly one
    column fewer than the plain import, and its first column is compared with
    the second column of the plain import.
    """
    plain = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"))
    forced = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/parquet/df.parquet"),
        force_col_types=True,
        skipped_columns=[0],
    )

    assert forced.types["uniform_col"] == "real", \
        "Expected type: {0}, actual: {1}".format("real", forced.types["uniform_col"])
    assert plain.ncol - 1 == forced.ncol, \
        "Expected column number: {0}, actual: {1}".format(plain.ncol - 1, forced.ncol)

    # Column 0 was skipped, so column 1 of the plain frame lines up with column 0 here.
    pyunit_utils.compare_frames_local(plain[1], forced[0], prob=0.1, tol=1e-12)


# On import the test is called directly; when executed as a script it is
# handed to pyunit_utils.standalone_test instead.
if __name__ != "__main__":
    test_parquet_column_types_skipped_columns()
else:
    pyunit_utils.standalone_test(test_parquet_column_types_skipped_columns)

0 comments on commit eb6e6e7

Please sign in to comment.