From 010412e21bf21034e9f9e3e86b5f584f23a38a79 Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Sun, 22 Dec 2024 20:36:24 +0100
Subject: [PATCH] correctly converts dict arrow types into dlt types

---
 dlt/common/libs/pyarrow.py         |  4 ++++
 tests/libs/pyarrow/test_pyarrow.py | 12 ++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py
index 029cd75399..255fcd344e 100644
--- a/dlt/common/libs/pyarrow.py
+++ b/dlt/common/libs/pyarrow.py
@@ -183,6 +183,10 @@ def get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType:
         return dict(data_type="decimal", precision=dtype.precision, scale=dtype.scale)
     elif pyarrow.types.is_nested(dtype):
         return dict(data_type="json")
+    elif pyarrow.types.is_dictionary(dtype):
+        # Dictionary types are essentially categorical encodings. The underlying value_type
+        # dictates the "logical" type. We simply delegate to the underlying value_type.
+        return get_column_type_from_py_arrow(dtype.value_type)
     else:
         raise ValueError(dtype)
 
diff --git a/tests/libs/pyarrow/test_pyarrow.py b/tests/libs/pyarrow/test_pyarrow.py
index f81b3d1b99..07e8d3428d 100644
--- a/tests/libs/pyarrow/test_pyarrow.py
+++ b/tests/libs/pyarrow/test_pyarrow.py
@@ -66,6 +66,18 @@ def test_py_arrow_to_table_schema_columns():
     assert result == dlt_schema
 
 
+def test_py_arrow_dict_to_column() -> None:
+    array_1 = pa.array(["a", "b", "c"], type=pa.dictionary(pa.int8(), pa.string()))
+    array_2 = pa.array([1, 2, 3], type=pa.dictionary(pa.int8(), pa.int64()))
+    table = pa.table({"strings": array_1, "ints": array_2})
+    columns = py_arrow_to_table_schema_columns(table.schema)
+    assert columns == {
+        "strings": {"name": "strings", "nullable": True, "data_type": "text"},
+        "ints": {"name": "ints", "nullable": True, "data_type": "bigint"},
+    }
+    assert table.to_pydict() == {"strings": ["a", "b", "c"], "ints": [1, 2, 3]}
+
+
 def test_to_arrow_scalar() -> None:
     naive_dt = get_py_arrow_timestamp(6, tz=None)
     # print(naive_dt)