
Commit

correctly converts dict arrow types into dlt types
rudolfix committed Dec 22, 2024
1 parent 562f805 commit 010412e
Showing 2 changed files with 16 additions and 0 deletions.
4 changes: 4 additions & 0 deletions dlt/common/libs/pyarrow.py
@@ -183,6 +183,10 @@ def get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType:
        return dict(data_type="decimal", precision=dtype.precision, scale=dtype.scale)
    elif pyarrow.types.is_nested(dtype):
        return dict(data_type="json")
    elif pyarrow.types.is_dictionary(dtype):
        # Dictionary types are essentially categorical encodings. The underlying value_type
        # dictates the "logical" type. We simply delegate to the underlying value_type.
        return get_column_type_from_py_arrow(dtype.value_type)
    else:
        raise ValueError(dtype)

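For context, a minimal sketch (not part of the commit) of what the new branch enables, assuming get_column_type_from_py_arrow is imported from dlt.common.libs.pyarrow; before this change a dictionary type fell through to the final ValueError branch:

import pyarrow as pa

from dlt.common.libs.pyarrow import get_column_type_from_py_arrow

# A dictionary-encoded string column: the int8 indices are only the categorical
# encoding; the value_type (string) carries the logical type.
dict_type = pa.dictionary(pa.int8(), pa.string())

# The new branch delegates to dict_type.value_type, so this resolves to the same
# dlt column type as a plain pa.string() column ("text", per the test below).
print(get_column_type_from_py_arrow(dict_type))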
12 changes: 12 additions & 0 deletions tests/libs/pyarrow/test_pyarrow.py
@@ -66,6 +66,18 @@ def test_py_arrow_to_table_schema_columns():
    assert result == dlt_schema


def test_py_arrow_dict_to_column() -> None:
    array_1 = pa.array(["a", "b", "c"], type=pa.dictionary(pa.int8(), pa.string()))
    array_2 = pa.array([1, 2, 3], type=pa.dictionary(pa.int8(), pa.int64()))
    table = pa.table({"strings": array_1, "ints": array_2})
    columns = py_arrow_to_table_schema_columns(table.schema)
    assert columns == {
        "strings": {"name": "strings", "nullable": True, "data_type": "text"},
        "ints": {"name": "ints", "nullable": True, "data_type": "bigint"},
    }
    assert table.to_pydict() == {"strings": ["a", "b", "c"], "ints": [1, 2, 3]}


def test_to_arrow_scalar() -> None:
    naive_dt = get_py_arrow_timestamp(6, tz=None)
    # print(naive_dt)
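A short usage note (an illustration, not part of the commit): dictionary types commonly appear when pandas categorical columns are converted to Arrow, so schema inference on such tables can now map them to dlt types instead of raising ValueError:

import pandas as pd
import pyarrow as pa

# pandas categorical columns become Arrow dictionary-encoded columns
df = pd.DataFrame({"color": pd.Categorical(["red", "green", "red"])})
table = pa.Table.from_pandas(df)

# the field arrives as a dictionary type; its value_type is string
assert pa.types.is_dictionary(table.schema.field("color").type)
print(table.schema.field("color").type.value_type)  # string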
