Skip to content

Commit

Permalink
refactor(dtype): automatically cast mixed string and date columns to strings (#245)
Browse files Browse the repository at this point in the history

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Jul 1, 2024
1 parent 71133d3 commit f962672
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
Binary file modified python/tests/fixtures/fixture-multi-dtypes-columns.xlsx
Binary file not shown.
21 changes: 19 additions & 2 deletions python/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def expected_data() -> dict[str, list[Any]]:
"Date": [datetime(2023, 7, 21)] * 9,
"Details": ["Healthcare"] * 7 + ["Something"] * 2,
"Asset ID": ["84444"] * 7 + ["ABC123"] * 2,
"Mixed dates": ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3,
}


Expand Down Expand Up @@ -89,13 +90,29 @@ def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[A
None,
]
expected_data["Asset ID"] = [84444.0] * 7 + [None] * 2
expected_data["Mixed dates"] = [datetime(2023, 7, 21)] * 6 + [None] * 3

pd_df = sheet.to_pandas()
pd_assert_frame_equal(pd_df, pd.DataFrame(expected_data).astype({"Date": "datetime64[ms]"}))
pd_assert_frame_equal(
pd_df,
pd.DataFrame(expected_data).astype(
{
"Date": "datetime64[ms]",
"Mixed dates": "datetime64[ms]",
}
),
)

pl_df = sheet.to_polars()
pl_assert_frame_equal(
pl_df, pl.DataFrame(expected_data, schema_overrides={"Date": pl.Datetime(time_unit="ms")})
pl_df,
pl.DataFrame(
expected_data,
schema_overrides={
"Date": pl.Datetime(time_unit="ms"),
"Mixed dates": pl.Datetime(time_unit="ms"),
},
),
)


Expand Down
10 changes: 9 additions & 1 deletion src/types/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,15 @@ fn int_types() -> &'static HashSet<DType> {
}

fn string_types() -> &'static HashSet<DType> {
STRING_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::String]))
STRING_TYPES_CELL.get_or_init(|| {
HashSet::from([
DType::Int,
DType::Float,
DType::String,
DType::DateTime,
DType::Date,
])
})
}

pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
Expand Down

0 comments on commit f962672

Please sign in to comment.