diff --git a/CHANGES.rst b/CHANGES.rst index 75dff837..59a099a0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,8 +6,11 @@ Version 3.19.1 (2021-02-XX) =========================== * Allow ``pyarrow==3`` as a dependency. +* Fix a bug in :func:`~kartothek.io_components.utils.align_categories` for dataframes + that contain missing values in columns of non-categorical dtype. * Fix an issue with the cube index validation introduced in v3.19.0 (#413). + Version 3.19.0 (2021-02-12) =========================== diff --git a/kartothek/io_components/utils.py b/kartothek/io_components/utils.py index e21a43d3..c7ea8085 100644 --- a/kartothek/io_components/utils.py +++ b/kartothek/io_components/utils.py @@ -344,7 +344,7 @@ def align_categories(dfs, categoricals): for ix, df in enumerate(dfs): ser = df[column] if not pd.api.types.is_categorical_dtype(ser): - cats = ser.unique() + cats = ser.dropna().unique() LOGGER.info( "Encountered non-categorical type where categorical was expected\n" "Found at index position {ix} for column {col}\n" diff --git a/tests/io_components/test_utils.py b/tests/io_components/test_utils.py index 34865184..f17fc5b6 100644 --- a/tests/io_components/test_utils.py +++ b/tests/io_components/test_utils.py @@ -1,6 +1,7 @@ from functools import partial from typing import Callable +import numpy as np import pandas as pd import pandas.testing as pdt import pyarrow as pa @@ -202,6 +203,20 @@ def test_align_categories(): pdt.assert_series_equal(out_dfs[2][col_name], expected_3) +def test_align_categories_with_missings(): + df_0 = pd.DataFrame({"letters": ["a", "a", "b", np.nan]}) + df_1 = pd.DataFrame({"letters": ["a", "a"]}) + out = align_categories([df_0, df_1], ["letters"]) + expected_0 = pd.DataFrame( + {"letters": pd.Categorical(["a", "a", "b", np.nan], categories=["a", "b"])} + ) + expected_1 = pd.DataFrame( + {"letters": pd.Categorical(["a", "a"], categories=["a", "b"])} + ) + pdt.assert_frame_equal(out[0], expected_0) + pdt.assert_frame_equal(out[1], expected_1) + + def 
test_sort_cateogrical(): values = ["f", "a", "b", "z", "e"] categories = ["e", "z", "b", "a", "f"]