Skip to content

Commit

Permalink
Drop missings from fake categories in align categories (#419)
Browse files Browse the repository at this point in the history
* dropna in align_categories.

* Add test.

* Changelog.

Co-authored-by: Florian Jetter <[email protected]>
  • Loading branch information
mlondschien and fjetter authored Feb 23, 2021
1 parent 243a4cc commit e882c74
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ Version 3.19.1 (2021-02-XX)
===========================

* Allow ``pyarrow==3`` as a dependency.
* Fix a bug in :func:`~kartothek.io_components.utils.align_categories` for dataframes
of non-categorical dtype that contain missing values.
* Fix an issue with the cube index validation introduced in v3.19.0 (#413).


Version 3.19.0 (2021-02-12)
===========================

Expand Down
2 changes: 1 addition & 1 deletion kartothek/io_components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ def align_categories(dfs, categoricals):
for ix, df in enumerate(dfs):
ser = df[column]
if not pd.api.types.is_categorical_dtype(ser):
cats = ser.unique()
cats = ser.dropna().unique()
LOGGER.info(
"Encountered non-categorical type where categorical was expected\n"
"Found at index position {ix} for column {col}\n"
Expand Down
15 changes: 15 additions & 0 deletions tests/io_components/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from functools import partial
from typing import Callable

import numpy as np
import pandas as pd
import pandas.testing as pdt
import pyarrow as pa
Expand Down Expand Up @@ -202,6 +203,20 @@ def test_align_categories():
pdt.assert_series_equal(out_dfs[2][col_name], expected_3)


def test_align_categories_with_missings():
    """Check that ``align_categories`` handles NaN in a non-categorical column.

    The first input frame holds plain (non-categorical) values including a
    NaN; the second holds only ``"a"``. After alignment both frames are
    expected to carry a categorical ``letters`` column with the categories
    ``["a", "b"]``, and the NaN entry is expected to remain a missing value
    rather than showing up as a category.
    """
    shared_categories = ["a", "b"]
    raw_with_missing = ["a", "a", "b", np.nan]
    raw_without_missing = ["a", "a"]

    aligned = align_categories(
        [
            pd.DataFrame({"letters": raw_with_missing}),
            pd.DataFrame({"letters": raw_without_missing}),
        ],
        ["letters"],
    )

    # Build the expected frames from the same raw values, only the dtype differs.
    expected_frames = [
        pd.DataFrame(
            {"letters": pd.Categorical(values, categories=shared_categories)}
        )
        for values in (raw_with_missing, raw_without_missing)
    ]

    pdt.assert_frame_equal(aligned[0], expected_frames[0])
    pdt.assert_frame_equal(aligned[1], expected_frames[1])


def test_sort_cateogrical():
values = ["f", "a", "b", "z", "e"]
categories = ["e", "z", "b", "a", "f"]
Expand Down

0 comments on commit e882c74

Please sign in to comment.