Drop missings from fake categories in align categories (#419)

* dropna in align_categories.
* Add test.
* Changelog.

Co-authored-by: Florian Jetter <[email protected]>
JDASoftwareGroup · Feb 23, 2021 · e882c74
1 parent 243a4cc
commit e882c74
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 1 deletion.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -6,8 +6,11 @@ Version 3.19.1 (2021-02-XX)
 ===========================
 
 * Allow ``pyarrow==3`` as a dependency.
+* Fix a bug in :func:`~kartothek.io_components.utils.align_categories` for dataframes
+  with missings and of non-categorical dtype.
 * Fix an issue with the cube index validation introduced in v3.19.0 (#413).
 
+
 Version 3.19.0 (2021-02-12)
 ===========================
 

diff --git a/kartothek/io_components/utils.py b/kartothek/io_components/utils.py
@@ -344,7 +344,7 @@ def align_categories(dfs, categoricals):
         for ix, df in enumerate(dfs):
             ser = df[column]
             if not pd.api.types.is_categorical_dtype(ser):
-                cats = ser.unique()
+                cats = ser.dropna().unique()
                 LOGGER.info(
                     "Encountered non-categorical type where categorical was expected\n"
                     "Found at index position {ix} for column {col}\n"

diff --git a/tests/io_components/test_utils.py b/tests/io_components/test_utils.py
@@ -1,6 +1,7 @@
 from functools import partial
 from typing import Callable
 
+import numpy as np
 import pandas as pd
 import pandas.testing as pdt
 import pyarrow as pa
@@ -202,6 +203,20 @@ def test_align_categories():
         pdt.assert_series_equal(out_dfs[2][col_name], expected_3)
 
 
+def test_align_categories_with_missings():
+    df_0 = pd.DataFrame({"letters": ["a", "a", "b", np.nan]})
+    df_1 = pd.DataFrame({"letters": ["a", "a"]})
+    out = align_categories([df_0, df_1], ["letters"])
+    expected_0 = pd.DataFrame(
+        {"letters": pd.Categorical(["a", "a", "b", np.nan], categories=["a", "b"])}
+    )
+    expected_1 = pd.DataFrame(
+        {"letters": pd.Categorical(["a", "a"], categories=["a", "b"])}
+    )
+    pdt.assert_frame_equal(out[0], expected_0)
+    pdt.assert_frame_equal(out[1], expected_1)
+
+
 def test_sort_cateogrical():
     values = ["f", "a", "b", "z", "e"]
     categories = ["e", "z", "b", "a", "f"]