Merge pull request #191 from fjetter/bugfix/ktk_hash_bucket_in_schema

Remove _KTK_HASH_BUCKET if it exists
JDASoftwareGroup · Dec 17, 2019 · cec1cb1 · cec1cb1
2 parents 08a3aec + 43be3c6
commit cec1cb1
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 3 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -2,13 +2,18 @@
 Changelog
 =========
 
-Version Unreleased
-==================
+Version 3.6.2 (2019-12-17)
+==========================
 
 Improvements
 ^^^^^^^^^^^^
 
-- Add more explicit typing to :mod:`kartothek.io.eager`.
+* Add more explicit typing to :mod:`kartothek.io.eager`.
+
+Bug fixes
+^^^^^^^^^
+* Fix an issue where :func:`~kartothek.io.dask.dataframe.update_dataset_from_ddf` would create a column named ``_KTK_HASH_BUCKET`` in the dataset.
+
 
 Version 3.6.1 (2019-12-11)
 ==========================

diff --git a/kartothek/io/dask/_update.py b/kartothek/io/dask/_update.py
@@ -126,6 +126,8 @@ def _store_partition(
     df_serializer,
     metadata_version,
 ):
+    if _KTK_HASH_BUCKET in df:
+        df = df.drop(_KTK_HASH_BUCKET, axis=1)
     store = store_factory()
     # I don't have access to the group values
     mps = parse_input_to_metapartition(

diff --git a/tests/io/dask/dataframe/test_update.py b/tests/io/dask/dataframe/test_update.py
@@ -171,6 +171,12 @@ def test_update_shuffle_buckets(
         range(unique_secondaries)
     )
 
+    assert set(dataset.table_meta["core"].names) == {
+        "primary",
+        "secondary",
+        "sorted_column",
+    }
+
     factory = DatasetFactory("output_dataset_uuid", store_factory)
     factory.load_all_indices()