From 4d20267393d44a31a895077f81f52f1f730557c4 Mon Sep 17 00:00:00 2001
From: Ilia Zaitcev
Date: Tue, 25 May 2021 13:13:50 +0200
Subject: [PATCH] Rollback to v3.20.0 (#471)

Revert "Bump codecov/codecov-action from v1.4.1 to v1.5.0 (#466)"

This reverts commit fdc9779194e27ead482380d84f4a67d497d40dc7.

Revert "fix mistakes in documentation"

This reverts commit 4e4b5e0f827d8b303085ef0dba6ca57a6cb3b3dc.

Revert "Bump pre-commit/action from v2.0.0 to v2.0.3 (#460)"

This reverts commit d027ca2beda1182594276fb23315203421ad5ac6.

Revert "Bump codecov/codecov-action from v1.4.0 to v1.4.1 (#461)"

This reverts commit 97cd553f0a6e3bab1e02cae79f89216cb33fe0f5.

Revert "Bump codecov/codecov-action from v1.3.1 to v1.4.0 (#458)"

This reverts commit e48d67a51e1e2facfef9039b0ce18090ec917ba1.

Revert "Fix bug when loading few columns of a dataset with many primary indices (#446)"

This reverts commit 90ee486b73b0de84b7e69f97f8246446ba6001e2.

Revert "Prepare release 4.0.1"

This reverts commit b278503756e0077ff646112f310180916ac08474.

Revert "Fix tests for dask dataframe and delayed backends"

This reverts commit 5520f74002bd3d1dba01161220846f22da4bd146.

Revert "Add end-to-end regression test"

This reverts commit 8a3e6ae80578bcdc26cafc6445d8cb609df64252.

Revert "Fix dataset corruption after updates (#445)"

This reverts commit a26e840159c3dcf63efe1be1742051c108833966.

Revert "Set release date for 4.0"

This reverts commit 08a809429cba9f46c4c0d09853eb476021764708.

Revert "Return dask scalar for store and update from ddf (#437)"

This reverts commit 494732d891d758d5b3d58cb4ad119405f9a1b844.

Revert "Add tests for non-default table (#440)"

This reverts commit 3807a022c9f156a3d576881e08405109adc69233.

Revert "Bump codecov/codecov-action from v1.2.2 to v1.3.1 (#441)"

This reverts commit f7615ecab64e40a68d7d15d1b22fc877ce079ec3.

Revert "Set default for dates_as_object to True (#436)"

This reverts commit 75ffdb5664f242b1bfe2ba8d69de280a664684af.

Revert "Remove inferred indices (#438)"

This reverts commit b1e2535676c2a6289dc833dd77640e7929f7d3cc.

Revert "fix typo: 'KTK_CUBE_UUID_SEPERATOR' -> 'KTK_CUBE_UUID_SEPARATOR' (#422)"

This reverts commit b349ceed6a27539110860b67b6f1f2285436d9b2.

Revert "Remove all deprecated arguments (#434)"

This reverts commit 74f079081ed22ef9208e9469c652772413824c69.

Revert "Remove multi table feature (#431)"

This reverts commit 032856a0a62af7696aa20386524b72dac29646c9.
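Because this rollback restores the pre-4.x multi-table API (most visibly the
explicit ``table=`` argument of ``read_table`` that the 4.x line dropped),
downstream code that has to run against both release lines may want to branch
on the installed kartothek version. The snippet below is only an illustrative
sketch: ``store_url`` and ``dataset_uuid`` are placeholders, the default table
name ``"table"`` is assumed, and the version lookup uses the standard-library
``importlib.metadata`` (Python 3.8+).

    # Sketch: pick the read_table signature matching the installed release line.
    from importlib.metadata import version

    from kartothek.io.eager import read_table


    def load_default_table(store_url, dataset_uuid):
        ktk_major = int(version("kartothek").split(".")[0])
        if ktk_major == 4:
            # Only the 4.x line is single-tabled and takes no table argument.
            return read_table(dataset_uuid=dataset_uuid, store=store_url)
        # 3.20.x and the 5.0 rollback keep the explicit table argument.
        return read_table(dataset_uuid=dataset_uuid, store=store_url, table="table")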
---
 .github/workflows/ci-pre-commit.yml | 2 +-
 .github/workflows/ci.yml | 2 +-
 CHANGES.rst | 12 +
 asv_bench/benchmarks/index.py | 4 +-
 asv_bench/benchmarks/metapartition.py | 6 +-
 asv_bench/benchmarks/write.py | 16 +-
 docs/conf.py | 5 +
 docs/guide/examples.rst | 58 +-
 docs/guide/getting_started.rst | 84 +-
 docs/guide/mutating_datasets.rst | 85 +-
 docs/guide/partitioning.rst | 39 +-
 docs/spec/indexing.rst | 4 +-
 kartothek/api/consistency.py | 4 +-
 kartothek/api/discover.py | 6 +-
 kartothek/cli/_info.py | 7 +-
 kartothek/cli/_query.py | 3 +-
 kartothek/core/common_metadata.py | 10 +-
 kartothek/core/cube/constants.py | 6 +-
 kartothek/core/cube/cube.py | 8 +-
 kartothek/core/dataset.py | 218 ++--
 kartothek/core/docs.py | 48 +-
 kartothek/core/factory.py | 16 +-
 kartothek/core/naming.py | 3 -
 kartothek/core/partition.py | 2 -
 kartothek/core/urlencode.py | 6 +-
 kartothek/io/dask/_shuffle.py | 7 +-
 kartothek/io/dask/_utils.py | 24 +-
 kartothek/io/dask/bag.py | 87 +-
 kartothek/io/dask/common_cube.py | 17 +-
 kartothek/io/dask/dataframe.py | 165 +--
 kartothek/io/dask/delayed.py | 247 +++-
 kartothek/io/eager.py | 246 +++-
 kartothek/io/iter.py | 68 +-
 kartothek/io/testing/build_cube.py | 100 +-
 kartothek/io/testing/cleanup_cube.py | 4 +-
 kartothek/io/testing/extend_cube.py | 2 +-
 kartothek/io/testing/index.py | 82 +-
 kartothek/io/testing/merge.py | 91 ++
 kartothek/io/testing/read.py | 308 ++++-
 kartothek/io/testing/update.py | 317 +++--
 kartothek/io/testing/utils.py | 20 +-
 kartothek/io/testing/write.py | 457 +++++--
 kartothek/io_components/cube/cleanup.py | 4 +-
 .../io_components/cube/query/__init__.py | 3 +-
 kartothek/io_components/cube/query/_group.py | 7 +-
 .../io_components/cube/query/_intention.py | 4 +-
 .../io_components/cube/query/_regroup.py | 2 +-
 kartothek/io_components/cube/write.py | 20 +-
 kartothek/io_components/gc.py | 5 +-
 kartothek/io_components/merge.py | 120 ++
 kartothek/io_components/metapartition.py | 1089 ++++++++++++-----
 kartothek/io_components/read.py | 79 +-
 kartothek/io_components/utils.py | 100 +-
 kartothek/io_components/write.py | 131 +-
 kartothek/utils/ktk_adapters.py | 41 +-
 tests/api/test_discover.py | 119 +-
 tests/conftest.py | 46 +-
 tests/core/cube/test_constants.py | 4 +-
 tests/core/test_builder.py | 10 +-
 tests/core/test_dataset_dyn_part.py | 32 +-
 tests/core/test_dataset_explicit_part.py | 34 +-
 tests/core/test_docs.py | 11 +-
 tests/io/dask/bag/test_read.py | 5 +-
 tests/io/dask/dataframe/test_read.py | 4 +-
 tests/io/dask/dataframe/test_shuffle.py | 5 +-
 tests/io/dask/dataframe/test_stats.py | 95 +-
 tests/io/dask/dataframe/test_update.py | 42 +-
 tests/io/dask/delayed/test_merge.py | 19 +
 tests/io/dask/delayed/test_read.py | 21 +-
 tests/io/dask/test_common_cube.py | 16 +-
 tests/io/eager/test_commit.py | 167 ++-
 tests/io/eager/test_read.py | 65 +-
 tests/io/eager/test_update.py | 4 +-
 tests/io/eager/test_write.py | 137 ++-
 tests/io/iter/test_read.py | 4 +-
 tests/io_components/test_dataset.py | 2 +-
 tests/io_components/test_metapartition.py | 799 +++++++++--
 tests/io_components/test_mutate.py | 234 ++++
 tests/io_components/test_read.py | 67 +-
 tests/io_components/test_write.py | 14 +-
 tests/utils/test_ktk_adapters.py | 11 +-
 81 files changed, 4888 insertions(+), 1580 deletions(-)
 create mode 100644 kartothek/io/testing/merge.py
 create mode 100644 kartothek/io_components/merge.py
 create mode 100644 tests/io/dask/delayed/test_merge.py
 create mode 100644 tests/io_components/test_mutate.py

diff --git a/.github/workflows/ci-pre-commit.yml b/.github/workflows/ci-pre-commit.yml
index 7c45cdcb..e55ca393 100644 --- a/.github/workflows/ci-pre-commit.yml +++ b/.github/workflows/ci-pre-commit.yml @@ -8,4 +8,4 @@ jobs: steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 - - uses: pre-commit/action@v2.0.3 + - uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 65beddbc..85924ffc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,7 +135,7 @@ jobs: run: python setup.py sdist bdist_wheel - name: Codecov - uses: codecov/codecov-action@v1.5.0 + uses: codecov/codecov-action@v1.2.2 with: # NOTE: `token` is not required, because the kartothek repo is public file: ./coverage.xml diff --git a/CHANGES.rst b/CHANGES.rst index ffe4e94d..3d86a636 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,11 +2,23 @@ Changelog ========= +Version 5.0.0 (2021-05-xx) +========================== + +This release rolls all the changes introduced with 4.x back to 3.20.0. + +As the incompatibility between 4.0 and 5.0 will be an issue for some customers, we encourage you to use the very stable +kartothek 3.20.0 and not version 4.x. + +Please refer the Issue #471 for further information. + Kartothek 4.0.3 (2021-06-10) ============================ + * Pin dask to not use 2021.5.1 and 2020.6.0 (#475) + Kartothek 4.0.2 (2021-06-07) ============================ diff --git a/asv_bench/benchmarks/index.py b/asv_bench/benchmarks/index.py index 4b6bd52e..75dae7c0 100644 --- a/asv_bench/benchmarks/index.py +++ b/asv_bench/benchmarks/index.py @@ -131,7 +131,9 @@ def setup(self, cardinality, num_values, partitions_to_merge): unique_vals = ["{:010d}".format(n) for n in range(cardinality)] array = [unique_vals[x % len(unique_vals)] for x in range(num_values)] self.df = pd.DataFrame({self.column: array}) - self.mp = MetaPartition(label=self.table, data=self.df, metadata_version=4) + self.mp = MetaPartition( + label=self.table, data={"core": self.df}, metadata_version=4 + ) self.mp_indices = self.mp.build_indices([self.column]) self.merge_indices.append(self.mp_indices) diff --git a/asv_bench/benchmarks/metapartition.py b/asv_bench/benchmarks/metapartition.py index c2bf4856..65e427dc 100644 --- a/asv_bench/benchmarks/metapartition.py +++ b/asv_bench/benchmarks/metapartition.py @@ -33,7 +33,7 @@ def setup(self, num_rows, dtype): self.mp = MetaPartition( label="primary_key={}/base_label".format(dtype[0]), metadata_version=4, - schema=self.schema, + table_meta={"table": self.schema}, ) def time_reconstruct_index(self, num_rows, dtype): @@ -41,6 +41,7 @@ def time_reconstruct_index(self, num_rows, dtype): self.mp._reconstruct_index_columns( df=self.df, key_indices=[("primary_key", str(dtype[1]))], + table="table", columns=None, categories=None, date_as_object=False, @@ -50,7 +51,8 @@ def time_reconstruct_index_categorical(self, num_rows, dtype): self.mp._reconstruct_index_columns( df=self.df, key_indices=[("primary_key", str(dtype[1]))], + table="table", columns=None, - categories="primary_key", + categories={"table": ["primary_key"]}, date_as_object=False, ) diff --git a/asv_bench/benchmarks/write.py b/asv_bench/benchmarks/write.py index 50f0c116..0b04026d 100644 --- a/asv_bench/benchmarks/write.py +++ b/asv_bench/benchmarks/write.py @@ -17,11 +17,12 @@ from .config import AsvBenchmarkConfig -def generate_mp(): +def generate_mp(dataset_metadata=None): return MetaPartition( label=uuid.uuid4().hex, - schema=make_meta(get_dataframe_alltypes(), origin="alltypes"), - file="fakefile", + table_meta={"table": 
make_meta(get_dataframe_alltypes(), origin="alltypes")}, + files={"table": "fakefile"}, + dataset_metadata=dataset_metadata, ) @@ -49,7 +50,8 @@ class TimeStoreDataset(AsvBenchmarkConfig): def setup(self, num_partitions, max_depth, num_leafs): self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp())) - self.partitions = [generate_mp() for _ in range(num_partitions)] + dataset_metadata = generate_metadata(max_depth, num_leafs) + self.partitions = [generate_mp(dataset_metadata) for _ in range(num_partitions)] self.dataset_uuid = "dataset_uuid" self.user_dataset_metadata = {} @@ -68,10 +70,8 @@ class TimePersistMetadata(AsvBenchmarkConfig): def setup(self, num_partitions): self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp())) - self.schemas = [generate_mp().schema for _ in range(num_partitions)] + self.partitions = [generate_mp() for _ in range(num_partitions)] self.dataset_uuid = "dataset_uuid" def time_persist_common_metadata(self, num_partitions): - persist_common_metadata( - self.schemas, None, self.store, self.dataset_uuid, "name" - ) + persist_common_metadata(self.partitions, None, self.store, self.dataset_uuid) diff --git a/docs/conf.py b/docs/conf.py index e133a635..154572ee 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -129,3 +129,8 @@ "kartothek.serialization._generic": "kartothek.serialization", "kartothek.serialization._parquet": "kartothek.serialization", } + +# In particular the deprecation warning in DatasetMetadata.table_schema is +# raising too many warning to handle sensibly using ipython directive pseudo +# decorators. Remove this with 4.X again +ipython_warning_is_error = False diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 73260b20..541cf0b2 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -36,7 +36,7 @@ Setup a store # Load your data # By default the single dataframe is stored in the 'core' table - df_from_store = read_table(store=store_url, dataset_uuid=dataset_uuid) + df_from_store = read_table(store=store_url, dataset_uuid=dataset_uuid, table="table") df_from_store @@ -58,8 +58,14 @@ Write # We'll define two partitions which both have two tables input_list_of_partitions = [ - pd.DataFrame({"A": range(10)}), - pd.DataFrame({"A": range(10, 20)}), + { + "label": "FirstPartition", + "data": [("FirstCategory", pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, + { + "label": "SecondPartition", + "data": [("FirstCategory", pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, ] # The pipeline will return a :class:`~kartothek.core.dataset.DatasetMetadata` object @@ -90,10 +96,17 @@ Read # In case you were using the dataset created in the Write example for d1, d2 in zip( list_of_partitions, - [pd.DataFrame({"A": range(10)}), pd.DataFrame({"A": range(10, 20)}),], + [ + # FirstPartition + {"FirstCategory": pd.DataFrame(), "SecondCategory": pd.DataFrame()}, + # SecondPartition + {"FirstCategory": pd.DataFrame(), "SecondCategory": pd.DataFrame()}, + ], ): - for k1, k2 in zip(d1, d2): - assert k1 == k2 + for kv1, kv2 in zip(d1.items(), d2.items()): + k1, v1 = kv1 + k2, v2 = kv2 + assert k1 == k2 and all(v1 == v2) Iter @@ -107,8 +120,14 @@ Write from kartothek.api.dataset import store_dataframes_as_dataset__iter input_list_of_partitions = [ - pd.DataFrame({"A": range(10)}), - pd.DataFrame({"A": range(10, 20)}), + { + "label": "FirstPartition", + "data": [("FirstCategory", pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, + { + "label": "SecondPartition", + "data": [("FirstCategory", 
pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, ] # The pipeline will return a :class:`~kartothek.core.dataset.DatasetMetadata` object @@ -141,10 +160,17 @@ Read # In case you were using the dataset created in the Write example for d1, d2 in zip( list_of_partitions, - [pd.DataFrame({"A": range(10)}), pd.DataFrame({"A": range(10, 20)}),], + [ + # FirstPartition + {"FirstCategory": pd.DataFrame(), "SecondCategory": pd.DataFrame()}, + # SecondPartition + {"FirstCategory": pd.DataFrame(), "SecondCategory": pd.DataFrame()}, + ], ): - for k1, k2 in zip(d1, d2): - assert k1 == k2 + for kv1, kv2 in zip(d1.items(), d2.items()): + k1, v1 = kv1 + k2, v2 = kv2 + assert k1 == k2 and all(v1 == v2) Dask ```` @@ -158,8 +184,14 @@ Write from kartothek.api.dataset import store_delayed_as_dataset input_list_of_partitions = [ - pd.DataFrame({"A": range(10)}), - pd.DataFrame({"A": range(10, 20)}), + { + "label": "FirstPartition", + "data": [("FirstCategory", pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, + { + "label": "SecondPartition", + "data": [("FirstCategory", pd.DataFrame()), ("SecondCategory", pd.DataFrame())], + }, ] # This will return a :class:`~dask.delayed`. The figure below diff --git a/docs/guide/getting_started.rst b/docs/guide/getting_started.rst index 8a80b1ba..1927ad4d 100644 --- a/docs/guide/getting_started.rst +++ b/docs/guide/getting_started.rst @@ -5,6 +5,10 @@ Getting Started =============== +Kartothek manages datasets that consist of files that contain tables. It does so by offering +a metadata definition to handle these datasets efficiently. + +Datasets in Kartothek are made up of one or more ``tables``, each with a unique schema. When working with Kartothek tables as a Python user, we will use :class:`~pandas.DataFrame` as the user-facing type. @@ -127,25 +131,33 @@ This class holds information about the structure and schema of the dataset. .. ipython:: python - dm.table_name + dm.tables sorted(dm.partitions.keys()) - dm.schema.remove_metadata() + dm.table_meta["table"].remove_metadata() # Arrow schema + +For this guide, two attributes that are noteworthy are ``tables`` and ``partitions``: -For this guide we want to take a closer look at the ``partitions`` attribute. -``partitions`` are the physical "pieces" of data which together constitute the -contents of a dataset. Data is written to storage on a per-partition basis. See -the section on partitioning for further details: :ref:`partitioning_section`. +- Each dataset has one or more ``tables``, where each table is a logical collection of data, + bound together by a common schema. +- ``partitions`` are the physical "pieces" of data which together constitute the + contents of a dataset. Data is written to storage on a per-partition basis. + See the section on partitioning for further details: :ref:`partitioning_section`. -The attribute ``schema`` can be accessed to see the underlying schema of the dataset. +The attribute ``table_meta`` can be accessed to see the underlying schema of the dataset. See :ref:`type_system` for more information. To store multiple dataframes into a dataset, it is possible to pass a collection of dataframes; the exact format will depend on the I/O backend used. -Kartothek assumes these dataframes are different chunks of the same table and -will therefore be required to have the same schema. A ``ValueError`` will be -thrown otherwise. +Additionally, Kartothek supports several data input formats, +it does not need to always be a plain ``pd.DataFrame``. 
+See :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition` for +further details. + +If table names are not specified when passing an iterator of dataframes, +Kartothek assumes these dataframes are different chunks of the same table +and expects their schemas to be identical. A ``ValueError`` will be thrown otherwise. For example, .. ipython:: python @@ -182,6 +194,39 @@ For example, .. note:: Read these sections for more details: :ref:`type_system`, :ref:`dataset_spec` +When we do not explicitly define the name of the table and partition, Kartothek uses the +default table name ``table`` and generates a UUID for the partition name. + +.. admonition:: A more complex example: multiple named tables + + Sometimes it may be useful to write multiple dataframes with different schemas into + a single dataset. This can be achieved by creating a dataset with multiple tables. + + In this example, we create a dataset with two tables: ``core-table`` and ``aux-table``. + The schemas of the tables are identical across partitions (each dictionary in the + ``dfs`` list argument represents a partition). + + .. ipython:: python + + dfs = [ + { + "data": { + "core-table": pd.DataFrame({"id": [22, 23], "f": [1.1, 2.4]}), + "aux-table": pd.DataFrame({"id": [22], "col1": ["x"]}), + } + }, + { + "data": { + "core-table": pd.DataFrame({"id": [29, 31], "f": [3.2, 0.6]}), + "aux-table": pd.DataFrame({"id": [31], "col1": ["y"]}), + } + }, + ] + + dm = store_dataframes_as_dataset(store_url, dataset_uuid="two-tables", dfs=dfs) + dm.tables + + Reading data from storage ========================= @@ -193,24 +238,24 @@ table of the dataset as a pandas DataFrame. from kartothek.api.dataset import read_table - read_table("a_unique_dataset_identifier", store_url) + read_table("a_unique_dataset_identifier", store_url, table="table") We can also read a dataframe iteratively, using -:func:`~kartothek.io.iter.read_dataset_as_dataframes__iterator`. This will return a generator of :class:`pandas.DataFrame` where every element represents one file. For example, +:func:`~kartothek.io.iter.read_dataset_as_dataframes__iterator`. This will return a generator +of dictionaries (one dictionary for each `partition`), where the keys of each dictionary +represent the `tables` of the dataset. For example, .. ipython:: python from kartothek.api.dataset import read_dataset_as_dataframes__iterator - for partition_index, df in enumerate( - read_dataset_as_dataframes__iterator( - dataset_uuid="a_unique_dataset_identifier", store=store_url - ) + for partition_index, df_dict in enumerate( + read_dataset_as_dataframes__iterator(dataset_uuid="two-tables", store=store_url) ): - # Note: There is no guarantee on the ordering print(f"Partition #{partition_index}") - print(f"Data: \n{df}") + for table_name, table_df in df_dict.items(): + print(f"Table: {table_name}. Data: \n{table_df}") Respectively, the ``dask.delayed`` back-end provides the function :func:`~kartothek.io.dask.delayed.read_dataset_as_delayed`, which has a very similar @@ -230,7 +275,8 @@ function but returns a collection of ``dask.delayed`` objects. .. ipython:: python - read_table("a_unique_dataset_identifier", store_url, predicates=[[("A", "<", 2.5)]]) + # Read only values table `core-table` where `f` < 2.5 + read_table("two-tables", store_url, table="core-table", predicates=[[("f", "<", 2.5)]]) .. _storefact: https://github.com/blue-yonder/storefact .. 
_dask: https://docs.dask.org/en/latest/ diff --git a/docs/guide/mutating_datasets.rst b/docs/guide/mutating_datasets.rst index 25089d1a..cae75f5d 100644 --- a/docs/guide/mutating_datasets.rst +++ b/docs/guide/mutating_datasets.rst @@ -91,12 +91,91 @@ previous contents. from kartothek.api.dataset import read_table - updated_df = read_table(dataset_uuid=dm.uuid, store=store_url) + updated_df = read_table(dataset_uuid=dm.uuid, store=store_url, table="table") updated_df The way dataset updates work is that new partitions are added to a dataset -as long as they have the same tables as the existing partitions. +as long as they have the same tables as the existing partitions. A `different` +table **cannot** be introduced into an existing dataset with an update. + +To illustrate this point better, let's first create a dataset with two tables: + +.. ipython:: python + + df2 = pd.DataFrame( + { + "G": "foo", + "H": pd.Categorical(["test", "train", "test", "train"]), + "I": np.array([9] * 4, dtype="int32"), + "J": pd.Series(3, index=list(range(4)), dtype="float32"), + "K": pd.Timestamp("20190604"), + "L": 2.0, + } + ) + df2 + + dm_two_tables = store_dataframes_as_dataset( + store_url, "two_tables", dfs=[{"data": {"table1": df, "table2": df2}}] + ) + dm_two_tables.tables + sorted(dm_two_tables.partitions.keys()) + + +.. admonition:: Partition identifiers + + In the previous example a dictionary was used to pass the desired data to the store function. To label each + partition, by default Kartothek uses UUIDs to ensure that each partition is named uniquely. This is + necessary so that the update can properly work using `copy-on-write `_ + principles. + +Below is an example where we update the existing dataset ``another_unique_dataset_identifier`` +with new data for ``table1`` and ``table2``: + +.. ipython:: python + + another_df2 = pd.DataFrame( + { + "G": "bar", + "H": pd.Categorical(["prod", "dev", "prod", "dev"]), + "I": np.array([12] * 4, dtype="int32"), + "J": pd.Series(4, index=list(range(4)), dtype="float32"), + "K": pd.Timestamp("20190614"), + "L": 10.0, + } + ) + another_df2 + + dm_two_tables = update_dataset_from_dataframes( + {"data": {"table1": another_df, "table2": another_df2}}, + store=store_url, + dataset_uuid=dm_two_tables.uuid, + ) + dm_two_tables.tables + sorted(dm_two_tables.partitions.keys()) + + +Trying to update only a subset of tables throws a ``ValueError``: + +.. 
ipython:: + + @verbatim + In [45]: update_dataset_from_dataframes( + ....: { + ....: "data": + ....: { + ....: "table2": another_df2 + ....: } + ....: }, + ....: store=store_url, + ....: dataset_uuid=dm_two_tables.uuid + ....: ) + ....: + --------------------------------------------------------------------------- + ValueError: Input partitions for update have different tables than dataset: + Input partition tables: {'table2'} + Tables of existing dataset: ['table1', 'table2'] + Deleting Data ------------- @@ -199,7 +278,7 @@ with one update: ) sorted(dm.partitions.keys()) - read_table(dm.uuid, store_url) + read_table(dm.uuid, store_url, table="table") As can be seen in the example above, the resultant dataframe from :func:`~kartothek.io.eager.read_table` diff --git a/docs/guide/partitioning.rst b/docs/guide/partitioning.rst index 25d1fc9a..d0ab1f09 100644 --- a/docs/guide/partitioning.rst +++ b/docs/guide/partitioning.rst @@ -100,7 +100,31 @@ Note that, since 2 dataframes have been provided as input to the function, there 4 different files created, even though only 2 different combinations of values of E and F are found, ``E=test/F=foo`` and ``E=train/F=foo`` (However, these 4 physical partitions can be read as just the 2 logical partitions by using the argument -``dispatch_by=["E", "F"]`` at reading time). +``concat_partitions_on_primary_index=True`` at reading time). + +For datasets consisting of multiple tables, explicit partitioning on columns can only be +performed if the column exists in both tables and is of the same data type: guaranteeing +that their types are the same is part of schema validation in Kartothek. + +For example: + +.. ipython:: python + :okwarning: + + df.dtypes + different_df = pd.DataFrame( + {"B": pd.to_datetime(["20130102", "20190101"]), "L": [1, 4], "Q": [True, False]} + ) + different_df.dtypes + + dm = store_dataframes_as_dataset( + store_url, + "multiple_partitioned_tables", + [{"data": {"table1": df, "table2": different_df}}], + partition_on="B", + ) + + sorted(dm.partitions.keys()) As noted above, when data is appended to a dataset, Kartothek guarantees it has @@ -109,6 +133,9 @@ the proper schema and partitioning. The order of columns provided in ``partition_on`` is important, as the partition structure would be different if the columns are in a different order. +.. note:: Every partition must have data for every table. An empty dataframe in this + context is also considered as data. + .. _partitioning_dask: Force partitioning by shuffling using Dask @@ -139,7 +166,7 @@ number of physical input partitions. ddf = dd.from_pandas(df, npartitions=10) dm = update_dataset_from_ddf( - ddf, dataset_uuid="no_shuffle", store=store_url, partition_on="A" + ddf, dataset_uuid="no_shuffle", store=store_url, partition_on="A", table="table" ).compute() sorted(dm.partitions.keys()) @@ -156,7 +183,12 @@ partitioning values of A to be fused into a single file. 
:okwarning: dm = update_dataset_from_ddf( - ddf, dataset_uuid="with_shuffle", store=store_url, partition_on="A", shuffle=True, + ddf, + dataset_uuid="with_shuffle", + store=store_url, + partition_on="A", + shuffle=True, + table="table", ).compute() sorted(dm.partitions.keys()) @@ -204,6 +236,7 @@ When investigating the index, we can also see that a query for a given value in store=store_url, partition_on="A", shuffle=True, + table="table", bucket_by="B", num_buckets=4, secondary_indices="B", diff --git a/docs/spec/indexing.rst b/docs/spec/indexing.rst index 57873f6a..6dbc2c74 100644 --- a/docs/spec/indexing.rst +++ b/docs/spec/indexing.rst @@ -15,7 +15,7 @@ All currently supported kartothek index types are inverted indices and are mappi index_dct = {1: ["table/12345"], 2: ["table/12345", "table/6789"]} -Where, in this example, the value ``1`` is found in exactly one partition which is labeled ``table/12345``. +Where, in this example, the value ``42`` is found in exactly one partition which is labeled ``table/partitionA=42/12345``. Users typically do not interact with indices directly since querying a dataset will automatically load and interact with the indices. For some applications it is still quite useful to interact with them directly. @@ -87,7 +87,7 @@ For data with high cardinality this kind of index is not well suited since it wo Secondary indices ----------------- -Secondary indices are the most powerful type of indices which allow us to reference files without having to encode any kind of values in the keys. They can be created by supplying the `secondary_indices` keyword argument as shown above. +Secondary indices are the most powerful type of indices which allow us to reference files without having to encode any kind of values in the keys. They can be created by supplying the `secondary_indices` keyword argument as shown above. 
The user interaction works similarly to the Persistence diff --git a/kartothek/api/consistency.py b/kartothek/api/consistency.py index 3a1880dc..a7b6a876 100644 --- a/kartothek/api/consistency.py +++ b/kartothek/api/consistency.py @@ -314,14 +314,14 @@ def check_datasets( datasets = {name: ds.load_partition_indices() for name, ds in datasets.items()} _check_datasets( datasets=datasets, - f=lambda ds: {ds.table_name}, + f=lambda ds: set(ds.table_meta.keys()), expected={SINGLE_TABLE}, what="table", ) _check_overlap(datasets, cube) # check column types - validate_shared_columns([ds.schema for ds in datasets.values()]) + validate_shared_columns([ds.table_meta[SINGLE_TABLE] for ds in datasets.values()]) _check_partition_columns(datasets, cube) _check_dimension_columns(datasets, cube) diff --git a/kartothek/api/discover.py b/kartothek/api/discover.py index f610c455..b1cd21e7 100644 --- a/kartothek/api/discover.py +++ b/kartothek/api/discover.py @@ -10,7 +10,7 @@ KTK_CUBE_METADATA_KEY_IS_SEED, KTK_CUBE_METADATA_PARTITION_COLUMNS, KTK_CUBE_METADATA_SUPPRESS_INDEX_ON, - KTK_CUBE_UUID_SEPARATOR, + KTK_CUBE_UUID_SEPERATOR, ) from kartothek.core.cube.cube import Cube from kartothek.core.dataset import DatasetMetadata @@ -79,7 +79,7 @@ def discover_ktk_cube_dataset_ids(uuid_prefix: str, store: StoreInput) -> Set[st The ktk_cube dataset ids """ - prefix = uuid_prefix + KTK_CUBE_UUID_SEPARATOR + prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR names = _discover_dataset_meta_files(prefix, store) return set([name[len(prefix) :] for name in names]) @@ -115,7 +115,7 @@ def discover_datasets_unchecked( filter_ktk_cube_dataset_ids = converter_str_set_optional( filter_ktk_cube_dataset_ids ) - prefix = uuid_prefix + KTK_CUBE_UUID_SEPARATOR + prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR names = _discover_dataset_meta_files(prefix, store) diff --git a/kartothek/cli/_info.py b/kartothek/cli/_info.py index 81bf13a5..26c56b5d 100644 --- a/kartothek/cli/_info.py +++ b/kartothek/cli/_info.py @@ -4,6 +4,7 @@ from kartothek.cli._utils import to_bold as b from kartothek.cli._utils import to_header as h +from kartothek.io_components.metapartition import SINGLE_TABLE from kartothek.utils.ktk_adapters import get_dataset_columns __all__ = ("info",) @@ -18,7 +19,7 @@ def info(ctx): datasets = ctx.obj["datasets"] seed_ds = datasets[cube.seed_dataset] - seed_schema = seed_ds.schema + seed_schema = seed_ds.table_meta[SINGLE_TABLE] click.echo(h("Infos")) click.echo(b("UUID Prefix:") + " {}".format(cube.uuid_prefix)) @@ -40,7 +41,7 @@ def _info_dataset(ktk_cube_dataset_id, ds, cube): click.echo(h("Dataset: {}".format(ktk_cube_dataset_id))) ds = ds.load_partition_indices() - schema = ds.schema + schema = ds.table_meta[SINGLE_TABLE] all_cols = get_dataset_columns(ds) payload_cols = sorted( all_cols - (set(cube.dimension_columns) | set(cube.partition_columns)) @@ -82,7 +83,7 @@ def _collist_string_index(cube, datasets): for col in sorted(cube.index_columns): for ktk_cube_dataset_id in sorted(datasets.keys()): ds = datasets[ktk_cube_dataset_id] - schema = ds.schema + schema = ds.table_meta[SINGLE_TABLE] if col in schema.names: lines.append(" - {c}: {t}".format(c=col, t=schema.field(col).type)) break diff --git a/kartothek/cli/_query.py b/kartothek/cli/_query.py index 4d62ab34..06bac070 100644 --- a/kartothek/cli/_query.py +++ b/kartothek/cli/_query.py @@ -10,6 +10,7 @@ from kartothek.core.cube.conditions import Conjunction from kartothek.io.dask.bag_cube import query_cube_bag +from kartothek.io_components.metapartition import 
SINGLE_TABLE from kartothek.utils.ktk_adapters import get_dataset_columns __all__ = ("query",) @@ -41,7 +42,7 @@ def query(ctx): cols = get_dataset_columns(ds) all_columns |= cols for col in cols: - all_types[col] = ds.schema.field(col).type + all_types[col] = ds.table_meta[SINGLE_TABLE].field(col).type ipython = _get_ipython() diff --git a/kartothek/core/common_metadata.py b/kartothek/core/common_metadata.py index 2472dcc4..2be05325 100644 --- a/kartothek/core/common_metadata.py +++ b/kartothek/core/common_metadata.py @@ -16,7 +16,6 @@ from kartothek.core import naming from kartothek.core._compat import load_json -from kartothek.core.naming import SINGLE_TABLE from kartothek.core.utils import ensure_string_type _logger = logging.getLogger() @@ -337,7 +336,7 @@ def _get_common_metadata_key(dataset_uuid, table): def read_schema_metadata( - dataset_uuid: str, store: KeyValueStore, table: str = SINGLE_TABLE + dataset_uuid: str, store: KeyValueStore, table: str ) -> SchemaWrapper: """ Read schema and metadata from store. @@ -361,10 +360,7 @@ def read_schema_metadata( def store_schema_metadata( - schema: SchemaWrapper, - dataset_uuid: str, - store: KeyValueStore, - table: str = SINGLE_TABLE, + schema: SchemaWrapper, dataset_uuid: str, store: KeyValueStore, table: str ) -> str: """ Store schema and metadata to store. @@ -450,7 +446,7 @@ def _determine_schemas_to_compare( reference = None null_cols_in_reference = set() - for schema in set(schemas): + for schema in schemas: if not isinstance(schema, SchemaWrapper): schema = SchemaWrapper(schema, "__unknown__") diff --git a/kartothek/core/cube/constants.py b/kartothek/core/cube/constants.py index 0b88fb04..1fbff2e3 100644 --- a/kartothek/core/cube/constants.py +++ b/kartothek/core/cube/constants.py @@ -9,7 +9,7 @@ "KTK_CUBE_METADATA_KEY_IS_SEED", "KTK_CUBE_METADATA_STORAGE_FORMAT", "KTK_CUBE_METADATA_VERSION", - "KTK_CUBE_UUID_SEPARATOR", + "KTK_CUBE_UUID_SEPERATOR", ) @@ -45,6 +45,4 @@ KTK_CUBE_METADATA_SUPPRESS_INDEX_ON = "ktk_cube_suppress_index_on" #: Character sequence used to seperate cube and dataset UUID -KTK_CUBE_UUID_SEPARATOR = "++" -# Alias for compat reasons -KTK_CUBE_UUID_SEPERATOR = KTK_CUBE_UUID_SEPARATOR +KTK_CUBE_UUID_SEPERATOR = "++" diff --git a/kartothek/core/cube/cube.py b/kartothek/core/cube/cube.py index b9858453..f3de645a 100644 --- a/kartothek/core/cube/cube.py +++ b/kartothek/core/cube/cube.py @@ -2,7 +2,7 @@ import attr -from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPARATOR +from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR from kartothek.core.dataset import _validate_uuid from kartothek.utils.converters import ( converter_str, @@ -102,10 +102,10 @@ def _validator_uuid_freestanding(name, value): name=name, value=value ) ) - if value.find(KTK_CUBE_UUID_SEPARATOR) != -1: + if value.find(KTK_CUBE_UUID_SEPERATOR) != -1: raise ValueError( '{name} ("{value}") must not contain UUID separator {sep}'.format( - name=name, value=value, sep=KTK_CUBE_UUID_SEPARATOR + name=name, value=value, sep=KTK_CUBE_UUID_SEPERATOR ) ) @@ -201,7 +201,7 @@ def ktk_dataset_uuid(self, ktk_cube_dataset_id): _validator_uuid_freestanding("ktk_cube_dataset_id", ktk_cube_dataset_id) return "{uuid_prefix}{sep}{ktk_cube_dataset_id}".format( uuid_prefix=self.uuid_prefix, - sep=KTK_CUBE_UUID_SEPARATOR, + sep=KTK_CUBE_UUID_SEPERATOR, ktk_cube_dataset_id=ktk_cube_dataset_id, ) diff --git a/kartothek/core/dataset.py b/kartothek/core/dataset.py index 15db87e0..91fbbb01 100644 --- a/kartothek/core/dataset.py +++ 
b/kartothek/core/dataset.py @@ -1,6 +1,7 @@ import copy import logging import re +import warnings from collections import OrderedDict, defaultdict from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union @@ -21,11 +22,7 @@ PartitionIndex, filter_indices, ) -from kartothek.core.naming import ( - EXTERNAL_INDEX_SUFFIX, - PARQUET_FILE_SUFFIX, - SINGLE_TABLE, -) +from kartothek.core.naming import EXTERNAL_INDEX_SUFFIX, PARQUET_FILE_SUFFIX from kartothek.core.partition import Partition from kartothek.core.typing import StoreInput from kartothek.core.urlencode import decode_key, quote_indices @@ -66,8 +63,7 @@ def __init__( metadata_version: int = naming.DEFAULT_METADATA_VERSION, explicit_partitions: bool = True, partition_keys: Optional[List[str]] = None, - schema: Optional[SchemaWrapper] = None, - table_name: Optional[str] = SINGLE_TABLE, + table_meta: Optional[Dict[str, SchemaWrapper]] = None, ): if not _validate_uuid(uuid): raise ValueError("UUID contains illegal character") @@ -82,8 +78,7 @@ def __init__( self.explicit_partitions = explicit_partitions self.partition_keys = partition_keys or [] - self.schema = schema - self._table_name = table_name + self._table_meta = table_meta if table_meta else {} _add_creation_time(self) super(DatasetMetadataBase, self).__init__() @@ -103,10 +98,31 @@ def __eq__(self, other: Any) -> bool: return False if self.partition_keys != other.partition_keys: return False - if self.schema != other.schema: + if self.table_meta != other.table_meta: return False return True + @property + def table_meta(self) -> Dict[str, SchemaWrapper]: + warnings.warn( + "The attribute `DatasetMetadataBase.table_meta` will be removed in " + "kartothek 4.0 in favour of `DatasetMetadataBase.schema`.", + DeprecationWarning, + ) + return self._table_meta + + @table_meta.setter + def table_meta(self, value): + self._table_meta = value + + @property + def schema(self) -> SchemaWrapper: + if len(self.tables) > 1: + raise AttributeError( + "Attribute schema can only be accessed for a single tabled dataset" + ) + return self._table_meta[self.tables[0]] + @property def primary_indices_loaded(self) -> bool: if not self.partition_keys: @@ -116,24 +132,14 @@ def primary_indices_loaded(self) -> bool: return False return True - @property - def table_name(self) -> str: - if self._table_name: - return self._table_name - elif self.partitions: - tables = self.tables - if tables: - return self.tables[0] - return "" - @property def tables(self) -> List[str]: - tables = list(iter(next(iter(self.partitions.values())).files.keys())) - if len(tables) > 1: - raise RuntimeError( - f"Dataset {self.uuid} has tables {tables} but read support for multi tabled dataset was dropped with kartothek 4.0." - ) - return tables + if self.table_meta: + return list(self.table_meta.keys()) + elif self.partitions: + return [tab for tab in list(self.partitions.values())[0].files] + else: + return [] @property def index_columns(self) -> Set[str]: @@ -218,6 +224,7 @@ def to_dict(self) -> Dict: if self.partition_keys is not None: dct["partition_keys"] = self.partition_keys + # don't preserve table_meta, since there is no JSON-compatible way (yet) return dct @@ -227,42 +234,6 @@ def to_json(self) -> bytes: def to_msgpack(self) -> bytes: return packb(self.to_dict()) - def load_partition_indices(self: T) -> T: - """ - Load all filename encoded indices into RAM. File encoded indices can be extracted from datasets with partitions - stored in a format like - - .. 
code:: - - `dataset_uuid/table/IndexCol=IndexValue/SecondIndexCol=Value/partition_label.parquet` - - Which results in an in-memory index holding the information - - .. code:: - - { - "IndexCol": { - IndexValue: ["partition_label"] - }, - "SecondIndexCol": { - Value: ["partition_label"] - } - } - - """ - if self.primary_indices_loaded: - return self - - indices = _construct_dynamic_index_from_partitions( - partitions=self.partitions, - schema=self.schema, - default_dtype=pa.string() if self.metadata_version == 3 else None, - partition_keys=self.partition_keys, - ) - combined_indices = self.indices.copy() - combined_indices.update(indices) - return self.copy(indices=combined_indices) - def load_index(self: T, column: str, store: StoreInput) -> T: """ Load an index into memory. @@ -301,7 +272,9 @@ def load_index(self: T, column: str, store: StoreInput) -> T: indices = dict(self.indices, **col_loaded_index) return self.copy(indices=indices) - def load_all_indices(self: T, store: StoreInput) -> T: + def load_all_indices( + self: T, store: StoreInput, load_partition_indices: bool = True + ) -> T: """ Load all registered indices into memory. @@ -311,6 +284,8 @@ def load_all_indices(self: T, store: StoreInput) -> T: ---------- store Object that implements the .get method for file/object loading. + load_partition_indices + Flag if filename indices should be loaded. Default is True. Returns ------- @@ -325,7 +300,9 @@ def load_all_indices(self: T, store: StoreInput) -> T: } ds = self.copy(indices=indices) - return ds.load_partition_indices() + if load_partition_indices: + ds = ds.load_partition_indices() + return ds def query(self, indices: List[IndexBase] = None, **kwargs) -> List[str]: """ @@ -359,6 +336,42 @@ def query(self, indices: List[IndexBase] = None, **kwargs) -> List[str]: return list(candidate_set) + def load_partition_indices(self: T) -> T: + """ + Load all filename encoded indices into RAM. File encoded indices can be extracted from datasets with partitions + stored in a format like + + .. code:: + + `dataset_uuid/table/IndexCol=IndexValue/SecondIndexCol=Value/partition_label.parquet` + + Which results in an in-memory index holding the information + + .. 
code:: + + { + "IndexCol": { + IndexValue: ["partition_label"] + }, + "SecondIndexCol": { + Value: ["partition_label"] + } + } + + """ + if self.primary_indices_loaded: + return self + + indices = _construct_dynamic_index_from_partitions( + partitions=self.partitions, + table_meta=self.table_meta, + default_dtype=pa.string() if self.metadata_version == 3 else None, + partition_keys=self.partition_keys, + ) + combined_indices = self.indices.copy() + combined_indices.update(indices) + return self.copy(indices=combined_indices) + @default_docs def get_indices_as_dataframe( self, @@ -382,13 +395,7 @@ def get_indices_as_dataframe( Parameters ---------- """ - if self.partition_keys and ( - columns is None - or ( - self.partition_keys is not None - and set(columns) & set(self.partition_keys) - ) - ): + if not self.primary_indices_loaded and columns != []: # self.load_partition_indices is not inplace dm = self.load_partition_indices() else: @@ -500,14 +507,14 @@ class DatasetMetadata(DatasetMetadataBase): def __repr__(self): return ( "DatasetMetadata(uuid={uuid}, " - "table_name={table_name}, " + "tables={tables}, " "partition_keys={partition_keys}, " "metadata_version={metadata_version}, " "indices={indices}, " "explicit_partitions={explicit_partitions})" ).format( uuid=self.uuid, - table_name=self.table_name, + tables=self.tables, partition_keys=self.partition_keys, metadata_version=self.metadata_version, indices=list(self.indices.keys()), @@ -631,29 +638,23 @@ def load_from_dict( table_set.add(key.split("/")[1]) tables = list(table_set) - schema = None - table_name = None - if tables: - table_name = tables[0] - - if load_schema: - schema = read_schema_metadata( - dataset_uuid=dataset_uuid, store=store, table=table_name + table_meta = {} + if load_schema: + for table in tables: + table_meta[table] = read_schema_metadata( + dataset_uuid=dataset_uuid, store=store, table=table ) - metadata["schema"] = schema + metadata["table_meta"] = table_meta if "partition_keys" not in metadata: metadata["partition_keys"] = _get_partition_keys_from_partitions( metadata["partitions"] ) - ds = DatasetMetadata.from_dict( + return DatasetMetadata.from_dict( metadata, explicit_partitions=explicit_partitions ) - if table_name: - ds._table_name = table_name - return ds @staticmethod def from_buffer(buf: str, format: str = "json", explicit_partitions: bool = True): @@ -680,7 +681,7 @@ def from_dict(dct: Dict, explicit_partitions: bool = True): metadata_version=dct[naming.METADATA_VERSION_KEY], explicit_partitions=explicit_partitions, partition_keys=dct.get("partition_keys", None), - schema=dct.get("schema"), + table_meta=dct.get("table_meta", None), ) for key, value in dct.get("metadata", {}).items(): @@ -700,13 +701,18 @@ def from_dict(dct: Dict, explicit_partitions: bool = True): def _get_type_from_meta( - schema: Optional[SchemaWrapper], column: str, default: Optional[pa.DataType], + table_meta: Optional[Dict[str, SchemaWrapper]], + column: str, + default: Optional[pa.DataType], ) -> pa.DataType: # use first schema that provides type information, since write path should ensure that types are normalized and # equal - if schema is not None: - idx = schema.get_field_index(column) - return schema[idx].type + if table_meta is not None: + for schema in table_meta.values(): + if column not in schema.names: + continue + idx = schema.get_field_index(column) + return schema[idx].type if default is not None: return default @@ -717,25 +723,23 @@ def _get_type_from_meta( def _empty_partition_indices( - partition_keys: List[str], 
- schema: Optional[SchemaWrapper], - default_dtype: pa.DataType, + partition_keys: List[str], table_meta: TableMetaType, default_dtype: pa.DataType ): indices = {} for col in partition_keys: - arrow_type = _get_type_from_meta(schema, col, default_dtype) + arrow_type = _get_type_from_meta(table_meta, col, default_dtype) indices[col] = PartitionIndex(column=col, index_dct={}, dtype=arrow_type) return indices def _construct_dynamic_index_from_partitions( partitions: Dict[str, Partition], - schema: Optional[SchemaWrapper], + table_meta: TableMetaType, default_dtype: pa.DataType, partition_keys: List[str], ) -> Dict[str, PartitionIndex]: if len(partitions) == 0: - return _empty_partition_indices(partition_keys, schema, default_dtype) + return _empty_partition_indices(partition_keys, table_meta, default_dtype) def _get_files(part): if isinstance(part, dict): @@ -749,7 +753,7 @@ def _get_files(part): ) # partitions is NOT empty here, see check above first_partition_files = _get_files(first_partition) if not first_partition_files: - return _empty_partition_indices(partition_keys, schema, default_dtype) + return _empty_partition_indices(partition_keys, table_meta, default_dtype) key_table = next(iter(first_partition_files.keys())) storage_keys = ( (key, _get_files(part)[key_table]) for key, part in partitions.items() @@ -769,7 +773,7 @@ def _get_files(part): _key_indices[column][value].add(partition_label) new_indices = {} for col, index_dct in _key_indices.items(): - arrow_type = _get_type_from_meta(schema, col, default_dtype) + arrow_type = _get_type_from_meta(table_meta, col, default_dtype) # convert defaultdicts into dicts new_indices[col] = PartitionIndex( @@ -874,7 +878,7 @@ def __init__( metadata_version=naming.DEFAULT_METADATA_VERSION, explicit_partitions=True, partition_keys=None, - schema=None, + table_meta=None, ): verify_metadata_version(metadata_version) @@ -884,7 +888,7 @@ def __init__( self.metadata_version = metadata_version self.partitions: Dict[str, Partition] = OrderedDict() self.partition_keys = partition_keys - self.schema = schema + self.table_meta = table_meta self.explicit_partitions = explicit_partitions _add_creation_time(self) @@ -899,12 +903,13 @@ def from_dataset(dataset): metadata_version=dataset.metadata_version, explicit_partitions=dataset.explicit_partitions, partition_keys=dataset.partition_keys, - schema=dataset.schema, + table_meta=dataset.table_meta, ) ds_builder.metadata = dataset.metadata ds_builder.indices = dataset.indices ds_builder.partitions = dataset.partitions + ds_builder.tables = dataset.tables return ds_builder def add_partition(self, name, partition): @@ -918,12 +923,6 @@ def add_partition(self, name, partition): partition: :class:`kartothek.core.partition.Partition` The partition to add. """ - - if len(partition.files) > 1: - raise RuntimeError( - f"Dataset {self.uuid} has tables {sorted(partition.files.keys())} but read support for multi tabled dataset was dropped with kartothek 4.0." 
- ) - self.partitions[name] = partition return self @@ -1016,6 +1015,7 @@ def to_dict(self): if self.partition_keys is not None: dct["partition_keys"] = self.partition_keys + # don't preserve table_meta, since there is no JSON-compatible way (yet) return dct def to_json(self): @@ -1059,7 +1059,7 @@ def to_dataset(self) -> DatasetMetadata: metadata_version=self.metadata_version, explicit_partitions=self.explicit_partitions, partition_keys=self.partition_keys, - schema=self.schema, + table_meta=self.table_meta, ) diff --git a/kartothek/core/docs.py b/kartothek/core/docs.py index 1c5c39b2..e97721b1 100644 --- a/kartothek/core/docs.py +++ b/kartothek/core/docs.py @@ -29,20 +29,18 @@ "table": """ table: Optional[str] The table to be loaded. If none is specified, the default 'table' is used.""", - "table_name": """ - table_name: - The table name of the dataset to be loaded. This creates a namespace for - the partitioning like - - `dataset_uuid/table_name/*` - - This is to support legacy workflows. We recommend not to use this and use the default wherever possible.""", - "schema": """ - schema: SchemaWrapper - The dataset table schema""", + "tables": """ + tables : List[str] + A list of tables to be loaded. If None is given, all tables of + a partition are loaded""", + "table_meta": """ + table_meta: Dict[str, SchemaWrapper] + The dataset table schemas""", "columns": """ - columns - A subset of columns to be loaded.""", + columns : Optional[List[Dict[str]]] + A dictionary mapping tables to list of columns. Only the specified + columns are loaded for the corresponding table. If a specfied table or column is + not present in the dataset, a ValueError is raised.""", "dispatch_by": """ dispatch_by: Optional[List[str]] List of index columns to group and partition the jobs by. @@ -108,8 +106,14 @@ For `kartothek.io.dask.update.update_dataset.*` a delayed object resolving to a list of dicts is also accepted.""", "categoricals": """ - categoricals - Load the provided subset of columns as a :class:`pandas.Categorical`.""", + categoricals : Dict[str, List[str]] + A dictionary mapping tables to list of columns that should be + loaded as `category` dtype instead of the inferred one.""", + "label_filter": """ + label_filter: Callable + A callable taking a partition label as a parameter and returns a boolean. The callable will be applied + to the list of partitions during dispatch and will filter out all partitions for which the callable + evaluates to False.""", "dates_as_object": """ dates_as_object: bool Load pyarrow.date{32,64} columns as ``object`` columns in Pandas @@ -162,12 +166,22 @@ "df_generator": """ df_generator: Iterable[Union[pandas.DataFrame, Dict[str, pandas.DataFrame]]] The dataframe(s) to be stored""", + "central_partition_metadata": """ + central_partition_metadata: bool + This has no use and will be removed in future releases""", "default_metadata_version": """ default_metadata_version: int Default metadata version. 
(Note: Metadata version greater than 3 are only supported)""", + "load_dynamic_metadata": """ + load_dynamic_metadata: bool + The keyword `load_dynamic_metadata` is deprecated and will be removed in the next major release.""", + "concat_partitions_on_primary_index": """ + concat_partitions_on_primary_index: bool + Concatenate partition based on their primary index values.""", "delayed_tasks": """ - delayed_tasks - A list of delayed objects where each element returns a :class:`pandas.DataFrame`.""", + delayed_tasks: List[dask.delayed.Delayed] + Every delayed object represents a partition and should be accepted by + :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`""", "load_dataset_metadata": """ load_dataset_metadata: bool Optional argument on whether to load the metadata or not""", diff --git a/kartothek/core/factory.py b/kartothek/core/factory.py index 59752e8d..5f0c57ef 100644 --- a/kartothek/core/factory.py +++ b/kartothek/core/factory.py @@ -28,6 +28,7 @@ def __init__( store_factory: StoreInput, load_schema: bool = True, load_all_indices: bool = False, + load_dataset_metadata: bool = True, ) -> None: """ A dataset factory object which can be used to cache dataset load operations. This class should be the primary user entry point when @@ -58,6 +59,8 @@ def __init__( Load the schema information immediately. load_all_indices Load all indices immediately. + load_dataset_metadata + Keep the user metadata in memory """ self._cache_metadata: Optional[DatasetMetadata] = None self._cache_store = None @@ -67,6 +70,7 @@ def __init__( self.load_schema = load_schema self._ds_callable = None self.is_loaded = False + self.load_dataset_metadata = load_dataset_metadata self.load_all_indices_flag = load_all_indices def __repr__(self): @@ -92,6 +96,8 @@ def _instantiate_metadata_cache(self: T) -> T: load_schema=self.load_schema, load_all_indices=self.load_all_indices_flag, ) + if not self.load_dataset_metadata: + self._cache_metadata.metadata = {} self.is_loaded = True return self @@ -142,8 +148,12 @@ def load_index(self: T, column, store=None) -> T: self._cache_metadata = self.dataset_metadata.load_index(column, self.store) return self - def load_all_indices(self: T, store: Any = None) -> T: - self._cache_metadata = self.dataset_metadata.load_all_indices(self.store) + def load_all_indices( + self: T, store: Any = None, load_partition_indices: bool = True, + ) -> T: + self._cache_metadata = self.dataset_metadata.load_all_indices( + self.store, load_partition_indices=load_partition_indices + ) return self def load_partition_indices(self: T) -> T: @@ -155,6 +165,7 @@ def _ensure_factory( dataset_uuid: Optional[str], store: Optional[StoreInput], factory: Optional[DatasetFactory], + load_dataset_metadata: bool, load_schema: bool = True, ) -> DatasetFactory: @@ -164,6 +175,7 @@ def _ensure_factory( return DatasetFactory( dataset_uuid=dataset_uuid, store_factory=lazy_store(store), + load_dataset_metadata=load_dataset_metadata, load_schema=load_schema, ) diff --git a/kartothek/core/naming.py b/kartothek/core/naming.py index b35a8cf6..ec83aba7 100644 --- a/kartothek/core/naming.py +++ b/kartothek/core/naming.py @@ -4,9 +4,6 @@ Global naming constants for datasets """ - -# FIXME: move this constant somewhere else. 
Cannot import from its declaration due to cyclic imports -SINGLE_TABLE = "table" DEFAULT_METADATA_VERSION = 4 MIN_METADATA_VERSION = 4 MAX_METADATA_VERSION = 4 diff --git a/kartothek/core/partition.py b/kartothek/core/partition.py index 3e238656..c35274fe 100644 --- a/kartothek/core/partition.py +++ b/kartothek/core/partition.py @@ -3,8 +3,6 @@ PartitionDictType = Dict[str, Dict[str, str]] -# TODO: purge this. This is just slowing us down by creating many python objects we don't actual -# Changing the partition class needs to be done with are since it's to_dict is used for the storage metadata spec class Partition: def __init__( self, label: str, files: Optional[Dict[str, str]] = None, metadata: Dict = None diff --git a/kartothek/core/urlencode.py b/kartothek/core/urlencode.py index f66bfda1..6a1b776a 100644 --- a/kartothek/core/urlencode.py +++ b/kartothek/core/urlencode.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from typing import List, Tuple, Union +from typing import List, Tuple from urlquote import quote as urlquote_quote from urlquote import unquote as urlquote_unquote @@ -22,9 +22,7 @@ def unquote(value): return urlquote_unquote(value).decode("utf-8") -def decode_key( - key: str, -) -> Union[Tuple[str, str, List, str], Tuple[str, None, List, None]]: +def decode_key(key): """ Split a given key into its kartothek components `{dataset_uuid}/{table}/{key_indices}/{filename}` diff --git a/kartothek/io/dask/_shuffle.py b/kartothek/io/dask/_shuffle.py index 79184a63..78d025c2 100644 --- a/kartothek/io/dask/_shuffle.py +++ b/kartothek/io/dask/_shuffle.py @@ -9,6 +9,7 @@ from kartothek.core.typing import StoreFactory from kartothek.io.dask.compression import pack_payload, unpack_payload_pandas from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.utils import InferredIndices from kartothek.io_components.write import write_partition from kartothek.serialization import DataFrameSerializer @@ -35,7 +36,7 @@ def _hash_bucket(df: pd.DataFrame, subset: Optional[Sequence[str]], num_buckets: def shuffle_store_dask_partitions( ddf: dd.DataFrame, table: str, - secondary_indices: List[str], + secondary_indices: Optional[InferredIndices], metadata_version: int, partition_on: List[str], store_factory: StoreFactory, @@ -131,11 +132,11 @@ def shuffle_store_dask_partitions( def _unpack_store_partition( df: pd.DataFrame, - secondary_indices: List[str], + secondary_indices: Optional[InferredIndices], sort_partitions_by: List[str], table: str, dataset_uuid: str, - partition_on: List[str], + partition_on: Optional[List[str]], store_factory: StoreFactory, df_serializer: DataFrameSerializer, metadata_version: int, diff --git a/kartothek/io/dask/_utils.py b/kartothek/io/dask/_utils.py index 33f40405..9a8d2f6b 100644 --- a/kartothek/io/dask/_utils.py +++ b/kartothek/io/dask/_utils.py @@ -15,11 +15,21 @@ CATEGORICAL_EFFICIENCY_WARN_LIMIT = 100000 +def _identity(): + def _id(x): + return x + + return _id + + def _get_data(mp, table=None): """ Task to avoid serialization of lambdas """ - return mp.data + if table: + return mp.data[table] + else: + return mp.data def _cast_categorical_to_index_cat(df, categories): @@ -55,10 +65,14 @@ def _maybe_get_categoricals_from_index(dataset_metadata_factory, categoricals): """ categoricals_from_index = {} if categoricals: - for column in categoricals: - if column in dataset_metadata_factory.indices: - cat_dtype = _construct_categorical(column, dataset_metadata_factory) - categoricals_from_index[column] = cat_dtype + for table, table_cat in 
categoricals.items(): + if not table_cat: + continue + categoricals_from_index[table] = {} + for cat in table_cat: + if cat in dataset_metadata_factory.indices: + cat_dtype = _construct_categorical(cat, dataset_metadata_factory) + categoricals_from_index[table][cat] = cat_dtype return categoricals_from_index diff --git a/kartothek/io/dask/bag.py b/kartothek/io/dask/bag.py index be5eea1b..a76d218f 100644 --- a/kartothek/io/dask/bag.py +++ b/kartothek/io/dask/bag.py @@ -1,3 +1,6 @@ +# -*- coding: utf-8 -*- +import warnings +from collections import defaultdict from functools import partial from typing import Optional, Sequence @@ -13,11 +16,11 @@ from kartothek.io.dask._utils import ( _cast_categorical_to_index_cat, _get_data, + _identity, _maybe_get_categoricals_from_index, ) from kartothek.io_components.index import update_indices_from_partitions from kartothek.io_components.metapartition import ( - SINGLE_TABLE, MetaPartition, parse_input_to_metapartition, ) @@ -51,14 +54,19 @@ def _load_and_concat_metapartitions_inner(mps, *args, **kwargs): def read_dataset_as_metapartitions_bag( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, + load_dataset_metadata=False, predicates=None, factory=None, dispatch_by=None, partition_size=None, + dispatch_metadata=True, ): """ Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects. @@ -72,19 +80,37 @@ def read_dataset_as_metapartitions_bag( A dask.bag object containing the metapartions. """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=load_dataset_metadata, ) + if len(ds_factory.tables) > 1: + warnings.warn( + "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next " + "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube " + "functionality. 
" + "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html", + DeprecationWarning, + ) + store = ds_factory.store_factory mps = dispatch_metapartitions_from_factory( - dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by, + dataset_factory=ds_factory, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, + label_filter=label_filter, + predicates=predicates, + dispatch_by=dispatch_by, + dispatch_metadata=dispatch_metadata, ) - mp_bag = db.from_sequence(mps, partition_size=partition_size) + mps = db.from_sequence(mps, partition_size=partition_size) - if dispatch_by is not None: - mp_bag = mp_bag.map( + if concat_partitions_on_primary_index or dispatch_by is not None: + mps = mps.map( _load_and_concat_metapartitions_inner, store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -92,9 +118,10 @@ def read_dataset_as_metapartitions_bag( predicates=predicates, ) else: - mp_bag = mp_bag.map( + mps = mps.map( MetaPartition.load_dataframes, store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -107,25 +134,28 @@ def read_dataset_as_metapartitions_bag( ) if categoricals_from_index: - - mp_bag = mp_bag.map( - MetaPartition.apply, - func=partial( - _cast_categorical_to_index_cat, categories=categoricals_from_index - ), - type_safe=True, + func_dict = defaultdict(_identity) + func_dict.update( + { + table: partial(_cast_categorical_to_index_cat, categories=cats) + for table, cats in categoricals_from_index.items() + } ) - return mp_bag + mps = mps.map(MetaPartition.apply, func_dict, type_safe=True) + return mps @default_docs def read_dataset_as_dataframe_bag( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, predicates=None, factory=None, dispatch_by=None, @@ -146,13 +176,18 @@ def read_dataset_as_dataframe_bag( dataset_uuid=dataset_uuid, store=store, factory=factory, + tables=tables, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, + load_dataset_metadata=False, predicates=predicates, dispatch_by=dispatch_by, partition_size=partition_size, + dispatch_metadata=False, ) return mps.map(_get_data) @@ -171,7 +206,6 @@ def store_bag_as_dataset( partition_on=None, metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT, secondary_indices=None, - table_name: str = SINGLE_TABLE, ): """ Transform and store a dask.bag of dictionaries containing @@ -197,9 +231,7 @@ def store_bag_as_dataset( raise_if_indices_overlap(partition_on, secondary_indices) input_to_mps = partial( - parse_input_to_metapartition, - metadata_version=metadata_version, - table_name=table_name, + parse_input_to_metapartition, metadata_version=metadata_version ) mps = bag.map(input_to_mps) @@ -247,11 +279,17 @@ def build_dataset_indices__bag( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) - assert ds_factory.schema is not None - cols_to_load = set(columns) & set(ds_factory.schema.names) + cols_to_load = { + table: set(columns) & 
set(meta.names) + for table, meta in ds_factory.table_meta.items() + } + cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols} mps = dispatch_metapartitions_from_factory(ds_factory) @@ -260,6 +298,7 @@ def build_dataset_indices__bag( .map( MetaPartition.load_dataframes, store=ds_factory.store_factory, + tables=list(cols_to_load.keys()), columns=cols_to_load, ) .map(MetaPartition.build_indices, columns=columns) diff --git a/kartothek/io/dask/common_cube.py b/kartothek/io/dask/common_cube.py index f059afab..2fc6bfb7 100644 --- a/kartothek/io/dask/common_cube.py +++ b/kartothek/io/dask/common_cube.py @@ -35,7 +35,6 @@ prepare_ktk_partition_on, ) from kartothek.io_components.metapartition import ( - SINGLE_TABLE, MetaPartition, parse_input_to_metapartition, ) @@ -61,7 +60,7 @@ def ensure_valid_cube_indices( ) -> Cube: """ Parse all existing datasets and infer the required set of indices. We do not - allow indices to be removed or added in update steps at the moment and + allow indices to be removed or added in update steps at the momenent and need to make sure that existing ones are updated properly. The returned `Cube` instance will be a copy of the input with `index_columns` and `suppress_index_on` fields adjusted to reflect the @@ -69,14 +68,14 @@ def ensure_valid_cube_indices( """ dataset_indices = [] for ds in existing_datasets.values(): - assert ds.schema is not None - dataset_columns = set(ds.schema.names) - table_indices = cube.index_columns & dataset_columns - compatible_indices = _ensure_compatible_indices(ds, table_indices) - dataset_indices.append(set(compatible_indices)) + for internal_table in ds.table_meta: + dataset_columns = set(ds.table_meta[internal_table].names) + table_indices = cube.index_columns & dataset_columns + compatible_indices = _ensure_compatible_indices(ds, table_indices) + if compatible_indices: + dataset_indices.append(set(compatible_indices)) required_indices = cube.index_columns.union(*dataset_indices) suppress_index_on = cube.suppress_index_on.difference(*dataset_indices) - # Need to remove dimension columns since they *are* technically indices but # the cube interface class declares them as not indexed just to add them # later on, assuming it is not blacklisted @@ -645,7 +644,7 @@ def _multiplex_parse_input_to_metapartition(data): for k in sorted(data.keys()): v = data.pop(k) result[k] = parse_input_to_metapartition( - v, metadata_version=KTK_CUBE_METADATA_VERSION, table_name=SINGLE_TABLE + v, metadata_version=KTK_CUBE_METADATA_VERSION ) del v return result diff --git a/kartothek/io/dask/dataframe.py b/kartothek/io/dask/dataframe.py index dfe48879..fe6d906c 100644 --- a/kartothek/io/dask/dataframe.py +++ b/kartothek/io/dask/dataframe.py @@ -1,11 +1,11 @@ import random +import warnings from typing import ( Callable, Iterable, List, Mapping, Optional, - Sequence, SupportsFloat, Union, cast, @@ -31,7 +31,10 @@ from kartothek.io_components.read import dispatch_metapartitions_from_factory from kartothek.io_components.update import update_dataset_from_partitions from kartothek.io_components.utils import ( + InferredIndices, _ensure_compatible_indices, + check_single_table_dataset, + normalize_arg, normalize_args, validate_partition_keys, ) @@ -44,7 +47,7 @@ from ._shuffle import shuffle_store_dask_partitions from ._utils import _maybe_get_categoricals_from_index -from .delayed import read_dataset_as_delayed +from .delayed import read_table_as_delayed __all__ = ( "read_dataset_as_ddf", @@ -62,9 +65,11 @@ def read_dataset_as_ddf( store=None, 
table=SINGLE_TABLE, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, - categoricals: Optional[Sequence[str]] = None, - dates_as_object: bool = True, + categoricals=None, + label_filter=None, + dates_as_object=False, predicates=None, factory=None, dask_index_on=None, @@ -97,24 +102,39 @@ def read_dataset_as_ddf( ) ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) + if len(ds_factory.tables) > 1: + warnings.warn( + "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next " + "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube " + "functionality. " + "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html", + DeprecationWarning, + ) + if isinstance(columns, dict): columns = columns[table] meta = _get_dask_meta_for_dataset( - ds_factory, columns, categoricals, dates_as_object + ds_factory, table, columns, categoricals, dates_as_object ) if columns is None: columns = list(meta.columns) # that we can use factories instead of dataset_uuids - delayed_partitions = read_dataset_as_delayed( + delayed_partitions = read_table_as_delayed( factory=ds_factory, + table=table, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, - categoricals=categoricals, + categoricals={table: categoricals}, + label_filter=label_filter, dates_as_object=dates_as_object, predicates=predicates, dispatch_by=dask_index_on if dask_index_on else dispatch_by, @@ -131,11 +151,13 @@ def read_dataset_as_ddf( return dd.from_delayed(delayed_partitions, meta=meta) -def _get_dask_meta_for_dataset(ds_factory, columns, categoricals, dates_as_object): +def _get_dask_meta_for_dataset( + ds_factory, table, columns, categoricals, dates_as_object +): """ Calculate a schema suitable for the dask dataframe meta from the dataset. 
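For orientation, a rough sketch of how such a dask ``meta`` frame is consumed (``delayed_partitions`` and ``meta`` are placeholders for the objects built around this helper, not part of the function itself):

.. code::

    >>> import dask.dataframe as dd
    >>> # `meta` is an empty pandas.DataFrame carrying the expected columns and dtypes;
    >>> # dask uses it to type the resulting dask.dataframe without loading any data.
    >>> ddf = dd.from_delayed(delayed_partitions, meta=meta)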
""" - table_schema = ds_factory.schema + table_schema = ds_factory.table_meta[table] meta = empty_dataframe_from_schema( table_schema, columns=columns, date_as_object=dates_as_object ) @@ -145,10 +167,10 @@ def _get_dask_meta_for_dataset(ds_factory, columns, categoricals, dates_as_objec meta = dd.utils.clear_known_categories(meta, categoricals) categoricals_from_index = _maybe_get_categoricals_from_index( - ds_factory, categoricals + ds_factory, {table: categoricals} ) if categoricals_from_index: - meta = meta.astype(categoricals_from_index) + meta = meta.astype(categoricals_from_index[table]) return meta @@ -238,23 +260,8 @@ def _shuffle_docs(func): return func -def _id(x): - return x - - -def _commit_update_from_reduction(df_mps, **kwargs): - partitions = pd.Series(df_mps.values.flatten()).dropna() - return update_dataset_from_partitions(partition_list=partitions, **kwargs,) - - -def _commit_store_from_reduction(df_mps, **kwargs): - partitions = pd.Series(df_mps.values.flatten()).dropna() - return store_dataset_from_partitions(partition_list=partitions, **kwargs,) - - @default_docs @_shuffle_docs -@normalize_args def store_dataset_from_ddf( ddf: dd.DataFrame, store: StoreInput, @@ -265,6 +272,7 @@ def store_dataset_from_ddf( repartition_ratio: Optional[SupportsFloat] = None, num_buckets: int = 1, sort_partitions_by: Optional[Union[List[str], str]] = None, + delete_scope: Optional[Iterable[Mapping[str, str]]] = None, metadata: Optional[Mapping] = None, df_serializer: Optional[DataFrameSerializer] = None, metadata_merger: Optional[Callable] = None, @@ -276,22 +284,25 @@ def store_dataset_from_ddf( """ Store a dataset from a dask.dataframe. """ - # normalization done by normalize_args but mypy doesn't recognize this - sort_partitions_by = cast(List[str], sort_partitions_by) - secondary_indices = cast(List[str], secondary_indices) - bucket_by = cast(List[str], bucket_by) - partition_on = cast(List[str], partition_on) + partition_on = normalize_arg("partition_on", partition_on) + secondary_indices = normalize_arg("secondary_indices", secondary_indices) + sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by) + bucket_by = normalize_arg("bucket_by", bucket_by) + store = normalize_arg("store", store) + delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope) if table is None: raise TypeError("The parameter `table` is not optional.") - ds_factory = _ensure_factory(dataset_uuid=dataset_uuid, store=store, factory=None) + ds_factory = _ensure_factory( + dataset_uuid=dataset_uuid, store=store, factory=None, load_dataset_metadata=True + ) if not overwrite: raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store) - mp_ser = _write_dataframe_partitions( + mps = _write_dataframe_partitions( ddf=ddf, - store=ds_factory.store_factory, + store=store, dataset_uuid=dataset_uuid, table=table, secondary_indices=secondary_indices, @@ -304,18 +315,12 @@ def store_dataset_from_ddf( partition_on=partition_on, bucket_by=bucket_by, ) - return mp_ser.reduction( - chunk=_id, - aggregate=_commit_store_from_reduction, - split_every=False, - token="commit-dataset", - meta=object, - aggregate_kwargs={ - "store": ds_factory.store_factory, - "dataset_uuid": ds_factory.dataset_uuid, - "dataset_metadata": metadata, - "metadata_merger": metadata_merger, - }, + return dask.delayed(store_dataset_from_partitions)( + mps, + store=ds_factory.store_factory if ds_factory else store, + dataset_uuid=ds_factory.dataset_uuid if ds_factory else dataset_uuid, + dataset_metadata=metadata, + 
metadata_merger=metadata_merger, ) @@ -324,7 +329,7 @@ def _write_dataframe_partitions( store: StoreFactory, dataset_uuid: str, table: str, - secondary_indices: List[str], + secondary_indices: Optional[InferredIndices], shuffle: bool, repartition_ratio: Optional[SupportsFloat], num_buckets: int, @@ -342,11 +347,7 @@ def _write_dataframe_partitions( if ddf is None: mps = dd.from_pandas( pd.Series( - [ - parse_input_to_metapartition( - None, metadata_version=metadata_version, table_name=table, - ) - ] + [parse_input_to_metapartition(None, metadata_version=metadata_version)] ), npartitions=1, ) @@ -383,7 +384,6 @@ def _write_dataframe_partitions( @default_docs @_shuffle_docs -@normalize_args def update_dataset_from_ddf( ddf: dd.DataFrame, store: Optional[StoreInput] = None, @@ -410,15 +410,15 @@ def update_dataset_from_ddf( -------- :ref:`mutating_datasets` """ + partition_on = normalize_arg("partition_on", partition_on) + secondary_indices = normalize_arg("secondary_indices", secondary_indices) + sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by) + bucket_by = normalize_arg("bucket_by", bucket_by) + store = normalize_arg("store", store) + delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope) + if table is None: raise TypeError("The parameter `table` is not optional.") - - # normalization done by normalize_args but mypy doesn't recognize this - sort_partitions_by = cast(List[str], sort_partitions_by) - secondary_indices = cast(List[str], secondary_indices) - bucket_by = cast(List[str], bucket_by) - partition_on = cast(List[str], partition_on) - ds_factory, metadata_version, partition_on = validate_partition_keys( dataset_uuid=dataset_uuid, store=store, @@ -430,9 +430,12 @@ def update_dataset_from_ddf( inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices) del secondary_indices - mp_ser = _write_dataframe_partitions( + if ds_factory is not None: + check_single_table_dataset(ds_factory, table) + + mps = _write_dataframe_partitions( ddf=ddf, - store=ds_factory.store_factory if ds_factory else store, + store=store, dataset_uuid=dataset_uuid or ds_factory.dataset_uuid, table=table, secondary_indices=inferred_indices, @@ -445,21 +448,14 @@ def update_dataset_from_ddf( partition_on=cast(List[str], partition_on), bucket_by=bucket_by, ) - - return mp_ser.reduction( - chunk=_id, - aggregate=_commit_update_from_reduction, - split_every=False, - token="commit-dataset", - meta=object, - aggregate_kwargs={ - "store_factory": store, - "dataset_uuid": dataset_uuid, - "ds_factory": ds_factory, - "delete_scope": delete_scope, - "metadata": metadata, - "metadata_merger": metadata_merger, - }, + return dask.delayed(update_dataset_from_partitions)( + mps, + store_factory=store, + dataset_uuid=dataset_uuid, + ds_factory=ds_factory, + delete_scope=delete_scope, + metadata=metadata, + metadata_merger=metadata_merger, ) @@ -468,6 +464,7 @@ def update_dataset_from_ddf( def collect_dataset_metadata( store: Optional[StoreInput] = None, dataset_uuid: Optional[str] = None, + table_name: str = SINGLE_TABLE, predicates: Optional[PredicatesType] = None, frac: float = 1.0, factory: Optional[DatasetFactory] = None, @@ -519,7 +516,10 @@ def collect_dataset_metadata( "Please make sure to provide a value larger than 0.0 and smaller than or equal to 1.0 ." 
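A hedged usage sketch of ``collect_dataset_metadata`` (``store_factory`` and the dataset uuid are placeholders; the call returns a dask dataframe of per-file parquet statistics that still has to be computed):

.. code::

    >>> stats_ddf = collect_dataset_metadata(
    ...     store=store_factory,
    ...     dataset_uuid="dataset_uuid",
    ...     table_name="table",
    ...     frac=0.1,  # only inspect a random ~10% of the parquet files
    ... )
    >>> stats = stats_ddf.compute()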
) dataset_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) mps = list( @@ -533,7 +533,7 @@ def collect_dataset_metadata( ddf = dd.from_delayed( [ dask.delayed(MetaPartition.get_parquet_metadata)( - mp, store=dataset_factory.store_factory + mp, store=dataset_factory.store_factory, table_name=table_name ) for mp in mps ], @@ -595,7 +595,10 @@ def hash_dataset( If provided, calculate hash per group instead of per partition """ dataset_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) columns = subset diff --git a/kartothek/io/dask/delayed.py b/kartothek/io/dask/delayed.py index e308aa68..479f2896 100644 --- a/kartothek/io/dask/delayed.py +++ b/kartothek/io/dask/delayed.py @@ -1,5 +1,8 @@ +# -*- coding: utf-8 -*- +import warnings +from collections import defaultdict from functools import partial -from typing import List, Optional, Sequence +from typing import List, Optional import dask from dask import delayed @@ -18,6 +21,7 @@ delete_top_level_metadata, ) from kartothek.io_components.gc import delete_files, dispatch_files_to_gc +from kartothek.io_components.merge import align_datasets from kartothek.io_components.metapartition import ( SINGLE_TABLE, MetaPartition, @@ -41,6 +45,7 @@ from ._utils import ( _cast_categorical_to_index_cat, _get_data, + _identity, _maybe_get_categoricals_from_index, map_delayed, ) @@ -75,7 +80,11 @@ def delete_dataset__delayed(dataset_uuid=None, store=None, factory=None): ---------- """ dataset_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, load_schema=False, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_schema=False, + load_dataset_metadata=False, ) gc = garbage_collect_dataset__delayed(factory=dataset_factory) @@ -118,7 +127,10 @@ def garbage_collect_dataset__delayed( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) nested_files = dispatch_files_to_gc( @@ -129,6 +141,108 @@ def garbage_collect_dataset__delayed( ) +def _load_and_merge_mps(mp_list, store, label_merger, metadata_merger, merge_tasks): + mp_list = [mp.load_dataframes(store=store) for mp in mp_list] + mp = MetaPartition.merge_metapartitions( + mp_list, label_merger=label_merger, metadata_merger=metadata_merger + ) + mp = mp.concat_dataframes() + + for task in merge_tasks: + mp = mp.merge_dataframes(**task) + + return mp + + +@default_docs +@normalize_args +def merge_datasets_as_delayed( + left_dataset_uuid, + right_dataset_uuid, + store, + merge_tasks, + match_how="exact", + label_merger=None, + metadata_merger=None, +): + """ + A dask.delayed graph to perform the merge of two full kartothek datasets. + + Parameters + ---------- + left_dataset_uuid : str + UUID for left dataset (order does not matter in all merge schemas) + right_dataset_uuid : str + UUID for right dataset (order does not matter in all merge schemas) + match_how : Union[str, Callable] + Define the partition label matching scheme. + Available implementations are: + + * left (right) : The left (right) partitions are considered to be + the base partitions and **all** partitions of the + right (left) dataset are joined to the left + partition. 
This should only be used if one of the + datasets contain very few partitions. + * prefix : The labels of the partitions of the dataset with fewer + partitions are considered to be the prefixes to the + right dataset + * exact : All partition labels of the left dataset need to have + an exact match in the right dataset + * callable : A callable with signature func(left, right) which + returns a boolean to determine if the partitions match + + If True, an exact match of partition labels between the to-be-merged + datasets is required in order to merge. + If False (Default), the partition labels of the dataset with fewer + partitions are interpreted as prefixes. + merge_tasks : List[Dict] + A list of merge tasks. Each item in this list is a dictionary giving + explicit instructions for a specific merge. + Each dict should contain key/values: + + * `left`: The table for the left dataframe + * `right`: The table for the right dataframe + * 'output_label' : The table for the merged dataframe + * `merge_func`: A callable with signature + `merge_func(left_df, right_df, merge_kwargs)` to + handle the data preprocessing and merging. + Default pandas.merge + * 'merge_kwargs' : The kwargs to be passed to the `merge_func` + + Example: + + .. code:: + + >>> merge_tasks = [ + ... { + ... "left": "left_dict", + ... "right": "right_dict", + ... "merge_kwargs": {"kwargs of merge_func": ''}, + ... "output_label": 'merged_core_data' + ... }, + ... ] + + """ + store = lazy_store(store) + + mps = align_datasets( + left_dataset_uuid=left_dataset_uuid, + right_dataset_uuid=right_dataset_uuid, + store=store, + match_how=match_how, + ) + mps = map_delayed( + _load_and_merge_mps, + mps, + store=store, + label_merger=label_merger, + metadata_merger=metadata_merger, + merge_tasks=merge_tasks, + ) + + return list(mps) + + def _load_and_concat_metapartitions_inner(mps, args, kwargs): return MetaPartition.concat_metapartitions( [mp.load_dataframes(*args, **kwargs) for mp in mps] @@ -146,13 +260,18 @@ def _load_and_concat_metapartitions(list_of_mps, *args, **kwargs): def read_dataset_as_delayed_metapartitions( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, - categoricals: Optional[Sequence[str]] = None, - dates_as_object: bool = True, + categoricals=None, + label_filter=None, + dates_as_object=False, + load_dataset_metadata=False, predicates=None, factory=None, dispatch_by=None, + dispatch_metadata=True, ): """ A collection of dask.delayed objects to retrieve a dataset from store where each @@ -167,18 +286,36 @@ def read_dataset_as_delayed_metapartitions( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=load_dataset_metadata, ) + if len(ds_factory.tables) > 1: + warnings.warn( + "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next " + "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube " + "functionality. 
" + "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html", + DeprecationWarning, + ) + store = ds_factory.store_factory mps = dispatch_metapartitions_from_factory( - dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by, + dataset_factory=ds_factory, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, + label_filter=label_filter, + predicates=predicates, + dispatch_by=dispatch_by, + dispatch_metadata=dispatch_metadata, ) - if dispatch_by is not None: + if concat_partitions_on_primary_index or dispatch_by is not None: mps = _load_and_concat_metapartitions( mps, store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -190,6 +327,7 @@ def read_dataset_as_delayed_metapartitions( MetaPartition.load_dataframes, mps, store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -202,16 +340,15 @@ def read_dataset_as_delayed_metapartitions( ) if categoricals_from_index: - + func_dict = defaultdict(_identity) + func_dict.update( + { + table: partial(_cast_categorical_to_index_cat, categories=cats) + for table, cats in categoricals_from_index.items() + } + ) mps = map_delayed( - partial( # type: ignore - MetaPartition.apply, - func=partial( # type: ignore - _cast_categorical_to_index_cat, categories=categoricals_from_index - ), - type_safe=True, - ), - mps, + partial(MetaPartition.apply, func=func_dict, type_safe=True), mps ) return list(mps) @@ -221,10 +358,13 @@ def read_dataset_as_delayed_metapartitions( def read_dataset_as_delayed( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, predicates=None, factory=None, dispatch_by=None, @@ -240,19 +380,79 @@ def read_dataset_as_delayed( dataset_uuid=dataset_uuid, store=store, factory=factory, + tables=tables, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, + load_dataset_metadata=False, predicates=predicates, dispatch_by=dispatch_by, ) return list(map_delayed(_get_data, mps)) +@default_docs +@normalize_args +def read_table_as_delayed( + dataset_uuid=None, + store=None, + table=SINGLE_TABLE, + columns=None, + concat_partitions_on_primary_index=False, + predicate_pushdown_to_io=True, + categoricals=None, + label_filter=None, + dates_as_object=False, + predicates=None, + factory=None, + dispatch_by=None, +): + """ + A collection of dask.delayed objects to retrieve a single table from + a dataset as partition-individual :class:`~pandas.DataFrame` instances. + + You can transform the collection of ``dask.delayed`` objects into + a ``dask.dataframe`` using the following code snippet. As older kartothek + specifications don't store schema information, this must be provided by + a separate code path. + + .. 
code :: + + >>> import dask.dataframe as dd + >>> ddf_tasks = read_table_as_delayed(…) + >>> meta = … + >>> ddf = dd.from_delayed(ddf_tasks, meta=meta) + + Parameters + ---------- + """ + if not isinstance(columns, dict): + columns = {table: columns} + mps = read_dataset_as_delayed_metapartitions( + dataset_uuid=dataset_uuid, + store=store, + tables=[table], + columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, + predicate_pushdown_to_io=predicate_pushdown_to_io, + categoricals=categoricals, + label_filter=label_filter, + dates_as_object=dates_as_object, + load_dataset_metadata=False, + predicates=predicates, + factory=factory, + dispatch_by=dispatch_by, + dispatch_metadata=False, + ) + return list(map_delayed(partial(_get_data, table=table), mps)) + + @default_docs def update_dataset_from_delayed( - delayed_tasks: List[Delayed], + delayed_tasks, store=None, dataset_uuid=None, delete_scope=None, @@ -264,7 +464,6 @@ def update_dataset_from_delayed( sort_partitions_by=None, secondary_indices=None, factory=None, - table_name=SINGLE_TABLE, ): """ A dask.delayed graph to add and store a list of dictionaries containing @@ -305,7 +504,6 @@ def update_dataset_from_delayed( df_serializer=df_serializer, dataset_uuid=dataset_uuid, sort_partitions_by=sort_partitions_by, - dataset_table_name=table_name, ) return dask.delayed(update_dataset_from_partitions)( @@ -322,7 +520,7 @@ def update_dataset_from_delayed( @default_docs @normalize_args def store_delayed_as_dataset( - delayed_tasks: List[Delayed], + delayed_tasks, store, dataset_uuid=None, metadata=None, @@ -332,7 +530,6 @@ def store_delayed_as_dataset( metadata_version=naming.DEFAULT_METADATA_VERSION, partition_on=None, metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT, - table_name: str = SINGLE_TABLE, secondary_indices=None, ) -> Delayed: """ @@ -352,9 +549,7 @@ def store_delayed_as_dataset( raise_if_indices_overlap(partition_on, secondary_indices) input_to_mps = partial( - parse_input_to_metapartition, - metadata_version=metadata_version, - table_name=table_name, + parse_input_to_metapartition, metadata_version=metadata_version ) mps = map_delayed(input_to_mps, delayed_tasks) diff --git a/kartothek/io/eager.py b/kartothek/io/eager.py index 6ac0f531..79050069 100644 --- a/kartothek/io/eager.py +++ b/kartothek/io/eager.py @@ -1,3 +1,4 @@ +import warnings from functools import partial from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast @@ -44,6 +45,7 @@ ) from kartothek.io_components.write import raise_if_dataset_exists from kartothek.serialization import DataFrameSerializer +from kartothek.serialization._parquet import ParquetSerializer __all__ = ( "delete_dataset", @@ -70,7 +72,11 @@ def delete_dataset(dataset_uuid=None, store=None, factory=None): """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, load_schema=False, store=store, factory=factory, + dataset_uuid=dataset_uuid, + load_schema=False, + store=store, + factory=factory, + load_dataset_metadata=False, ) # Remove possibly unreferenced files @@ -94,10 +100,13 @@ def delete_dataset(dataset_uuid=None, store=None, factory=None): def read_dataset_as_dataframes( dataset_uuid: Optional[str] = None, store=None, + tables: Optional[List[str]] = None, columns: Dict[str, List[str]] = None, + concat_partitions_on_primary_index: bool = False, predicate_pushdown_to_io: bool = True, - categoricals: List[str] = None, - dates_as_object: bool = True, + categoricals: Dict[str, List[str]] = None, + label_filter: 
Callable = None, + dates_as_object: bool = False, predicates: Optional[List[List[Tuple[str, str, Any]]]] = None, factory: Optional[DatasetFactory] = None, dispatch_by: Optional[List[str]] = None, @@ -131,17 +140,24 @@ def read_dataset_as_dataframes( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=True, ) mps = read_dataset_as_metapartitions( + tables=tables, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, predicates=predicates, factory=ds_factory, dispatch_by=dispatch_by, + dispatch_metadata=False, ) return [mp.data for mp in mps] @@ -150,13 +166,17 @@ def read_dataset_as_dataframes( def read_dataset_as_metapartitions( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, predicates=None, factory=None, dispatch_by=None, + dispatch_metadata=True, ): """ Read a dataset as a list of :class:`kartothek.io_components.metapartition.MetaPartition`. @@ -187,31 +207,71 @@ def read_dataset_as_metapartitions( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) + if len(ds_factory.tables) > 1: + warnings.warn( + "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next " + "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube " + "functionality. " + "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html", + DeprecationWarning, + ) + from .iter import read_dataset_as_metapartitions__iterator ds_iter = read_dataset_as_metapartitions__iterator( + tables=tables, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, predicates=predicates, factory=ds_factory, dispatch_by=dispatch_by, + dispatch_metadata=dispatch_metadata, ) return list(ds_iter) +def _check_compatible_list(table, obj, argument_name=""): + if obj is None: + return obj + elif isinstance(obj, dict): + if table not in obj: + raise ValueError( + "Provided table {} is not compatible with input from argument {}.".format( + table, argument_name + ) + ) + return obj + elif isinstance(obj, list): + return {table: obj} + else: + raise TypeError( + "Unknown type encountered for argument {}. 
Expected `list`, got `{}` instead".format( + argument_name, type(obj) + ) + ) + + @default_docs def read_table( dataset_uuid: Optional[str] = None, store=None, + table: Optional[str] = SINGLE_TABLE, columns: Dict[str, List[str]] = None, + concat_partitions_on_primary_index: bool = False, predicate_pushdown_to_io: bool = True, - categoricals: List[str] = None, - dates_as_object: bool = True, + categoricals: Dict[str, List[str]] = None, + label_filter: Callable = None, + dates_as_object: bool = False, predicates: Optional[List[List[Tuple[str, str, Any]]]] = None, factory: Optional[DatasetFactory] = None, ) -> pd.DataFrame: @@ -243,26 +303,46 @@ def read_table( >>> df = read_table(store, 'dataset_uuid', 'core') """ + if concat_partitions_on_primary_index is not False: + warnings.warn( + "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.", + DeprecationWarning, + ) + + if not isinstance(table, str): + raise TypeError("Argument `table` needs to be a string") + + columns = _check_compatible_list(table, columns, "columns") + categoricals = _check_compatible_list(table, categoricals, "categoricals") ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) partitions = read_dataset_as_dataframes( + tables=[table], columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, predicates=predicates, factory=ds_factory, ) - empty_df = empty_dataframe_from_schema(schema=ds_factory.schema, columns=columns,) + empty_df = empty_dataframe_from_schema( + schema=ds_factory.table_meta[table], + columns=columns[table] if columns is not None else None, + ) if categoricals: - empty_df = empty_df.astype({col: "category" for col in categoricals}) - dfs = [partition_data for partition_data in partitions] + [empty_df] + empty_df = empty_df.astype({col: "category" for col in categoricals[table]}) + dfs = [partition_data[table] for partition_data in partitions] + [empty_df] # require meta 4 otherwise, can't construct types/columns if categoricals: - dfs = align_categories(dfs, categoricals) + dfs = align_categories(dfs, categoricals[table]) df = pd.concat(dfs, ignore_index=True, sort=False) # ensure column order @@ -278,8 +358,10 @@ def commit_dataset( store: Optional[StoreInput] = None, dataset_uuid: Optional[str] = None, new_partitions: Optional[Iterable[MetaPartition]] = None, + output_dataset_uuid: Optional[str] = None, delete_scope: Optional[Iterable[Dict[str, Any]]] = None, metadata: Dict = None, + df_serializer: DataFrameSerializer = None, metadata_merger: Callable[[List[Dict]], Dict] = None, default_metadata_version: int = DEFAULT_METADATA_VERSION, partition_on: Optional[Iterable[str]] = None, @@ -357,17 +439,22 @@ def commit_dataset( Input partition to be committed. """ + if output_dataset_uuid is not None: + warnings.warn( + "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + + if df_serializer is not None: + warnings.warn( + "The keyword `df_serializer` is deprecated and will be removed in the next major release.", + DeprecationWarning, + ) if not new_partitions and not metadata and not delete_scope: raise ValueError( "Need to provide either new data, new metadata or a delete scope. 
None of it was provided." ) - if new_partitions: - tables_in_partitions = {mp.table_name for mp in new_partitions} - if len(tables_in_partitions) > 1: - raise RuntimeError( - f"Cannot commit more than one table to a dataset but got tables {sorted(tables_in_partitions)}" - ) store = lazy_store(store) ds_factory, metadata_version, partition_on = validate_partition_keys( dataset_uuid=dataset_uuid, @@ -378,9 +465,7 @@ def commit_dataset( ) mps = parse_input_to_metapartition( - new_partitions, - metadata_version=metadata_version, - table_name=ds_factory.table_name, + new_partitions, metadata_version=metadata_version ) if secondary_indices: @@ -403,7 +488,7 @@ def commit_dataset( def _maybe_infer_files_attribute(metapartition, dataset_uuid): new_mp = metapartition.as_sentinel() for mp in metapartition: - if mp.file is None: + if len(mp.files) == 0: if mp.data is None or len(mp.data) == 0: raise ValueError( "Trying to commit partitions without `data` or `files` information." @@ -434,12 +519,11 @@ def store_dataframes_as_dataset( dfs: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]], metadata: Optional[Dict[str, Dict[str, Any]]] = None, partition_on: Optional[List[str]] = None, - df_serializer: Optional[DataFrameSerializer] = None, + df_serializer: Optional[ParquetSerializer] = None, overwrite: bool = False, secondary_indices=None, - metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT, - metadata_version=DEFAULT_METADATA_VERSION, - table_name: str = SINGLE_TABLE, + metadata_storage_format: str = DEFAULT_METADATA_STORAGE_FORMAT, + metadata_version: int = DEFAULT_METADATA_VERSION, ): """ Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files). @@ -452,9 +536,12 @@ def store_dataframes_as_dataset( The dataframe(s) to be stored. """ - if isinstance(dfs, pd.DataFrame): - raise TypeError( - f"Please pass a list of pandas.DataFrame as input. Instead got {type(dfs)}" + if isinstance(dfs, (pd.DataFrame, dict)): + dfs = [dfs] + warnings.warn( + "Passing a single dataframe instead of an iterable is deprecated and may " + "be removed in the next major release.", + DeprecationWarning, ) return store_dataframes_as_dataset__iter( @@ -468,7 +555,6 @@ def store_dataframes_as_dataset( secondary_indices=secondary_indices, metadata_storage_format=metadata_storage_format, metadata_version=metadata_version, - table_name=table_name, ) @@ -477,13 +563,12 @@ def store_dataframes_as_dataset( def create_empty_dataset_header( store, dataset_uuid, - schema, + table_meta, partition_on=None, metadata=None, overwrite=False, metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT, metadata_version=DEFAULT_METADATA_VERSION, - table_name: str = SINGLE_TABLE, ): """ Create an dataset header without any partitions. 
This may be used in combination @@ -505,16 +590,20 @@ def create_empty_dataset_header( if not overwrite: raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store) - schema = make_meta(schema, origin=table_name, partition_keys=partition_on) - store_schema_metadata( - schema=schema, dataset_uuid=dataset_uuid, store=store, table=table_name, - ) + for table, schema in table_meta.items(): + table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on) + store_schema_metadata( + schema=table_meta[table], + dataset_uuid=dataset_uuid, + store=store, + table=table, + ) dataset_builder = DatasetMetadataBuilder( uuid=dataset_uuid, metadata_version=metadata_version, partition_keys=partition_on, explicit_partitions=False, - schema=schema, + table_meta=table_meta, ) if metadata: for key, value in metadata.items(): @@ -538,12 +627,14 @@ def write_single_partition( store: Optional[KeyValueStore] = None, dataset_uuid: Optional[str] = None, data=None, - df_serializer: Optional[DataFrameSerializer] = None, + metadata: Optional[Dict[str, Dict[str, Any]]] = None, + df_serializer: Optional[ParquetSerializer] = None, + overwrite: bool = False, + metadata_merger=None, metadata_version: int = DEFAULT_METADATA_VERSION, partition_on: Optional[List[str]] = None, factory=None, secondary_indices=None, - table_name: str = SINGLE_TABLE, ): """ Write the parquet file(s) for a single partition. This will **not** update the dataset header and can therefore @@ -574,24 +665,35 @@ def write_single_partition( ------- An empty :class:`~kartothek.io_components.metapartition.MetaPartition` referencing the new files """ + if metadata is not None: + warnings.warn( + "The keyword `metadata` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + + if overwrite is not False: + warnings.warn( + "The keyword `overwrite` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + + if metadata_merger is not None: + warnings.warn( + "The keyword `metadata_merger` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + if data is None: raise TypeError("The parameter `data` is not optional") - dataset_factory, ds_metadata_version, partition_on = validate_partition_keys( + _, ds_metadata_version, partition_on = validate_partition_keys( dataset_uuid=dataset_uuid, store=lazy_store(store), ds_factory=factory, default_metadata_version=metadata_version, partition_on=partition_on, ) - if dataset_factory.table_name: - if dataset_factory.table_name != table_name: - raise RuntimeError( - f"Trying to write a partition with table name {table_name} but dataset {dataset_factory.dataset_uuid} has already table {dataset_factory.table_name}." 
- ) - mp = parse_input_to_metapartition( - obj=data, metadata_version=ds_metadata_version, table_name=table_name - ) + mp = parse_input_to_metapartition(obj=data, metadata_version=ds_metadata_version) if partition_on: mp = mp.partition_on(partition_on) @@ -614,13 +716,14 @@ def update_dataset_from_dataframes( dataset_uuid: Optional[str] = None, delete_scope=None, metadata=None, - df_serializer: Optional[DataFrameSerializer] = None, + df_serializer: Optional[ParquetSerializer] = None, metadata_merger: Callable = None, + central_partition_metadata: bool = True, default_metadata_version: int = DEFAULT_METADATA_VERSION, partition_on: Optional[List[str]] = None, + load_dynamic_metadata: bool = True, sort_partitions_by: Optional[str] = None, secondary_indices: Optional[List[str]] = None, - table_name: str = SINGLE_TABLE, factory: Optional[DatasetFactory] = None, ) -> DatasetMetadata: """ @@ -641,6 +744,18 @@ def update_dataset_from_dataframes( -------- :ref:`mutating_datasets` """ + if load_dynamic_metadata is not True: + warnings.warn( + "The keyword `load_dynamic_metadata` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + + if central_partition_metadata is not True: + warnings.warn( + "The keyword `central_partition_metadata` has no use and will be removed in the next major release ", + DeprecationWarning, + ) + ds_factory, metadata_version, partition_on = validate_partition_keys( dataset_uuid=dataset_uuid, store=store, @@ -649,14 +764,13 @@ def update_dataset_from_dataframes( partition_on=partition_on, ) - # ensured by normalize_args but mypy doesn't recognize it - assert secondary_indices is not None - inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices) del secondary_indices mp = parse_input_to_metapartition( - df_list, metadata_version=metadata_version, table_name=table_name, + df_list, + metadata_version=metadata_version, + expected_secondary_indices=inferred_indices, ) if sort_partitions_by: @@ -696,14 +810,25 @@ def build_dataset_indices(store, dataset_uuid, columns, factory=None): ---------- """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) - cols_to_load = set(columns) & set(ds_factory.schema.names) + cols_to_load = { + table: set(columns) & set(meta.names) + for table, meta in ds_factory.table_meta.items() + } + cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols} new_partitions = [] for mp in dispatch_metapartitions_from_factory(ds_factory): - mp = mp.load_dataframes(store=ds_factory.store, columns=cols_to_load,) + mp = mp.load_dataframes( + store=ds_factory.store, + tables=list(cols_to_load.keys()), + columns=cols_to_load, + ) mp = mp.build_indices(columns=columns) mp = mp.remove_dataframes() # Remove dataframe from memory new_partitions.append(mp) @@ -728,7 +853,10 @@ def garbage_collect_dataset(dataset_uuid=None, store=None, factory=None): """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=False, ) nested_files = dispatch_files_to_gc( diff --git a/kartothek/io/iter.py b/kartothek/io/iter.py index df0ebce1..87b4533f 100644 --- a/kartothek/io/iter.py +++ b/kartothek/io/iter.py @@ -1,3 +1,6 @@ +# -*- coding: utf-8 -*- + +import warnings from functools import partial from typing import cast @@ -6,7 +9,6 @@ from kartothek.core.naming 
import ( DEFAULT_METADATA_STORAGE_FORMAT, DEFAULT_METADATA_VERSION, - SINGLE_TABLE, ) from kartothek.core.uuid import gen_uuid from kartothek.io_components.metapartition import ( @@ -39,13 +41,18 @@ def read_dataset_as_metapartitions__iterator( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, + load_dataset_metadata=False, predicates=None, factory=None, dispatch_by=None, + dispatch_metadata=True, ): """ @@ -62,20 +69,38 @@ def read_dataset_as_metapartitions__iterator( """ ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=factory, + dataset_uuid=dataset_uuid, + store=store, + factory=factory, + load_dataset_metadata=load_dataset_metadata, ) + if len(ds_factory.tables) > 1: + warnings.warn( + "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next " + "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube " + "functionality. " + "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html", + DeprecationWarning, + ) + store = ds_factory.store mps = dispatch_metapartitions_from_factory( - ds_factory, predicates=predicates, dispatch_by=dispatch_by, + ds_factory, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, + label_filter=label_filter, + predicates=predicates, + dispatch_by=dispatch_by, + dispatch_metadata=dispatch_metadata, ) for mp in mps: - if dispatch_by is not None: + if concat_partitions_on_primary_index or dispatch_by is not None: mp = MetaPartition.concat_metapartitions( [ mp_inner.load_dataframes( store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -88,6 +113,7 @@ def read_dataset_as_metapartitions__iterator( mp = cast(MetaPartition, mp) mp = mp.load_dataframes( store=store, + tables=tables, columns=columns, categoricals=categoricals, predicate_pushdown_to_io=predicate_pushdown_to_io, @@ -102,10 +128,13 @@ def read_dataset_as_metapartitions__iterator( def read_dataset_as_dataframes__iterator( dataset_uuid=None, store=None, + tables=None, columns=None, + concat_partitions_on_primary_index=False, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object: bool = True, + label_filter=None, + dates_as_object=False, predicates=None, factory=None, dispatch_by=None, @@ -151,13 +180,18 @@ def read_dataset_as_dataframes__iterator( mp_iter = read_dataset_as_metapartitions__iterator( dataset_uuid=dataset_uuid, store=store, + tables=tables, columns=columns, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, predicate_pushdown_to_io=predicate_pushdown_to_io, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, + load_dataset_metadata=False, predicates=predicates, factory=factory, dispatch_by=dispatch_by, + dispatch_metadata=False, ) for mp in mp_iter: yield mp.data @@ -173,12 +207,13 @@ def update_dataset_from_dataframes__iter( metadata=None, df_serializer=None, metadata_merger=None, + central_partition_metadata=True, default_metadata_version=DEFAULT_METADATA_VERSION, partition_on=None, + load_dynamic_metadata=True, sort_partitions_by=None, secondary_indices=None, factory=None, - table_name: str = SINGLE_TABLE, ): """ Update a kartothek dataset in store iteratively, using a generator of dataframes. 
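A brief sketch of the generator-based update pattern described above (``store_factory`` is a placeholder store factory and the column names are illustrative only):

.. code::

    >>> import pandas as pd
    >>> def df_generator():
    ...     for i in range(3):
    ...         yield pd.DataFrame({"p": [i], "x": [i * 10]})
    >>> dm = update_dataset_from_dataframes__iter(
    ...     df_generator(),
    ...     store=store_factory,
    ...     dataset_uuid="dataset_uuid",
    ...     partition_on=["p"],
    ... )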
@@ -196,7 +231,17 @@ def update_dataset_from_dataframes__iter( -------- :ref:`mutating_datasets` """ + if load_dynamic_metadata is not True: + warnings.warn( + "The keyword `load_dynamic_metadata` has no use and will be removed soon", + DeprecationWarning, + ) + if central_partition_metadata is not True: + warnings.warn( + "The keyword `central_partition_metadata` has no use and will be removed in the next major release ", + DeprecationWarning, + ) ds_factory, metadata_version, partition_on = validate_partition_keys( dataset_uuid=dataset_uuid, store=store, @@ -215,7 +260,9 @@ def update_dataset_from_dataframes__iter( new_partitions = [] for df in df_generator: mp = parse_input_to_metapartition( - df, metadata_version=metadata_version, table_name=table_name, + df, + metadata_version=metadata_version, + expected_secondary_indices=secondary_indices, ) if sort_partitions_by: @@ -258,7 +305,6 @@ def store_dataframes_as_dataset__iter( metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT, metadata_version=DEFAULT_METADATA_VERSION, secondary_indices=None, - table_name: str = SINGLE_TABLE, ): """ Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files). @@ -285,9 +331,7 @@ def store_dataframes_as_dataset__iter( new_partitions = [] for df in df_generator: - mp = parse_input_to_metapartition( - df, metadata_version=metadata_version, table_name=table_name - ) + mp = parse_input_to_metapartition(df, metadata_version=metadata_version) if partition_on: mp = mp.partition_on(partition_on) diff --git a/kartothek/io/testing/build_cube.py b/kartothek/io/testing/build_cube.py index e8b8b620..26b68e5e 100644 --- a/kartothek/io/testing/build_cube.py +++ b/kartothek/io/testing/build_cube.py @@ -11,10 +11,12 @@ KTK_CUBE_METADATA_KEY_IS_SEED, KTK_CUBE_METADATA_PARTITION_COLUMNS, KTK_CUBE_METADATA_SUPPRESS_INDEX_ON, + KTK_CUBE_METADATA_VERSION, ) from kartothek.core.cube.cube import Cube from kartothek.core.dataset import DatasetMetadata from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex +from kartothek.io.eager import store_dataframes_as_dataset from kartothek.io.testing.utils import assert_num_row_groups from kartothek.io_components.cube.write import MultiTableCommitAborted from kartothek.io_components.metapartition import SINGLE_TABLE @@ -57,6 +59,7 @@ "test_nones", "test_overwrite", "test_overwrite_rollback_ktk_cube", + "test_overwrite_rollback_ktk", "test_parquet", "test_partition_on_enrich_extra", "test_partition_on_enrich_none", @@ -91,7 +94,7 @@ def test_simple_seed_only(driver, function_store): assert isinstance(ds.indices["p"], PartitionIndex) assert isinstance(ds.indices["x"], ExplicitSecondaryIndex) - assert ds.table_name == SINGLE_TABLE + assert set(ds.table_meta) == {SINGLE_TABLE} def test_simple_two_datasets(driver, function_store): @@ -132,8 +135,8 @@ def test_simple_two_datasets(driver, function_store): assert set(ds_enrich.indices.keys()) == {"p"} assert isinstance(ds_enrich.indices["p"], PartitionIndex) - assert ds_source.table_name == SINGLE_TABLE - assert ds_enrich.table_name == SINGLE_TABLE + assert set(ds_source.table_meta) == {SINGLE_TABLE} + assert set(ds_enrich.table_meta) == {SINGLE_TABLE} def test_indices(driver, function_store): @@ -1030,7 +1033,6 @@ def test_fails_null_index(driver, function_store): assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("seed"), function_store()) -@pytest.mark.xfail(reason="different") def test_fail_all_empty(driver, driver_name, function_store): """ Might happen due to DB-based filters. 
@@ -1129,7 +1131,81 @@ def test_overwrite_rollback_ktk_cube(driver, function_store): assert isinstance(ds_enrich.indices["p"], PartitionIndex) assert set(ds_enrich.indices["i2"].index_dct.keys()) == {20, 21, 22, 23} - assert ds_source.schema.field("v1").type == pa.int64() + assert ds_source.table_meta[SINGLE_TABLE].field("v1").type == pa.int64() + + +def test_overwrite_rollback_ktk(driver, function_store): + """ + Checks that require a rollback (like overlapping columns) should recover the former state correctly. + """ + cube = Cube( + dimension_columns=["x"], + partition_columns=["p"], + uuid_prefix="cube", + seed_dataset="source", + index_columns=["i1", "i2", "i3", "i4"], + ) + + df_source1 = pd.DataFrame( + { + "x": [0, 1, 2, 3], + "p": [0, 0, 1, 1], + "v1": [10, 11, 12, 13], + "i1": [10, 11, 12, 13], + } + ) + df_enrich1 = pd.DataFrame( + { + "x": [0, 1, 2, 3], + "p": [0, 0, 1, 1], + "i2": [20, 21, 22, 23], + "v1": [20, 21, 22, 23], + } + ) + store_dataframes_as_dataset( + dfs=[{"ktk_source": df_source1, "ktk_enrich": df_enrich1}], + store=function_store, + dataset_uuid=cube.ktk_dataset_uuid(cube.seed_dataset), + metadata_version=KTK_CUBE_METADATA_VERSION, + secondary_indices=["i1", "i2"], + ) + + df_source2 = pd.DataFrame( + { + "x": [10, 11], + "p": [10, 10], + "v1": [10.0, 11.0], # also use another dtype here (was int) + "i3": [10, 11], + } + ) + df_enrich2 = pd.DataFrame( + { + "x": [10, 11], + "p": [10, 10], + "v1": [20.0, 21.0], # also use another dtype here (was int) + "i4": [20, 21], + } + ) + with pytest.raises(MultiTableCommitAborted) as exc_info: + driver( + data={"source": df_source2, "enrich": df_enrich2}, + cube=cube, + store=function_store, + overwrite=True, + ) + cause = exc_info.value.__cause__ + assert str(cause).startswith("Found columns present in multiple datasets:") + + ds_source = DatasetMetadata.load_from_store( + uuid=cube.ktk_dataset_uuid(cube.seed_dataset), store=function_store() + ).load_all_indices(function_store()) + + assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset) + + assert len(ds_source.partitions) == 1 + + assert ds_source.table_meta["ktk_source"].field("v1").type == pa.int64() + assert ds_source.table_meta["ktk_enrich"].field("v1").type == pa.int64() @pytest.mark.parametrize("none_first", [False, True]) @@ -1158,7 +1234,7 @@ def test_nones(driver, function_store, none_first, driver_name): assert isinstance(ds.indices["p"], PartitionIndex) assert isinstance(ds.indices["x"], ExplicitSecondaryIndex) - assert ds.table_name == SINGLE_TABLE + assert set(ds.table_meta) == {SINGLE_TABLE} def test_fail_not_a_df(driver, function_store): @@ -1338,8 +1414,8 @@ def test_partition_on_enrich_none(driver, function_store): assert set(ds_enrich.indices.keys()) == set() - assert ds_source.table_name == SINGLE_TABLE - assert ds_enrich.table_name == SINGLE_TABLE + assert set(ds_source.table_meta) == {SINGLE_TABLE} + assert set(ds_enrich.table_meta) == {SINGLE_TABLE} def test_partition_on_enrich_extra(driver, function_store): @@ -1381,8 +1457,8 @@ def test_partition_on_enrich_extra(driver, function_store): assert isinstance(ds_enrich.indices["p"], PartitionIndex) assert isinstance(ds_enrich.indices["x"], PartitionIndex) - assert ds_source.table_name == SINGLE_TABLE - assert ds_enrich.table_name == SINGLE_TABLE + assert set(ds_source.table_meta) == {SINGLE_TABLE} + assert set(ds_enrich.table_meta) == {SINGLE_TABLE} def test_partition_on_index_column(driver, function_store): @@ -1424,8 +1500,8 @@ def test_partition_on_index_column(driver, function_store): 
assert set(ds_enrich.indices.keys()) == {"i"} assert isinstance(ds_enrich.indices["i"], PartitionIndex) - assert ds_source.table_name == SINGLE_TABLE - assert ds_enrich.table_name == SINGLE_TABLE + assert set(ds_source.table_meta) == {SINGLE_TABLE} + assert set(ds_enrich.table_meta) == {SINGLE_TABLE} def test_fail_partition_on_nondistinc_payload(driver, function_store): diff --git a/kartothek/io/testing/cleanup_cube.py b/kartothek/io/testing/cleanup_cube.py index 6c97ff4e..db408266 100644 --- a/kartothek/io/testing/cleanup_cube.py +++ b/kartothek/io/testing/cleanup_cube.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPARATOR +from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR from kartothek.core.cube.cube import Cube from kartothek.io.eager_cube import build_cube, copy_cube @@ -185,7 +185,7 @@ def test_additional_files(driver, function_store): key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo" key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo" key_with_cube_prefix = cube.uuid_prefix + ".foo" - key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo" + key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPERATOR + ".foo" function_store().put(key_in_ds, b"") function_store().put(key_with_ds_prefix, b"") diff --git a/kartothek/io/testing/extend_cube.py b/kartothek/io/testing/extend_cube.py index 866999fd..abedf734 100644 --- a/kartothek/io/testing/extend_cube.py +++ b/kartothek/io/testing/extend_cube.py @@ -81,7 +81,7 @@ def test_simple(driver, function_store, existing_cube): assert isinstance(ds.indices["p"], PartitionIndex) assert isinstance(ds.indices["i3"], ExplicitSecondaryIndex) - assert ds.table_name == SINGLE_TABLE + assert set(ds.table_meta) == {SINGLE_TABLE} @pytest.mark.parametrize("chunk_size", [None, 2]) diff --git a/kartothek/io/testing/index.py b/kartothek/io/testing/index.py index 73e0e310..c68db601 100644 --- a/kartothek/io/testing/index.py +++ b/kartothek/io/testing/index.py @@ -5,6 +5,7 @@ from toolz.dicttoolz import valmap from kartothek.core.factory import DatasetFactory +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.io.eager import store_dataframes_as_dataset @@ -17,8 +18,8 @@ def assert_index_dct_equal(dict1, dict2): def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices): dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": [1, 2]}), - pd.DataFrame({"p": [2, 3]}), + {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]}, + {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]}, ] dataset = store_dataframes_as_dataset( @@ -35,15 +36,8 @@ def test_build_indices(store_factory, metadata_version, bound_build_dataset_indi # Assert indices are properly created dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True) - index_dct = dataset_factory.indices["p"].index_dct - - assert len(index_dct[1]) == 1 - assert len(index_dct[2]) == 2 - assert len(index_dct[3]) == 1 - - assert len(set(index_dct[3]) & set(index_dct[2])) == 1 - assert len(set(index_dct[1]) & set(index_dct[2])) == 1 - assert len(set(index_dct[1]) & set(index_dct[3])) == 0 + expected = {2: ["cluster_1", "cluster_2"], 3: ["cluster_2"], 1: ["cluster_1"]} + assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct) def test_create_index_from_inexistent_column_fails( @@ -51,8 +45,8 @@ def 
test_create_index_from_inexistent_column_fails( ): dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": [1, 2]}), - pd.DataFrame({"p": [2, 3]}), + {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]}, + {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]}, ] store_dataframes_as_dataset( @@ -71,8 +65,24 @@ def test_add_column_to_existing_index( ): dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": [1, 2], "x": [100, 4500]}), - pd.DataFrame({"p": [4, 3], "x": [500, 10]}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1, 2], "x": [100, 4500]}))], + "indices": { + "p": ExplicitSecondaryIndex( + "p", index_dct={1: ["cluster_1"], 2: ["cluster_1"]} + ) + }, + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": [4, 3], "x": [500, 10]}))], + "indices": { + "p": ExplicitSecondaryIndex( + "p", index_dct={4: ["cluster_2"], 3: ["cluster_2"]} + ) + }, + }, ] dataset = store_dataframes_as_dataset( @@ -80,7 +90,6 @@ def test_add_column_to_existing_index( store=store_factory, dataset_uuid=dataset_uuid, metadata_version=metadata_version, - secondary_indices="p", ) assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"} @@ -105,17 +114,20 @@ def test_indices_uints(store_factory, metadata_version, bound_build_dataset_indi p3 = 17128351978467489013 partitions = [ - pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}), - pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}), - pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}))], + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}))], + }, + { + "label": "cluster_3", + "data": [("core", pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}))], + }, ] - - def assert_expected(index_dct): - assert len(index_dct) == 3 - referenced_partitions = [] - for val in index_dct.values(): - referenced_partitions.extend(val) - assert len(referenced_partitions) == 3 + expected = {p1: ["cluster_1"], p2: ["cluster_2"], p3: ["cluster_3"]} dataset = store_dataframes_as_dataset( dfs=partitions, @@ -131,24 +143,30 @@ def assert_expected(index_dct): # Assert indices are properly created dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True) - assert_expected(dataset_factory.indices["p"].index_dct) - first_run = dataset_factory.indices["p"].index_dct.copy() + assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct) # Re-create indices bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"]) # Assert indices are properly created dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True) - assert_index_dct_equal(first_run, dataset_factory.indices["p"].index_dct) + assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct) def test_empty_partitions(store_factory, metadata_version, bound_build_dataset_indices): dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": pd.Series([], dtype=np.int8)}), - pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": pd.Series([], dtype=np.int8)}))], + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}))], + }, ] + expected = {1: ["cluster_2"]} dataset = store_dataframes_as_dataset( dfs=partitions, @@ -164,4 +182,4 @@ def 
test_empty_partitions(store_factory, metadata_version, bound_build_dataset_i # Assert indices are properly created dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True) - assert len(dataset_factory.indices["p"].index_dct) == 1 + assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct) diff --git a/kartothek/io/testing/merge.py b/kartothek/io/testing/merge.py new file mode 100644 index 00000000..ec3115c8 --- /dev/null +++ b/kartothek/io/testing/merge.py @@ -0,0 +1,91 @@ +from collections import OrderedDict +from datetime import date + +import pandas as pd +import pandas.testing as pdt + +from kartothek.io_components.metapartition import SINGLE_TABLE + +MERGE_TASKS = [ + { + "left": SINGLE_TABLE, + "right": "helper", + "merge_kwargs": {"how": "left", "sort": False, "copy": False}, + "output_label": "first_output", + }, + { + "left": "first_output", + "right": "PRED", + "merge_kwargs": {"how": "left", "sort": False, "copy": False}, + "output_label": "final", + }, +] + +MERGE_EXP_CL1 = pd.DataFrame( + OrderedDict( + [ + ("P", [1, 1]), + ("L", [1, 1]), + ("TARGET", [1, 1]), + ("HORIZON", [1, 2]), + ("info", ["a", "a"]), + ("PRED", [10, 20]), + ("DATE", pd.to_datetime([date(2010, 1, 1), date(2010, 1, 1)])), + ] + ) +) + +MERGE_EXP_CL2 = pd.DataFrame( + OrderedDict( + [ + ("P", [2, 2]), + ("L", [2, 2]), + ("TARGET", [2, 2]), + ("HORIZON", [1, 2]), + ("info", ["b", "b"]), + ("PRED", [10, 20]), + ("DATE", pd.to_datetime([date(2009, 12, 31), date(2009, 12, 31)])), + ] + ) +) + + +def test_merge_datasets( + dataset, + evaluation_dataset, + store_factory, + store_session_factory, + frozen_time, + bound_merge_datasets, +): + # In the __pipeline case, we also need to check that the write path is + # correct, the tests for it are much larger. + df_list = bound_merge_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session_factory, + merge_tasks=MERGE_TASKS, + match_how="prefix", + ) + df_list = [mp.data for mp in df_list] + + # Two partitions + assert len(df_list) == 2 + assert len(df_list[1]) == 1 + assert len(df_list[0]) == 1 + # By using values() this test is agnostic to the used key, which is + # currently not of any importance + pdt.assert_frame_equal( + list(df_list[0].values())[0], + MERGE_EXP_CL1, + check_like=True, + check_dtype=False, + check_categorical=False, + ) + pdt.assert_frame_equal( + list(df_list[1].values())[0], + MERGE_EXP_CL2, + check_like=True, + check_dtype=False, + check_categorical=False, + ) diff --git a/kartothek/io/testing/read.py b/kartothek/io/testing/read.py index 073fc06b..4dc3c8cb 100644 --- a/kartothek/io/testing/read.py +++ b/kartothek/io/testing/read.py @@ -21,21 +21,27 @@ Feature toggles (optional): +* ``custom_read_parameters`` - Pass additional backend specific kwargs to the read function. The fixture should return a dict which can be passed using the double asterisks syntax to the callable. + The following fixtures should be present (see tests.read.conftest) * ``use_categoricals`` - Whether or not the call retrievs categorical data. * ``dates_as_object`` - Whether or not the call retrievs date columns as objects. +* ``label_filter`` - a callable to filter partitions by label. 
""" import datetime +from distutils.version import LooseVersion from functools import partial from itertools import permutations import pandas as pd import pandas.testing as pdt +import pyarrow as pa import pytest from storefact import get_store_from_url +from kartothek.core.uuid import gen_uuid from kartothek.io.eager import store_dataframes_as_dataset from kartothek.io.iter import store_dataframes_as_dataset__iter from kartothek.io_components.metapartition import SINGLE_TABLE, MetaPartition @@ -51,11 +57,36 @@ def dates_as_object(request): return request.param +@pytest.fixture( + params=[True, False], + ids=["load_dataset_metadata_TRUE", "load_dataset_metadata_FALSE"], +) +def load_dataset_metadata(request): + return request.param + + +@pytest.fixture(params=[None, lambda part_label: "cluster_1" in part_label]) +def label_filter(request): + return request.param + + +@pytest.fixture +def custom_read_parameters(): + return {} + + @pytest.fixture(params=[True, False], ids=["use_factory", "no_factory"]) def use_dataset_factory(request, dates_as_object): return request.param +def _strip_unused_categoricals(df): + for col in df.columns: + if pd.api.types.is_categorical_dtype(df[col]): + df[col] = df[col].cat.remove_unused_categories() + return df + + class NoPickle: def __getstate__(self): raise RuntimeError("do NOT pickle this object!") @@ -138,6 +169,7 @@ def _perform_read_test( execute_read_callable, use_categoricals, output_type, + label_filter, dates_as_object, read_kwargs=None, ds_factory=None, @@ -146,7 +178,7 @@ def _perform_read_test( read_kwargs = {} if use_categoricals: # dataset_with_index has an index on L but not on P - categoricals = ["P", "L"] + categoricals = {SINGLE_TABLE: ["P", "L"]} else: categoricals = None @@ -155,11 +187,17 @@ def _perform_read_test( store=store_factory, factory=ds_factory, categoricals=categoricals, + label_filter=label_filter, dates_as_object=dates_as_object, **read_kwargs, ) - assert len(result) == 2 + # The filter should allow only a single partition + if not label_filter: + assert len(result) == 2 + else: + # The filter should allow only a single partition + assert len(result) == 1 if output_type == "metapartition": for res in result: @@ -170,6 +208,7 @@ def sort_by(obj): return obj[SINGLE_TABLE].P.iloc[0] elif output_type == "table": + assert isinstance(result[0], pd.DataFrame) assert "P" in result[0] @@ -177,26 +216,30 @@ def sort_by(obj): return obj.P.iloc[0] else: - assert isinstance(result[0], pd.DataFrame) - assert "P" in result[0] + assert isinstance(result[0], dict) + assert SINGLE_TABLE in result[0] + assert "P" in result[0][SINGLE_TABLE] def sort_by(obj): - return obj.P.iloc[0] + return obj[SINGLE_TABLE].P.iloc[0] result = sorted(result, key=sort_by) expected_df_core_1 = pd.DataFrame( {"P": [1], "L": [1], "TARGET": [1], "DATE": [datetime.date(2010, 1, 1)]} ) + expected_df_helper_1 = pd.DataFrame({"P": [1], "info": "a"}) expected_df_core_2 = pd.DataFrame( {"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]} ) + expected_df_helper_2 = pd.DataFrame({"P": [2], "info": "b"}) + expected_dfs = [ - expected_df_core_1, - expected_df_core_2, + (expected_df_core_1, expected_df_helper_1), + (expected_df_core_2, expected_df_helper_2), ] - for res, expected_df_core in zip(result, expected_dfs): + for res, (expected_df_core, expected_df_helper) in zip(result, expected_dfs): if not dates_as_object: expected_df_core["DATE"] = pd.to_datetime(expected_df_core["DATE"]) if use_categoricals: @@ -204,13 +247,24 @@ def sort_by(obj): {"P": 
"category", "L": "category"} ) - pdt.assert_frame_equal( - res.reset_index(drop=True), - expected_df_core.reset_index(drop=True), - check_dtype=False, - check_like=True, - check_categorical=False, - ) + if output_type == "table": + + pdt.assert_frame_equal( + _strip_unused_categoricals(res).reset_index(drop=True), + expected_df_core.reset_index(drop=True), + check_dtype=False, + check_like=True, + ) + else: + actual_core = _strip_unused_categoricals(res[SINGLE_TABLE]) + actual_helper = _strip_unused_categoricals(res["helper"]) + assert len(res) == 2 + pdt.assert_frame_equal( + actual_core, expected_df_core, check_dtype=False, check_like=True + ) + pdt.assert_frame_equal( + actual_helper, expected_df_helper, check_dtype=False, check_like=True + ) @pytest.mark.parametrize( @@ -224,21 +278,40 @@ def sort_by(obj): ], ) def test_read_dataset_as_dataframes_predicate( - dataset, store_session_factory, bound_load_dataframes, predicates, output_type + dataset, + store_session_factory, + custom_read_parameters, + bound_load_dataframes, + predicates, + output_type, + backend_identifier, ): if output_type != "dataframe": pytest.skip() result = bound_load_dataframes( - dataset_uuid=dataset.uuid, store=store_session_factory, predicates=predicates, + dataset_uuid=dataset.uuid, + store=store_session_factory, + predicates=predicates, + **custom_read_parameters, ) - core_result = pd.concat(result) + core_result = pd.concat([data[SINGLE_TABLE] for data in result]) expected_core = pd.DataFrame( - {"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]} + { + "P": [2], + "L": [2], + "TARGET": [2], + "DATE": pd.to_datetime([datetime.date(2009, 12, 31)]), + } ) pdt.assert_frame_equal( core_result, expected_core, check_dtype=False, check_like=True ) + helper_result = pd.concat([data["helper"] for data in result]) + expected_helper = pd.DataFrame({"P": [2], "info": "b"}) + pdt.assert_frame_equal( + helper_result, expected_helper, check_dtype=False, check_like=True + ) @pytest.mark.parametrize( @@ -255,6 +328,7 @@ def test_read_dataset_as_dataframes_predicate( def test_read_dataset_as_dataframes_predicate_with_partition_keys( dataset_partition_keys, store_session_factory, + custom_read_parameters, bound_load_dataframes, predicates, output_type, @@ -265,12 +339,19 @@ def test_read_dataset_as_dataframes_predicate_with_partition_keys( dataset_uuid=dataset_partition_keys.uuid, store=store_session_factory, predicates=predicates, + tables=[SINGLE_TABLE], + **custom_read_parameters, ) - core_result = pd.concat(result) + core_result = pd.concat([data[SINGLE_TABLE] for data in result]) expected_core = pd.DataFrame( - {"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]} + { + "P": [2], + "L": [2], + "TARGET": [2], + "DATE": pd.to_datetime([datetime.date(2009, 12, 31)]), + } ) pdt.assert_frame_equal( core_result, expected_core, check_dtype=False, check_like=True @@ -278,7 +359,11 @@ def test_read_dataset_as_dataframes_predicate_with_partition_keys( def test_read_dataset_as_dataframes_predicate_empty( - dataset_partition_keys, store_session_factory, output_type, bound_load_dataframes, + dataset_partition_keys, + store_session_factory, + custom_read_parameters, + output_type, + bound_load_dataframes, ): if output_type != "dataframe": pytest.skip() @@ -286,14 +371,57 @@ def test_read_dataset_as_dataframes_predicate_empty( dataset_uuid=dataset_partition_keys.uuid, store=store_session_factory, predicates=[[("P", "==", -42)]], + tables=[SINGLE_TABLE], columns={SINGLE_TABLE: ["P", "L", 
"TARGET"]}, + **custom_read_parameters, ) assert len(result) == 0 def _gen_partition(b_c): b, c = b_c - return pd.DataFrame({"a": [1], "b": [b], "c": c}) + df = pd.DataFrame({"a": [1], "b": [b], "c": c}) + return {"data": [("data", df)]} + + +def test_read_dataset_as_dataframes_concat_primary( + store_factory, + custom_read_parameters, + bound_load_dataframes, + output_type, + metadata_version, +): + if output_type != "dataframe": + pytest.skip() + partitions = [] + for part_info in [["1", "H"], ["1", "G"], ["2", "H"], ["2", "G"]]: + partitions.append(_gen_partition(part_info)) + + store_dataframes_as_dataset( + dfs=partitions, + store=store_factory, + dataset_uuid="partitioned_uuid", + metadata_version=metadata_version, + partition_on=["a", "b"], + ) + + result = bound_load_dataframes( + dataset_uuid="partitioned_uuid", + store=store_factory, + concat_partitions_on_primary_index=True, + predicates=[[("b", "==", "1")]], + **custom_read_parameters, + ) + result_df = result[0]["data"].sort_values(by="c") + + expected_df = pd.DataFrame( + {"a": [1, 1], "b": ["1", "1"], "c": ["G", "H"]} + ).sort_values(by="c") + # Concatenated DataFrames have also a concatenated index. + # Reflect this in the test. + expected_df.index = [0, 0] + + pdt.assert_frame_equal(expected_df, result_df, check_like=True) def test_read_dataset_as_dataframes_dispatch_by_empty( @@ -338,7 +466,11 @@ def test_read_dataset_as_dataframes_dispatch_by_single_col( ) unique_a = set() - for data in dispatched_a: + for part in dispatched_a: + if isinstance(part, MetaPartition): + data = part.data["data"] + else: + data = part["data"] unique_dispatch = data[dispatch_by].unique() assert len(unique_dispatch) == 1 assert unique_dispatch[0] not in unique_a @@ -368,8 +500,10 @@ def dataset_dispatch_by( ) clusters = [cluster1, cluster2, cluster3, cluster4] + partitions = [{"data": [("data", c)]} for c in clusters] + store_dataframes_as_dataset__iter( - df_generator=clusters, + df_generator=partitions, store=store_session_factory, dataset_uuid=dataset_dispatch_by_uuid, metadata_version=metadata_version, @@ -397,9 +531,9 @@ def test_read_dataset_as_dataframes_dispatch_by_multi_col( uniques = pd.DataFrame(columns=dispatch_by) for part in dispatched: if isinstance(part, MetaPartition): - data = part.data + data = part.data["data"] else: - data = part + data = part["data"] unique_dispatch = data[list(dispatch_by)].drop_duplicates() assert len(unique_dispatch) == 1 row = unique_dispatch @@ -452,6 +586,7 @@ def test_read_dataset_as_dataframes( bound_load_dataframes, use_categoricals, output_type, + label_filter, dates_as_object, ): if use_dataset_factory: @@ -470,6 +605,7 @@ def test_read_dataset_as_dataframes( execute_read_callable=bound_load_dataframes, use_categoricals=use_categoricals, output_type=output_type, + label_filter=label_filter, dates_as_object=dates_as_object, ) @@ -477,10 +613,12 @@ def test_read_dataset_as_dataframes( def test_read_dataset_as_dataframes_columns_projection( store_factory, bound_load_dataframes, metadata_version ): + table_name = SINGLE_TABLE + def _f(b_c): b, c = b_c df = pd.DataFrame({"a": [1, 1], "b": [b, b], "c": c, "d": [b, b + 1]}) - return df + return {"label": str(c), "data": [(table_name, df)]} in_partitions = [_f([1, 100])] dataset_uuid = "partitioned_uuid" @@ -493,12 +631,16 @@ def _f(b_c): ) result = bound_load_dataframes( - dataset_uuid=dataset_uuid, store=store_factory, columns=["a", "b", "c"], + dataset_uuid=dataset_uuid, + store=store_factory, + columns={table_name: ["a", "b", "c"]}, ) probe = 
result[0] if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] + result_dfs = [mp.data[table_name] for mp in result] + elif isinstance(probe, dict): + result_dfs = [mp[table_name] for mp in result] else: result_dfs = result result_df = pd.concat(result_dfs).reset_index(drop=True) @@ -510,10 +652,12 @@ def _f(b_c): def test_read_dataset_as_dataframes_columns_primary_index_only( store_factory, bound_load_dataframes, metadata_version ): + table_name = SINGLE_TABLE + def _f(b_c): b, c = b_c df = pd.DataFrame({"a": [1, 1], "b": [b, b], "c": c, "d": [b, b + 1]}) - return df + return {"label": str(c), "data": [(table_name, df)]} in_partitions = [_f([1, 100])] dataset_uuid = "partitioned_uuid" @@ -526,12 +670,14 @@ def _f(b_c): partition_on=["a", "b"], ) result = bound_load_dataframes( - dataset_uuid=dataset_uuid, store=store_factory, columns=["a", "b"] + dataset_uuid=dataset_uuid, store=store_factory, columns={table_name: ["a", "b"]} ) probe = result[0] if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] + result_dfs = [mp.data[table_name] for mp in result] + elif isinstance(probe, dict): + result_dfs = [mp[table_name] for mp in result] else: result_dfs = result result_df = pd.concat(result_dfs).reset_index(drop=True) @@ -543,10 +689,12 @@ def _f(b_c): def test_empty_predicate_pushdown_empty_col_projection( dataset, store_session_factory, bound_load_dataframes, backend_identifier ): + table_name = SINGLE_TABLE result = bound_load_dataframes( dataset_uuid=dataset.uuid, + tables=table_name, store=store_session_factory, - columns=[], + columns={table_name: []}, predicates=[[("P", "==", 12345678)]], # this product doesn't exist ) @@ -555,7 +703,9 @@ def test_empty_predicate_pushdown_empty_col_projection( probe = result[0] if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] + result_dfs = [mp.data[table_name] for mp in result] + elif isinstance(probe, dict): + result_dfs = [mp[table_name] for mp in result] else: result_dfs = result res = pd.concat(result_dfs).reset_index(drop=True) @@ -570,15 +720,18 @@ def test_datetime_predicate_with_dates_as_object( store_factory, bound_load_dataframes, metadata_version, + custom_read_parameters, output_type, partition_on, datetype, comp, ): + table_name = SINGLE_TABLE + def _f(b_c): b, c = b_c df = pd.DataFrame({"a": [1, 1], "b": [b, b], "c": c, "d": [b, b + 1]}) - return df + return {"label": gen_uuid(), "data": [(table_name, df)]} in_partitions = [_f([1, datetype(2000, 1, 1)])] dataset_uuid = "partitioned_uuid" @@ -592,32 +745,44 @@ def _f(b_c): result = bound_load_dataframes( dataset_uuid="partitioned_uuid", + tables=table_name, store=store_factory, predicates=[[("c", comp, datetype(2000, 1, 1))]], dates_as_object=True, + **custom_read_parameters, ) if output_type != "dataframe": return assert len(result) == 1 - df_actual = result[0] + dct = result[0] + assert set(dct.keys()) == {table_name} + df_actual = dct[table_name] - df_expected = in_partitions[0] + df_expected = in_partitions[0]["data"][0][1] pdt.assert_frame_equal(df_actual, df_expected, check_like=True) def test_binary_column_metadata(store_factory, bound_load_dataframes): - df = pd.DataFrame({b"int_col": [1], "🙈".encode(): [2]}) + table_name = SINGLE_TABLE + df = { + "label": "part1", + "data": [(table_name, pd.DataFrame({b"int_col": [1], "🙈".encode(): [2]}))], + } store_dataframes_as_dataset( dfs=[df], store=store_factory, dataset_uuid="dataset_uuid" ) - result = bound_load_dataframes(dataset_uuid="dataset_uuid", 
store=store_factory) + result = bound_load_dataframes( + dataset_uuid="dataset_uuid", store=store_factory, tables=table_name + ) probe = result[0] if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] + result_dfs = [mp.data[table_name] for mp in result] + elif isinstance(probe, dict): + result_dfs = [mp[table_name] for mp in result] else: result_dfs = result df = pd.concat(result_dfs).reset_index(drop=True) @@ -626,35 +791,64 @@ def test_binary_column_metadata(store_factory, bound_load_dataframes): assert set(df.columns.map(type)) == {str} +@pytest.mark.xfail( + LooseVersion(pa.__version__) < "0.16.1.dev308", + reason="pa.Schema.from_pandas cannot deal with ExtensionDtype", +) def test_extensiondtype_rountrip(store_factory, bound_load_dataframes): - df = pd.DataFrame({"str": pd.Series(["a", "b"], dtype="string")}) + table_name = SINGLE_TABLE + df = { + "label": "part1", + "data": [ + (table_name, pd.DataFrame({"str": pd.Series(["a", "b"], dtype="string")})) + ], + } store_dataframes_as_dataset( dfs=[df], store=store_factory, dataset_uuid="dataset_uuid" ) - result = bound_load_dataframes(dataset_uuid="dataset_uuid", store=store_factory) + result = bound_load_dataframes( + dataset_uuid="dataset_uuid", store=store_factory, tables=table_name + ) probe = result[0] if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] + result_dfs = [mp.data[table_name] for mp in result] + elif isinstance(probe, dict): + result_dfs = [mp[table_name] for mp in result] else: result_dfs = result result_df = pd.concat(result_dfs).reset_index(drop=True) - pdt.assert_frame_equal(df, result_df) + pdt.assert_frame_equal(df["data"][0][1], result_df) -def test_non_default_table_name_roundtrip(store_factory, bound_load_dataframes): - df = pd.DataFrame({"A": [1]}) - store_dataframes_as_dataset( - dfs=[df], store=store_factory, dataset_uuid="dataset_uuid", table_name="foo" +def test_read_dataset_multi_table_warning( + store_factory, metadata_version, bound_load_dataframes +): + dfs = [ + { + "data": { + "core-table": pd.DataFrame({"id": [22, 23], "f": [1.1, 2.4]}), + "aux-table": pd.DataFrame({"id": [22], "col1": ["x"]}), + } + }, + { + "data": { + "core-table": pd.DataFrame({"id": [29, 31], "f": [3.2, 0.6]}), + "aux-table": pd.DataFrame({"id": [31], "col1": ["y"]}), + } + }, + ] + + dm = store_dataframes_as_dataset( + dfs=dfs, store=store_factory, dataset_uuid="dataset_uuid" ) - result = bound_load_dataframes(dataset_uuid="dataset_uuid", store=store_factory) - probe = result[0] - if isinstance(probe, MetaPartition): - result_dfs = [mp.data for mp in result] - else: - result_dfs = result - result_df = pd.concat(result_dfs).reset_index(drop=True) - pdt.assert_frame_equal(df, result_df) + with pytest.warns( + DeprecationWarning, + match="Trying to read a dataset with multiple internal tables.*", + ): + bound_load_dataframes( + dataset_uuid="dataset_uuid", store=store_factory, tables=dm.tables[1] + ) diff --git a/kartothek/io/testing/update.py b/kartothek/io/testing/update.py index ea9bcd68..943aecf2 100644 --- a/kartothek/io/testing/update.py +++ b/kartothek/io/testing/update.py @@ -2,15 +2,14 @@ # pylint: disable=E1101 -from datetime import date from functools import partial import numpy as np import pandas as pd import pytest -from kartothek.api.dataset import read_dataset_as_ddf from kartothek.core.dataset import DatasetMetadata +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.core.naming import DEFAULT_METADATA_VERSION from kartothek.core.testing 
import TIME_TO_FREEZE_ISO from kartothek.io.eager import ( @@ -20,12 +19,20 @@ from kartothek.io.iter import read_dataset_as_dataframes__iterator -def test_update_dataset_with_partitions( +def test_update_dataset_with_partitions__reducer( store_factory, metadata_version, bound_update_dataset, mocker, store ): partitions = [ - pd.DataFrame({"p": [1]}), - pd.DataFrame({"p": [2]}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": [2]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})}, + }, ] dataset = bound_update_dataset( partitions, @@ -37,8 +44,14 @@ def test_update_dataset_with_partitions( ) dataset = dataset.load_index("p", store) + part3 = { + "label": "cluster_3", + "data": [("core", pd.DataFrame({"p": [3]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})}, + } + dataset_updated = bound_update_dataset( - [pd.DataFrame({"p": [3]})], + [part3], store=store_factory, delete_scope=[{"p": 1}], metadata={"extra": "metadata"}, @@ -83,27 +96,72 @@ def test_update_dataset_with_partitions( assert dataset_updated == stored_dataset -@pytest.mark.xfail(reason="How to handle empty input??") -def test_update_dataset_with_partitions_delete_only( - store_factory, metadata_version, frozen_time_em, bound_update_dataset, store +def test_update_dataset_with_partitions_no_index_input_info( + store_factory, metadata_version, bound_update_dataset, store ): partitions = [ - pd.DataFrame({"p": [1]}), - pd.DataFrame({"p": [2]}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": [2]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})}, + }, ] dataset = store_dataframes_as_dataset( dfs=partitions, store=store_factory, metadata={"dataset": "metadata"}, dataset_uuid="dataset_uuid", + metadata_version=metadata_version, + ) + + # The input information doesn't explicitly provide index information + # Since the dataset has an index, it must be updated either way + part3 = {"label": "cluster_3", "data": [("core", pd.DataFrame({"p": [3]}))]} + dataset_updated = bound_update_dataset( + [part3], + store=store_factory, + dataset_uuid=dataset.uuid, + delete_scope=[{"p": 1}], + metadata={"extra": "metadata"}, + default_metadata_version=metadata_version, secondary_indices=["p"], + ) + dataset_updated = dataset_updated.load_all_indices(store) + assert 3 in dataset_updated.indices["p"].to_dict() + + +def test_update_dataset_with_partitions__reducer_delete_only( + store_factory, metadata_version, frozen_time_em, bound_update_dataset, store +): + partitions = [ + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": [2]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})}, + }, + ] + dataset = store_dataframes_as_dataset( + dfs=partitions, + store=store_factory, + metadata={"dataset": "metadata"}, + dataset_uuid="dataset_uuid", metadata_version=metadata_version, ) dataset = dataset.load_index("p", store) - # FIXME: is this a regression? 
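The update tests in this module feed partitions in the nested ``{label, data, indices}`` form shown above. A minimal sketch of assembling such a partition for a single table, assuming plain value-to-label dicts are acceptable in place of ``ExplicitSecondaryIndex`` (as the partitioned-update test further below also uses); the helper name is hypothetical:

import pandas as pd


def make_partition(label, table, df, index_columns=()):
    # Nested input format: one label, a list of (table, dataframe) tuples and,
    # optionally, per-column indices mapping each value to this partition label.
    partition = {"label": label, "data": [(table, df)]}
    if index_columns:
        partition["indices"] = {
            col: {value: [label] for value in df[col].unique()}
            for col in index_columns
        }
    return partition


part3 = make_partition("cluster_3", "core", pd.DataFrame({"p": [3]}), index_columns=["p"])
# part3["indices"] == {"p": {3: ["cluster_3"]}}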
+ empty_part = [] dataset_updated = bound_update_dataset( - None, + [empty_part], store=store_factory, dataset_uuid="dataset_uuid", delete_scope=[{"p": 1}], @@ -113,8 +171,8 @@ def test_update_dataset_with_partitions_delete_only( ) dataset_updated = dataset_updated.load_index("p", store) - assert len(dataset.partitions) == 2 - assert len(dataset_updated.partitions) == 1 + assert sorted(dataset.partitions) == ["cluster_1", "cluster_2"] + assert list(dataset_updated.partitions) == ["cluster_2"] store_files = list(store.keys()) # 1 dataset metadata file and 1 index file and 2 partition files @@ -125,8 +183,8 @@ def test_update_dataset_with_partitions_delete_only( expected_number_files += 1 assert len(store_files) == expected_number_files - assert set(dataset.indices["p"].observed_values()) == {1, 2} - assert set(dataset_updated.indices["p"].observed_values()) == {2} + assert dataset.indices["p"].index_dct == {1: ["cluster_1"], 2: ["cluster_2"]} + assert dataset_updated.indices["p"].index_dct == {2: ["cluster_2"]} # Ensure the dataset can be loaded properly stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store) @@ -147,15 +205,22 @@ def test_update_dataset_with_partitions__reducer_partitions( df2.L = 2 df2.TARGET += 2 df_list = [ - df1, - df2, + { + "label": "cluster_1", + "data": [("core", df1)], + "indices": {"L": {k: ["cluster_1"] for k in df1["L"].unique()}}, + }, + { + "label": "cluster_2", + "data": [("core", df2)], + "indices": {"L": {k: ["cluster_2"] for k in df2["L"].unique()}}, + }, ] dataset = store_dataframes_as_dataset( dfs=df_list, store=store_factory, dataset_uuid="dataset_uuid", partition_on=["P"], - secondary_indices="L", metadata_version=4, ) dataset_loadedidx = dataset.load_all_indices(store=store_factory()) @@ -171,8 +236,14 @@ def test_update_dataset_with_partitions__reducer_partitions( df3 = df2.copy(deep=True) df3.TARGET -= 5 + part3 = { + "label": "cluster_3", + "data": {"core": df3}, + "indices": {"L": {k: ["cluster_3"] for k in df3["L"].unique()}}, + } + dataset_updated = bound_update_dataset( - [df3], + [part3], store=store_factory, dataset_uuid="dataset_uuid", delete_scope=[{"L": 2}], @@ -216,8 +287,14 @@ def test_update_dataset_with_partitions__reducer_partitions( def test_update_dataset_with_partitions__reducer_nonexistent( store_factory, metadata_version, frozen_time_em, bound_update_dataset, store ): + + part3 = { + "label": "cluster_3", + "data": [("core", pd.DataFrame({"p": [3]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})}, + } dataset_updated = bound_update_dataset( - [pd.DataFrame({"p": [3]})], + [part3], store=store_factory, dataset_uuid="dataset_uuid", delete_scope=[{"p": 1}], @@ -368,8 +445,9 @@ def test_update_dataset_with_partitions__reducer_nonexistent( ], ) def test_schema_check_update(dfs, ok, store_factory, bound_update_dataset): + df_list = [{"label": "cluster_1", "data": [("core", df)]} for df in dfs] store_dataframes_as_dataset( - dfs=dfs[:1], + dfs=df_list[:1], store=store_factory, dataset_uuid="dataset_uuid", partition_on=["P"], @@ -383,13 +461,13 @@ def test_schema_check_update(dfs, ok, store_factory, bound_update_dataset): ) if ok: - pipe(dfs[1:]) + pipe(df_list[1:]) else: with pytest.raises( Exception, - match=r"Schemas\sfor\sdataset\s\\*'dataset_uuid\\*'\sare\snot\scompatible!", + match=r"Schemas\sfor\stable\s\\*'core\\*'\sof\sdataset\s\\*'dataset_uuid\\*'\sare\snot\scompatible!", ): - pipe(dfs[1:]) + pipe(df_list[1:]) def test_sort_partitions_by( @@ -412,10 +490,10 @@ def 
test_sort_partitions_by( } ) - df_list = [df1] + df_list = [{"label": "cluster_1", "data": [("core", df1)]}] new_partitions = [ - df2, - df3, + {"label": "cluster_2", "data": [("core", df2)]}, + {"label": "cluster_3", "data": [("core", df3)]}, ] store_dataframes_as_dataset( @@ -435,15 +513,13 @@ def test_sort_partitions_by( ) # Check that the `sort_partitions_by` column is indeed sorted monotonically among partitions - for df in read_dataset_as_dataframes__iterator( + for label_df_tupl in read_dataset_as_dataframes__iterator( store=store_factory, dataset_uuid="dataset_uuid" ): - assert (df.TARGET == sorted(df.TARGET)).all() + for _, df in label_df_tupl.items(): + assert (df.TARGET == sorted(df.TARGET)).all() -@pytest.mark.xfail( - reason="Unclear what this is actually testing, other than the mock itself" -) def test_metadata_version( store_factory, bound_update_dataset, @@ -455,8 +531,8 @@ def test_metadata_version( dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": [1, 2]}), - pd.DataFrame({"p": [2, 3]}), + {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]}, + {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]}, ] dataset = store_dataframes_as_dataset( @@ -466,18 +542,18 @@ def test_metadata_version( metadata_version=DEFAULT_METADATA_VERSION, ) - # with pytest.raises(AssertionError, match="Traversed through mock"): - # # Try to commit data to dataset using a different metadata version - # # and different data format (format is mocked) - # # This does not raise when the `parse_input_to_metapartition` - # # argument is `default_metadata_version` instead of `metadata_version` - new_partitions = [pd.DataFrame({"p": [2, 3]})] - bound_update_dataset( - new_partitions, - store=store_factory, - dataset_uuid=dataset_uuid, - default_metadata_version=mock_default_metadata_version, - ) + with pytest.raises(AssertionError, match="Traversed through mock"): + # Try to commit data to dataset using a different metadata version + # and different data format (format is mocked) + # This does not raise when the `parse_input_to_metapartition` + # argument is `default_metadata_version` instead of `metadata_version` + new_partitions = ("core", pd.DataFrame({"p": [2, 3]})) + bound_update_dataset( + new_partitions, + store=store_factory, + dataset_uuid=dataset_uuid, + default_metadata_version=mock_default_metadata_version, + ) mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid) assert len(mps) == len(dataset.partitions) @@ -486,8 +562,8 @@ def test_metadata_version( def test_raises_on_invalid_input(store_factory, bound_update_dataset): dataset_uuid = "dataset_uuid" partitions = [ - pd.DataFrame({"p": [1, 2]}), - pd.DataFrame({"p": [2, 3]}), + {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]}, + {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]}, ] dataset = store_dataframes_as_dataset( @@ -505,6 +581,53 @@ def test_raises_on_invalid_input(store_factory, bound_update_dataset): assert len(mps) == len(dataset.partitions) +@pytest.mark.parametrize("define_indices_on_partition", (False, True)) +def test_raises_on_new_index_creation( + backend_identifier, store_factory, bound_update_dataset, define_indices_on_partition +): + # This test can be removed once the variable index input is removed in + # favour of the test `test_update_secondary_indices_subset` + if backend_identifier == "dask.dataframe" and define_indices_on_partition: + pytest.skip() # Constructs a dataframe which ignores index 
information passed as dict + + dataset_uuid = "dataset_uuid" + index_column = "p" + partitions = [ + {"label": "cluster_1", "data": [("core", pd.DataFrame({index_column: [1, 2]}))]} + ] + + new_partition = { + "label": "cluster_2", + "data": [("core", pd.DataFrame({index_column: [2, 3]}))], + } + + dataset_update_secondary_indices = [index_column] + if define_indices_on_partition: + dataset_update_secondary_indices = None + new_partition["indices"] = { + index_column: ExplicitSecondaryIndex( + index_column, + { + k: [new_partition["label"]] + for k in new_partition["data"][0][1][index_column].unique() + }, + ) + } + + # Create dataset without secondary indices + store_dataframes_as_dataset( + dfs=partitions, store=store_factory, dataset_uuid=dataset_uuid + ) + + with pytest.raises(Exception, match="Incorrect indices provided for dataset"): + bound_update_dataset( + [new_partition], + store=store_factory, + dataset_uuid=dataset_uuid, + secondary_indices=dataset_update_secondary_indices, + ) + + def test_update_secondary_indices_subset(store_factory, bound_update_dataset): df1 = pd.DataFrame({"A": range(10), "indexed": 1}) dataset_uuid = "dataset_uuid" @@ -530,16 +653,39 @@ def test_update_secondary_indices_subset(store_factory, bound_update_dataset): ) -def test_update_first_time_with_secondary_indices(store_factory, bound_update_dataset): +@pytest.mark.parametrize("define_indices_on_partition", (False, True)) +def test_update_first_time_with_secondary_indices( + store_factory, bound_update_dataset, define_indices_on_partition +): + """ + Check it is possible to create a new dataset with indices defined either on partition or using the + `secondary_indices` kwarg. The intention of this test is to verify that there are no exceptions raised + related to index validation when a dataset is created using an `update` function + """ dataset_uuid = "dataset_uuid" index_column = "p" - new_partition = [pd.DataFrame({index_column: [1, 2]})] - + new_partition = { + "label": "cluster_1", + "data": [("core", pd.DataFrame({index_column: [1, 2]}))], + } + + dataset_update_secondary_indices = [index_column] + if define_indices_on_partition: + dataset_update_secondary_indices = None + new_partition["indices"] = { + index_column: ExplicitSecondaryIndex( + index_column, + { + k: [new_partition["label"]] + for k in new_partition["data"][0][1][index_column].unique() + }, + ) + } bound_update_dataset( [new_partition], store=store_factory, dataset_uuid=dataset_uuid, - secondary_indices=[index_column], + secondary_indices=dataset_update_secondary_indices, ) @@ -557,7 +703,7 @@ def test_partition_on_null(store_factory, bound_update_dataset): # gh-262 Exception, match=r"Original dataframe size .* on a column with null values." 
): bound_update_dataset( - [df], + [{"data": {"table": df}}], store=store_factory, dataset_uuid="a_unique_dataset_identifier", partition_on=["part"], @@ -568,7 +714,7 @@ def test_update_infers_partition_on(store_factory, bound_update_dataset, df_not_ dataset_uuid = "dataset_uuid" dataset = bound_update_dataset( - [df_not_nested], + [{"data": {"table": df_not_nested}}], dataset_uuid=dataset_uuid, store=store_factory, partition_on=df_not_nested.columns[0], @@ -577,7 +723,9 @@ def test_update_infers_partition_on(store_factory, bound_update_dataset, df_not_ # do not use partition_on since it should be interfered from the existing dataset updated_dataset = bound_update_dataset( - [df_not_nested], dataset_uuid=dataset_uuid, store=store_factory, + [{"data": {"table": df_not_nested}}], + dataset_uuid=dataset_uuid, + store=store_factory, ) assert len(updated_dataset.partitions) == 2 * len(dataset.partitions) @@ -588,7 +736,7 @@ def test_update_raises_incompatible_partition_keys( ): dataset_uuid = "dataset_uuid" bound_update_dataset( - [df_not_nested], + [{"data": {"table": df_not_nested}}], dataset_uuid=dataset_uuid, store=store_factory, partition_on=df_not_nested.columns[0], @@ -598,7 +746,7 @@ def test_update_raises_incompatible_partition_keys( ValueError, match="Incompatible set of partition keys encountered." ): bound_update_dataset( - [df_not_nested], + [{"data": {"table": df_not_nested}}], dataset_uuid=dataset_uuid, store=store_factory, partition_on=df_not_nested.columns[1], @@ -610,7 +758,7 @@ def test_update_raises_incompatible_inidces( ): dataset_uuid = "dataset_uuid" bound_update_dataset( - [df_not_nested], + [{"data": {"table": df_not_nested}}], dataset_uuid=dataset_uuid, store=store_factory, secondary_indices=df_not_nested.columns[0], @@ -618,55 +766,8 @@ def test_update_raises_incompatible_inidces( # Not allowed to update with indices which do not yet exist in dataset with pytest.raises(ValueError, match="indices"): bound_update_dataset( - [df_not_nested], + [{"data": {"table": df_not_nested}}], dataset_uuid=dataset_uuid, store=store_factory, secondary_indices=df_not_nested.columns[1], ) - - -def test_update_of_dataset_with_non_default_table_name( - store_factory, bound_update_dataset -): - """ - Tests that datasets with table names other than "table" can be created, - updated and read successfully (regression test for issue #445). 
- """ - - # Create initial dataset - dataset_uuid = "dataset_uuid" - df_create = pd.DataFrame( - {"date": [date(2021, 1, 1), date(2021, 1, 2)], "value": range(2)} - ) - store_dataframes_as_dataset( - dfs=[df_create], - store=store_factory, - dataset_uuid=dataset_uuid, - table_name="non-default-name", - partition_on=["date"], - ) - dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory()) - assert dm.table_name == "non-default-name" - - # Update dataset - df_update = pd.DataFrame( - {"date": [date(2021, 1, 3), date(2021, 1, 4)], "value": range(2)} - ) - bound_update_dataset( - [df_update], - store=store_factory, - dataset_uuid=dataset_uuid, - table_name="non-default-name", - partition_on=["date"], - ) - dm = DatasetMetadata.load_from_store(dataset_uuid, store_factory()) - assert dm.table_name == "non-default-name" - - # Assert equality of dataframe - df_read = ( - read_dataset_as_ddf(dataset_uuid, store_factory(), "table") - .compute() - .reset_index(drop=True) - ) - df_expected = df_create.append(df_update).reset_index(drop=True) - pd.testing.assert_frame_equal(df_read, df_expected) diff --git a/kartothek/io/testing/utils.py b/kartothek/io/testing/utils.py index fb5c4f0f..fba503b3 100644 --- a/kartothek/io/testing/utils.py +++ b/kartothek/io/testing/utils.py @@ -1,10 +1,12 @@ import math +import string import numpy as np import pandas as pd from pyarrow.parquet import ParquetFile from kartothek.io.eager import store_dataframes_as_dataset +from kartothek.io_components.metapartition import SINGLE_TABLE def create_dataset(dataset_uuid, store_factory, metadata_version): @@ -12,14 +14,28 @@ def create_dataset(dataset_uuid, store_factory, metadata_version): {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - df_list = [df.copy(deep=True), df.copy(deep=True)] + df_helper = pd.DataFrame( + {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]} + ) + + df_list = [ + { + "label": "cluster_1", + "data": [(SINGLE_TABLE, df.copy(deep=True)), ("helper", df_helper)], + "indices": {"P": {val: ["cluster_2"] for val in df.TARGET.unique()}}, + }, + { + "label": "cluster_2", + "data": [(SINGLE_TABLE, df.copy(deep=True)), ("helper", df_helper)], + "indices": {"P": {val: ["cluster_2"] for val in df.TARGET.unique()}}, + }, + ] return store_dataframes_as_dataset( dfs=df_list, store=store_factory, dataset_uuid=dataset_uuid, metadata_version=metadata_version, - secondary_indices="P", ) diff --git a/kartothek/io/testing/write.py b/kartothek/io/testing/write.py index 1995a945..4ba4cfb8 100644 --- a/kartothek/io/testing/write.py +++ b/kartothek/io/testing/write.py @@ -2,6 +2,7 @@ # pylint: disable=E1101 +import string from collections import OrderedDict from functools import partial @@ -12,8 +13,8 @@ from storefact import get_store_from_url from kartothek.core.dataset import DatasetMetadata +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.core.uuid import gen_uuid -from kartothek.io.eager import read_table from kartothek.io_components.metapartition import MetaPartition from kartothek.serialization import DataFrameSerializer @@ -72,7 +73,20 @@ def test_file_structure_dataset_v4(store_factory, bound_store_dataframes): {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - df_list = [df.copy(deep=True), df.copy(deep=True)] + df_helper = pd.DataFrame( + {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]} + ) + + df_list = [ + { + "label": "cluster_1", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + 
{ + "label": "cluster_2", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + ] dataset = bound_store_dataframes( df_list, store=store_factory, dataset_uuid="dataset_uuid", metadata_version=4 @@ -82,10 +96,19 @@ def test_file_structure_dataset_v4(store_factory, bound_store_dataframes): assert len(dataset.partitions) == 2 store = store_factory() - - assert len(store.keys()) == 4 - assert "dataset_uuid/table/_common_metadata" in store - assert "dataset_uuid.by-dataset-metadata.json" in store + # TODO: json -> msgpack + expected_keys = set( + [ + "dataset_uuid.by-dataset-metadata.json", + "dataset_uuid/helper/cluster_1.parquet", + "dataset_uuid/helper/cluster_2.parquet", + "dataset_uuid/helper/_common_metadata", + "dataset_uuid/core/cluster_1.parquet", + "dataset_uuid/core/cluster_2.parquet", + "dataset_uuid/core/_common_metadata", + ] + ) + assert set(expected_keys) == set(store.keys()) def test_file_structure_dataset_v4_partition_on(store_factory, bound_store_dataframes): @@ -94,8 +117,24 @@ def test_file_structure_dataset_v4_partition_on(store_factory, bound_store_dataf df = pd.DataFrame( {"P": [1, 2, 3, 1, 2, 3], "L": [1, 1, 1, 2, 2, 2], "TARGET": np.arange(10, 16)} ) + df_helper = pd.DataFrame( + { + "P": [1, 2, 3, 1, 2, 3], + "L": [1, 1, 1, 2, 2, 2], + "info": string.ascii_lowercase[:2], + } + ) - df_list = [df.copy(deep=True), df.copy(deep=True)] + df_list = [ + { + "label": "cluster_1", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + { + "label": "cluster_2", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + ] dataset = bound_store_dataframes( df_list, store=store_factory, @@ -111,8 +150,102 @@ def test_file_structure_dataset_v4_partition_on(store_factory, bound_store_dataf assert len(dataset.partitions) == 12 store = store_factory() - actual_keys = set(store.keys()) - assert len(actual_keys) == 14 # one per partition + json + schema + + expected_keys = set( + [ + "dataset_uuid.by-dataset-metadata.json", + "dataset_uuid/helper/P=1/L=1/cluster_1.parquet", + "dataset_uuid/helper/P=1/L=1/cluster_2.parquet", + "dataset_uuid/helper/P=1/L=2/cluster_1.parquet", + "dataset_uuid/helper/P=1/L=2/cluster_2.parquet", + "dataset_uuid/helper/P=2/L=1/cluster_1.parquet", + "dataset_uuid/helper/P=2/L=1/cluster_2.parquet", + "dataset_uuid/helper/P=2/L=2/cluster_1.parquet", + "dataset_uuid/helper/P=2/L=2/cluster_2.parquet", + "dataset_uuid/helper/P=3/L=1/cluster_1.parquet", + "dataset_uuid/helper/P=3/L=1/cluster_2.parquet", + "dataset_uuid/helper/P=3/L=2/cluster_1.parquet", + "dataset_uuid/helper/P=3/L=2/cluster_2.parquet", + "dataset_uuid/helper/_common_metadata", + "dataset_uuid/core/P=1/L=1/cluster_1.parquet", + "dataset_uuid/core/P=1/L=1/cluster_2.parquet", + "dataset_uuid/core/P=1/L=2/cluster_1.parquet", + "dataset_uuid/core/P=1/L=2/cluster_2.parquet", + "dataset_uuid/core/P=2/L=1/cluster_1.parquet", + "dataset_uuid/core/P=2/L=1/cluster_2.parquet", + "dataset_uuid/core/P=2/L=2/cluster_1.parquet", + "dataset_uuid/core/P=2/L=2/cluster_2.parquet", + "dataset_uuid/core/P=3/L=1/cluster_1.parquet", + "dataset_uuid/core/P=3/L=1/cluster_2.parquet", + "dataset_uuid/core/P=3/L=2/cluster_1.parquet", + "dataset_uuid/core/P=3/L=2/cluster_2.parquet", + "dataset_uuid/core/_common_metadata", + ] + ) + + assert set(expected_keys) == set(store.keys()) + + +def test_file_structure_dataset_v4_partition_on_second_table_no_index_col( + store_factory, bound_store_dataframes +): + df = pd.DataFrame( + {"P": np.arange(0, 2), "L": np.arange(0, 2), "TARGET": 
np.arange(10, 12)} + ) + df_helper = pd.DataFrame({"P": [0, 0, 1], "info": string.ascii_lowercase[:2]}) + + df_list = [ + { + "label": "cluster_1", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + { + "label": "cluster_2", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + ] + + with pytest.raises(Exception): + bound_store_dataframes( + df_list, + store=store_factory, + dataset_uuid="dataset_uuid", + partition_on=["P", "L"], + metadata_version=4, + ) + + +def test_file_structure_dataset_v4_partition_on_second_table_no_index_col_simple_group( + store_factory, bound_store_dataframes +): + """ + Pandas seems to stop evaluating the groupby expression if the dataframes after the first column split + is of length 1. This seems to be an optimization which should, however, still raise a KeyError + """ + df = pd.DataFrame( + {"P": np.arange(0, 2), "L": np.arange(0, 2), "TARGET": np.arange(10, 12)} + ) + df_helper = pd.DataFrame({"P": [0, 1], "info": string.ascii_lowercase[:2]}) + + df_list = [ + { + "label": "cluster_1", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + { + "label": "cluster_2", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + ] + + with pytest.raises(Exception): + bound_store_dataframes( + df_list, + store=store_factory, + dataset_uuid="dataset_uuid", + partition_on=["P", "L"], + metadata_version=4, + ) def test_store_dataframes_as_dataset( @@ -122,7 +255,20 @@ def test_store_dataframes_as_dataset( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - df_list = [df.copy(deep=True), df.copy(deep=True)] + df_helper = pd.DataFrame( + {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]} + ) + + df_list = [ + { + "label": "cluster_1", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + { + "label": "cluster_2", + "data": [("core", df.copy(deep=True)), ("helper", df_helper)], + }, + ] dataset = bound_store_dataframes( df_list, @@ -145,14 +291,24 @@ def test_store_dataframes_as_dataset( index_dct = stored_dataset.indices["P"].load(store).index_dct assert sorted(index_dct.keys()) == list(range(0, 10)) + assert any([sorted(p) == ["cluster_1", "cluster_2"] for p in index_dct.values()]) - counter = 0 - for k in store.keys(): - if "parquet" in k and "indices" not in k: - counter += 1 - df_stored = DataFrameSerializer.restore_dataframe(key=k, store=store) - pdt.assert_frame_equal(df, df_stored) - assert counter == 2 + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_1"].files["core"], store=store + ) + pdt.assert_frame_equal(df, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_2"].files["core"], store=store + ) + pdt.assert_frame_equal(df, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_1"].files["helper"], store=store + ) + pdt.assert_frame_equal(df_helper, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_2"].files["helper"], store=store + ) + pdt.assert_frame_equal(df_helper, df_stored) def test_store_dataframes_as_dataset_empty_dataframe( @@ -164,8 +320,25 @@ def test_store_dataframes_as_dataset_empty_dataframe( """ df_empty = df_all_types.drop(0) + # Store a second table with shared columns. 
All shared columns must be of the same type + # This may fail in the presence of empty partitions if the schema validation doesn't account for it + df_shared_cols = df_all_types.loc[:, df_all_types.columns[:3]] + df_shared_cols["different_col"] = "a" + assert df_empty.empty - df_list = [df_empty] + df_list = [ + { + "label": "cluster_1", + "data": [("tableA", df_empty), ("tableB", df_shared_cols.copy(deep=True))], + }, + { + "label": "cluster_2", + "data": [ + ("tableA", df_all_types), + ("tableB", df_shared_cols.copy(deep=True)), + ], + }, + ] dataset = bound_store_dataframes( df_list, @@ -175,7 +348,7 @@ def test_store_dataframes_as_dataset_empty_dataframe( ) assert isinstance(dataset, DatasetMetadata) - assert len(dataset.partitions) == 1 + assert len(dataset.partitions) == 2 store = store_factory() stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store) @@ -184,28 +357,63 @@ def test_store_dataframes_as_dataset_empty_dataframe( assert dataset.partitions == stored_dataset.partitions df_stored = DataFrameSerializer.restore_dataframe( - key=next(iter(dataset.partitions.values())).files["table"], store=store + key=dataset.partitions["cluster_1"].files["tableA"], store=store ) pdt.assert_frame_equal(df_empty, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_2"].files["tableA"], store=store + ) + # Roundtrips for type date are not type preserving + df_stored["date"] = df_stored["date"].dt.date + pdt.assert_frame_equal(df_all_types, df_stored) + + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_1"].files["tableB"], store=store + ) + pdt.assert_frame_equal(df_shared_cols, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_2"].files["tableB"], store=store + ) + pdt.assert_frame_equal(df_shared_cols, df_stored) + def test_store_dataframes_as_dataset_batch_mode( store_factory, metadata_version, bound_store_dataframes ): - # TODO: Kick this out? 
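The comment in the empty-dataframe test above notes that shared columns must keep one consistent type even when a partition is empty. One reason this can work: an empty slice of a typed DataFrame keeps its dtypes, so its Arrow schema can still be compared against a populated partition. A small illustration under that assumption (object-dtype columns are the fragile case, since an empty object column may be inferred differently):

import pandas as pd
import pyarrow as pa

df_full = pd.DataFrame({"i": pd.Series([1, 2], dtype="int64"), "f": [0.5, 1.5]})
df_empty = df_full.iloc[0:0]  # an "empty partition": zero rows, dtypes preserved

schema_full = pa.Table.from_pandas(df_full, preserve_index=False).schema
schema_empty = pa.Table.from_pandas(df_empty, preserve_index=False).schema
assert schema_full.types == schema_empty.types  # int64/double survive the empty slice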
values_p1 = [1, 2, 3] values_p2 = [4, 5, 6] df = pd.DataFrame({"P": values_p1}) df2 = pd.DataFrame({"P": values_p2}) - df_list = [[df, df2]] + df_list = [ + [ + { + "label": "cluster_1", + "data": [("core", df)], + "indices": { + "P": ExplicitSecondaryIndex( + "P", {v: ["cluster_1"] for v in values_p1} + ) + }, + }, + { + "label": "cluster_2", + "data": [("core", df2)], + "indices": { + "P": ExplicitSecondaryIndex( + "P", {v: ["cluster_2"] for v in values_p2} + ) + }, + }, + ] + ] dataset = bound_store_dataframes( df_list, store=store_factory, dataset_uuid="dataset_uuid", metadata_version=metadata_version, - secondary_indices="P", ) assert isinstance(dataset, DatasetMetadata) @@ -219,7 +427,23 @@ def test_store_dataframes_as_dataset_batch_mode( assert dataset.metadata == stored_dataset.metadata assert dataset.partitions == stored_dataset.partitions - assert "P" in dataset.indices + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_1"].files["core"], store=store + ) + pdt.assert_frame_equal(df, df_stored) + df_stored = DataFrameSerializer.restore_dataframe( + key=dataset.partitions["cluster_2"].files["core"], store=store + ) + pdt.assert_frame_equal(df2, df_stored) + + assert stored_dataset.indices["P"].to_dict() == { + 1: np.array(["cluster_1"], dtype=object), + 2: np.array(["cluster_1"], dtype=object), + 3: np.array(["cluster_1"], dtype=object), + 4: np.array(["cluster_2"], dtype=object), + 5: np.array(["cluster_2"], dtype=object), + 6: np.array(["cluster_2"], dtype=object), + } def test_store_dataframes_as_dataset_auto_uuid( @@ -229,14 +453,33 @@ def test_store_dataframes_as_dataset_auto_uuid( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - df_list = [df.copy(deep=True)] + df_helper = pd.DataFrame( + {"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]} + ) + + df_list = [ + { + "label": "cluster_1", + "data": [ + ("core", df.copy(deep=True)), + ("helper", df_helper.copy(deep=True)), + ], + }, + { + "label": "cluster_2", + "data": [ + ("core", df.copy(deep=True)), + ("helper", df_helper.copy(deep=True)), + ], + }, + ] dataset = bound_store_dataframes( df_list, store=store_factory, metadata_version=metadata_version ) assert isinstance(dataset, DatasetMetadata) - assert len(dataset.partitions) == 1 + assert len(dataset.partitions) == 2 stored_dataset = DatasetMetadata.load_from_store( "auto_dataset_uuid", store_factory() @@ -246,6 +489,34 @@ def test_store_dataframes_as_dataset_auto_uuid( assert dataset.partitions == stored_dataset.partitions +def test_store_dataframes_as_dataset_list_input( + store_factory, metadata_version, bound_store_dataframes +): + df = pd.DataFrame( + {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} + ) + df2 = pd.DataFrame( + { + "P": np.arange(100, 110), + "L": np.arange(100, 110), + "TARGET": np.arange(10, 20), + } + ) + df_list = [df, df2] + + dataset = bound_store_dataframes( + df_list, + store=store_factory, + dataset_uuid="dataset_uuid", + metadata_version=metadata_version, + ) + + assert isinstance(dataset, DatasetMetadata) + assert len(dataset.partitions) == 2 + stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory()) + assert dataset == stored_dataset + + def test_store_dataframes_as_dataset_mp_partition_on_none( metadata_version, store, store_factory, bound_store_dataframes ): @@ -253,7 +524,13 @@ def test_store_dataframes_as_dataset_mp_partition_on_none( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) 
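The index assertion at the end of the batch-mode test above expects every value of ``P`` to point at the partition labels that contain it. A rough sketch of deriving such a mapping from labelled dataframes, using plain dicts rather than ``ExplicitSecondaryIndex`` (the helper is hypothetical):

import pandas as pd


def build_index_dct(labeled_frames, column):
    # Map each observed value of `column` to the labels of the partitions containing it.
    index_dct = {}
    for label, df in labeled_frames:
        for value in df[column].unique():
            index_dct.setdefault(value, []).append(label)
    return index_dct


frames = [
    ("cluster_1", pd.DataFrame({"P": [1, 2, 3]})),
    ("cluster_2", pd.DataFrame({"P": [4, 5, 6]})),
]
assert build_index_dct(frames, "P") == {
    1: ["cluster_1"], 2: ["cluster_1"], 3: ["cluster_1"],
    4: ["cluster_2"], 5: ["cluster_2"], 6: ["cluster_2"],
}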
- mp = MetaPartition(label=gen_uuid(), data=df, metadata_version=metadata_version) + df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)}) + + mp = MetaPartition( + label=gen_uuid(), + data={"core": df, "helper": df2}, + metadata_version=metadata_version, + ) df_list = [None, mp] dataset = bound_store_dataframes( @@ -280,14 +557,24 @@ def test_store_dataframes_partition_on(store_factory, bound_store_dataframes): ) # First partition is empty, test this edgecase - input_ = [df.head(0), df] + input_ = [ + { + "label": "label", + "data": [("order_proposals", df.head(0))], + "indices": {"location": {}}, + }, + { + "label": "label", + "data": [("order_proposals", df)], + "indices": {"location": {k: ["label"] for k in df["location"].unique()}}, + }, + ] dataset = bound_store_dataframes( input_, store=store_factory, dataset_uuid="dataset_uuid", metadata_version=4, partition_on=["other"], - secondary_indices="location", ) assert len(dataset.partitions) == 1 @@ -415,10 +702,11 @@ def _exception_str(exception): ], ) def test_schema_check_write(dfs, ok, store_factory, bound_store_dataframes): + df_list = [{"label": "cluster_1", "data": [("core", df)]} for df in dfs] if ok: bound_store_dataframes( - dfs, + df_list, store=store_factory, dataset_uuid="dataset_uuid", partition_on=["P"], @@ -427,22 +715,41 @@ def test_schema_check_write(dfs, ok, store_factory, bound_store_dataframes): else: with pytest.raises(Exception) as exc: bound_store_dataframes( - dfs, + df_list, store=store_factory, dataset_uuid="dataset_uuid", partition_on=["P"], metadata_version=4, ) assert ( - "Schemas for dataset 'dataset_uuid' are not compatible!" + "Schemas for table 'core' of dataset 'dataset_uuid' are not compatible!" in _exception_str(exc.value) ) -@pytest.mark.xfail(reason="mocking doesn't work for dask atm") -def test_schema_check_write_nice_error( - store_factory, bound_store_dataframes, mock_uuid -): +def test_schema_check_write_shared(store_factory, bound_store_dataframes): + df1 = pd.DataFrame( + {"P": pd.Series([1], dtype=np.int64), "X": pd.Series([1], dtype=np.int64)} + ) + df2 = pd.DataFrame( + {"P": pd.Series([1], dtype=np.uint64), "Y": pd.Series([1], dtype=np.int64)} + ) + df_list = [ + {"label": "cluster_1", "data": [("core", df1)]}, + {"label": "cluster_2", "data": [("prediction", df2)]}, + ] + with pytest.raises(Exception) as exc: + bound_store_dataframes( + df_list, + store=store_factory, + dataset_uuid="dataset_uuid", + partition_on=["P"], + metadata_version=4, + ) + assert 'Found incompatible entries for column "P"' in str(exc.value) + + +def test_schema_check_write_nice_error(store_factory, bound_store_dataframes): df1 = pd.DataFrame( { "P": pd.Series([1, 1], dtype=np.int64), @@ -458,8 +765,8 @@ def test_schema_check_write_nice_error( } ) df_list = [ - df1, - df2, + {"label": "uuid1", "data": [("core", df1)]}, + {"label": "uuid2", "data": [("core", df2)]}, ] with pytest.raises(Exception) as exc: bound_store_dataframes( @@ -469,22 +776,20 @@ def test_schema_check_write_nice_error( partition_on=["P", "Q"], metadata_version=4, ) - assert _exception_str(exc.value).startswith( - """Schemas for dataset 'dataset_uuid' are not compatible! + """Schemas for table 'core' of dataset 'dataset_uuid' are not compatible! 
Schema violation -Origin schema: {P=2/Q=2/auto_dataset_uuid} -Origin reference: {P=1/Q=2/auto_dataset_uuid} +Origin schema: {core/P=2/Q=2/uuid2} +Origin reference: {core/P=1/Q=2/uuid1} Diff: """ ) -@pytest.mark.xfail(reason="mocking doesn't work for dask atm") -def test_schema_check_write_cut_error(store_factory, bound_store_dataframes, mock_uuid): +def test_schema_check_write_cut_error(store_factory, bound_store_dataframes): df1 = pd.DataFrame( { "P": pd.Series([1] * 100, dtype=np.int64), @@ -500,8 +805,8 @@ def test_schema_check_write_cut_error(store_factory, bound_store_dataframes, moc } ) df_list = [ - df1, - df2, + {"label": "uuid1", "data": [("core", df1)]}, + {"label": "uuid2", "data": [("core", df2)]}, ] with pytest.raises(Exception) as exc: bound_store_dataframes( @@ -512,12 +817,12 @@ def test_schema_check_write_cut_error(store_factory, bound_store_dataframes, moc metadata_version=4, ) assert _exception_str(exc.value).startswith( - """Schemas for dataset 'dataset_uuid' are not compatible! + """Schemas for table 'core' of dataset 'dataset_uuid' are not compatible! Schema violation -Origin schema: {P=2/Q=99/auto_dataset_uuid} -Origin reference: {P=1/Q=99/auto_dataset_uuid} +Origin schema: {core/P=2/Q=99/uuid2} +Origin reference: {core/P=1/Q=99/uuid1} Diff: """ @@ -533,18 +838,43 @@ def test_metadata_consistency_errors_fails( {"P": np.arange(10, 20), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - df_list = [df, df_2] + df_list = [ + {"label": "cluster_1", "data": [("core", df)]}, + {"label": "cluster_2", "data": [("core", df_2)]}, + ] # Also test `df_list` in reverse order, as this could lead to different results for dfs in [df_list, list(reversed(df_list))]: with pytest.raises( - Exception, match=r"Schemas for dataset .* are not compatible!" + Exception, match=r"Schemas for table .* of dataset .* are not compatible!" 
): return bound_store_dataframes( dfs, store=store_factory, metadata_version=metadata_version ) +def test_table_consistency_resistance( + store_factory, metadata_version, bound_store_dataframes +): + df = pd.DataFrame({"P": np.arange(0, 10)}) + + df_helper = pd.DataFrame( + {"P": np.arange(15, 35), "info": string.ascii_lowercase[:10]} + ) + + df_list = [ + {"label": "cluster_1", "data": [("core", df)]}, + {"label": "cluster_2", "data": [("core", df), ("helper", df_helper)]}, + ] + + store_kwargs = dict(store=store_factory, metadata_version=metadata_version) + metadata1 = bound_store_dataframes(df_list, **store_kwargs) + + metadata2 = bound_store_dataframes(list(reversed(df_list)), **store_kwargs) + + assert set(metadata1.tables) == set(metadata2.tables) == {"core", "helper"} + + def test_store_dataframes_as_dataset_overwrite( store_factory, dataset_function, bound_store_dataframes ): @@ -563,14 +893,13 @@ def test_store_dataframes_as_dataset_overwrite( ) -@pytest.mark.skip("What is the intended behaviour for this?") def test_store_empty_dataframes_partition_on(store_factory, bound_store_dataframes): df1 = pd.DataFrame({"x": [1], "y": [1]}).iloc[[]] md1 = bound_store_dataframes( [df1], store=store_factory, dataset_uuid="uuid", partition_on=["x"] ) assert md1.tables == ["table"] - assert set(md1.schema.names) == set(df1.columns) + assert set(md1.table_meta["table"].names) == set(df1.columns) df2 = pd.DataFrame({"x": [1], "y": [1], "z": [1]}).iloc[[]] md2 = bound_store_dataframes( @@ -581,7 +910,7 @@ def test_store_empty_dataframes_partition_on(store_factory, bound_store_datafram overwrite=True, ) assert md2.tables == ["table"] - assert set(md2.schema.names) == set(df2.columns) + assert set(md2.table_meta["table"].names) == set(df2.columns) df3 = pd.DataFrame({"x": [1], "y": [1], "a": [1]}).iloc[[]] md3 = bound_store_dataframes( @@ -592,20 +921,19 @@ def test_store_empty_dataframes_partition_on(store_factory, bound_store_datafram overwrite=True, ) assert md3.tables == ["table2"] - assert set(md3.schema.names) == set(df3.columns) + assert set(md3.table_meta["table2"].names) == set(df3.columns) -@pytest.mark.skip("What is the intended behaviour for this?") def test_store_overwrite_none(store_factory, bound_store_dataframes): df1 = pd.DataFrame({"x": [1], "y": [1]}) md1 = bound_store_dataframes( [df1], store=store_factory, dataset_uuid="uuid", partition_on=["x"] ) assert md1.tables == ["table"] - assert set(md1.schema.names) == set(df1.columns) + assert set(md1.table_meta["table"].names) == set(df1.columns) md2 = bound_store_dataframes( - [None], + [{}], store=store_factory, dataset_uuid="uuid", partition_on=["x"], @@ -622,16 +950,3 @@ def test_secondary_index_on_partition_column(store_factory, bound_store_datafram bound_store_dataframes( [df1], store=store_factory, partition_on=["x"], secondary_indices=["x"] ) - - -def test_non_default_table_name_roundtrip(store_factory, bound_store_dataframes): - df = pd.DataFrame({"A": [1]}) - bound_store_dataframes( - [df], store=store_factory, dataset_uuid="dataset_uuid", table_name="foo" - ) - for k in store_factory(): - if k.endswith(".parquet") and "indices" not in k: - assert "foo" in k - result = read_table(dataset_uuid="dataset_uuid", store=store_factory) - - pdt.assert_frame_equal(df, result) diff --git a/kartothek/io_components/cube/cleanup.py b/kartothek/io_components/cube/cleanup.py index 0fc45de9..a561445e 100644 --- a/kartothek/io_components/cube/cleanup.py +++ b/kartothek/io_components/cube/cleanup.py @@ -1,6 +1,6 @@ from functools import 
reduce -from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPARATOR +from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR from kartothek.utils.ktk_adapters import get_dataset_keys __all__ = ("get_keys_to_clean",) @@ -27,7 +27,7 @@ def get_keys_to_clean(cube_uuid_prefix, datasets, store): ) keys_present = { - k for k in store.iter_keys(cube_uuid_prefix + KTK_CUBE_UUID_SEPARATOR) + k for k in store.iter_keys(cube_uuid_prefix + KTK_CUBE_UUID_SEPERATOR) } return keys_present - keys_should diff --git a/kartothek/io_components/cube/query/__init__.py b/kartothek/io_components/cube/query/__init__.py index a60410e2..f81ae1fa 100644 --- a/kartothek/io_components/cube/query/__init__.py +++ b/kartothek/io_components/cube/query/__init__.py @@ -21,6 +21,7 @@ determine_intention, ) from kartothek.io_components.cube.query._regroup import regroup +from kartothek.io_components.metapartition import SINGLE_TABLE from kartothek.utils.ktk_adapters import get_dataset_columns __all__ = ("QueryGroup", "QueryIntention", "load_group", "plan_query", "quick_concat") @@ -332,7 +333,7 @@ def plan_query( empty_df = { ktk_cube_dataset_id: _reduce_empty_dtype_sizes( empty_dataframe_from_schema( - schema=ds.schema, + schema=ds.table_meta[SINGLE_TABLE], columns=sorted( get_dataset_columns(ds) & set(load_columns[ktk_cube_dataset_id]) ), diff --git a/kartothek/io_components/cube/query/_group.py b/kartothek/io_components/cube/query/_group.py index c50d7237..8884ff07 100644 --- a/kartothek/io_components/cube/query/_group.py +++ b/kartothek/io_components/cube/query/_group.py @@ -6,7 +6,7 @@ import attr import pandas as pd -from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.metapartition import SINGLE_TABLE, MetaPartition from kartothek.utils.converters import converter_str from kartothek.utils.pandas import ( concat_dataframes, @@ -85,10 +85,11 @@ def _load_all_mps(mps, store, load_columns, predicates, empty): mp = mp.load_dataframes( store=store, predicate_pushdown_to_io=True, - columns=sorted(load_columns), + tables=[SINGLE_TABLE], + columns={SINGLE_TABLE: sorted(load_columns)}, predicates=predicates, ) - df = mp.data + df = mp.data[SINGLE_TABLE] df.columns = df.columns.map(converter_str) dfs_mp.append(df) return concat_dataframes(dfs_mp, empty) diff --git a/kartothek/io_components/cube/query/_intention.py b/kartothek/io_components/cube/query/_intention.py index b5ad5b13..55eb15ed 100644 --- a/kartothek/io_components/cube/query/_intention.py +++ b/kartothek/io_components/cube/query/_intention.py @@ -12,7 +12,7 @@ from kartothek.core.cube.conditions import Conjunction from kartothek.serialization._parquet import _normalize_value from kartothek.utils.converters import converter_str_set, converter_str_tupleset -from kartothek.utils.ktk_adapters import get_dataset_columns +from kartothek.utils.ktk_adapters import get_dataset_columns, get_dataset_schema __all__ = ("QueryIntention", "determine_intention") @@ -120,7 +120,7 @@ def _test_condition_types(conditions, datasets): for ktk_cube_dataset_id in sorted(datasets.keys()): dataset = datasets[ktk_cube_dataset_id] - meta = dataset.schema + meta = get_dataset_schema(dataset) if col not in meta.names: continue pa_type = meta.field(col).type diff --git a/kartothek/io_components/cube/query/_regroup.py b/kartothek/io_components/cube/query/_regroup.py index 7a67a5d2..ebde6887 100644 --- a/kartothek/io_components/cube/query/_regroup.py +++ b/kartothek/io_components/cube/query/_regroup.py @@ -306,8 +306,8 @@ def 
_map_ktk_mps_to_groups(cube, datasets, label2gp): label2gp_sub = label2gp[ktk_cube_dataset_id] for mp in dispatch_metapartitions_from_factory( dataset_factory=metadata_factory_from_dataset(ds), + concat_partitions_on_primary_index=False, ): - # FIXME: can this be simplified? if mp.label not in label2gp_sub: # filtered out by pre-condition continue diff --git a/kartothek/io_components/cube/write.py b/kartothek/io_components/cube/write.py index b879f7a9..bf339c31 100644 --- a/kartothek/io_components/cube/write.py +++ b/kartothek/io_components/cube/write.py @@ -22,7 +22,7 @@ from kartothek.core.dataset import DatasetMetadataBuilder from kartothek.core.naming import metadata_key_from_uuid from kartothek.core.uuid import gen_uuid -from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.metapartition import SINGLE_TABLE, MetaPartition from kartothek.utils.converters import converter_str from kartothek.utils.pandas import mask_sorted_duplicates_keep_last, sort_dataframe @@ -130,7 +130,6 @@ def prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata): return ds_metadata -# XXX: This is not consistent with plain kartothek datasets (indices are accepted on nullable columns there) def assert_dimesion_index_cols_notnull( df: pd.DataFrame, ktk_cube_dataset_id: str, cube: Cube, partition_on: Sequence[str] ) -> pd.DataFrame: @@ -358,7 +357,9 @@ def prepare_data_for_ktk( # create MetaPartition object for easier handling mp = MetaPartition( - label=gen_uuid(), data=df, metadata_version=KTK_CUBE_METADATA_VERSION, + label=gen_uuid(), + data={SINGLE_TABLE: df}, + metadata_version=KTK_CUBE_METADATA_VERSION, ) del df @@ -366,8 +367,8 @@ def prepare_data_for_ktk( mp = mp.partition_on(list(partition_on)) # reset indices again (because partition_on breaks it) - for mp2 in mp: - mp2.data.reset_index(drop=True, inplace=True) + for mp2 in mp.metapartitions: + mp2["data"][SINGLE_TABLE].reset_index(drop=True, inplace=True) del mp2 # calculate indices @@ -435,7 +436,7 @@ def apply_postwrite_checks(datasets, cube, store, existing_datasets): empty_datasets = { ktk_cube_dataset_id for ktk_cube_dataset_id, ds in datasets.items() - if len(ds.partitions) == 0 + if SINGLE_TABLE not in ds.table_meta or len(ds.partitions) == 0 } if empty_datasets: @@ -552,9 +553,10 @@ def _rollback_transaction(existing_datasets, new_datasets, store): ds = existing_datasets[ktk_cube_dataset_id] builder = DatasetMetadataBuilder.from_dataset(ds) store.put(*builder.to_json()) - store_schema_metadata( - schema=ds.schema, dataset_uuid=ds.uuid, store=store, table=ds.table_name - ) + for table, schema in ds.table_meta.items(): + store_schema_metadata( + schema=schema, dataset_uuid=ds.uuid, store=store, table=table + ) def prepare_ktk_partition_on( diff --git a/kartothek/io_components/gc.py b/kartothek/io_components/gc.py index c0d693dc..951d0a14 100644 --- a/kartothek/io_components/gc.py +++ b/kartothek/io_components/gc.py @@ -7,7 +7,10 @@ def dispatch_files_to_gc(dataset_uuid, store_factory, chunk_size, factory): ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store_factory, factory=factory, + dataset_uuid=dataset_uuid, + store=store_factory, + factory=factory, + load_dataset_metadata=False, ) dataset_uuid = dataset_uuid or ds_factory.uuid diff --git a/kartothek/io_components/merge.py b/kartothek/io_components/merge.py new file mode 100644 index 00000000..d58b4524 --- /dev/null +++ b/kartothek/io_components/merge.py @@ -0,0 +1,120 @@ +import logging +from typing import Callable, Generator, List, 
Union + +from kartothek.core.dataset import DatasetMetadata +from kartothek.core.typing import StoreInput +from kartothek.core.utils import ensure_store +from kartothek.io_components.metapartition import MetaPartition + +LOGGER = logging.getLogger(__name__) + +try: + from typing_extensions import Literal # type: ignore +except ImportError: + from typing import Literal # type: ignore + + +def align_datasets( + left_dataset_uuid: str, + right_dataset_uuid: str, + store: StoreInput, + match_how: Union[Literal["exact", "prefix", "all"], Callable] = "exact", +) -> Generator[List[MetaPartition], None, None]: + """ + Determine dataset partition alignment + + Parameters + ---------- + left_dataset_uuid + right_dataset_uuid + store + match_how + + + Yields + ------ + List + + """ + store = ensure_store(store) + left_dataset = DatasetMetadata.load_from_store(uuid=left_dataset_uuid, store=store) + right_dataset = DatasetMetadata.load_from_store( + uuid=right_dataset_uuid, store=store + ) + + metadata_version = left_dataset.metadata_version + + # Loop over the dataset with fewer partitions, treating its keys as + # partition label prefixes + if ( + callable(match_how) + or match_how == "left" + or ( + match_how == "prefix" + and len(list(left_dataset.partitions.keys())[0]) + < len(list(right_dataset.partitions.keys())[0]) + ) + ): + first_dataset = left_dataset + second_dataset = right_dataset + else: + first_dataset = right_dataset + second_dataset = left_dataset + # The del statements are here to reduce confusion below + del left_dataset + del right_dataset + + # For every partition in the 'small' dataset, at least one partition match + # needs to be found in the larger dataset. + available_partitions = list(second_dataset.partitions.items()) + partition_stack = available_partitions[:] + + # TODO: write a test which protects against the following scenario!! + # Sort the partition labels by length of the labels, starting with the + # labels which are the longest. This way we prevent label matching for + # similar partitions, e.g. cluster_100 and cluster_1. This, of course, + # works only as long as the internal loop removes elements which were + # matched already (here improperly called stack) + for l_1 in sorted(first_dataset.partitions, key=len, reverse=True): + p_1 = first_dataset.partitions[l_1] + res = [ + MetaPartition.from_partition( + partition=p_1, metadata_version=metadata_version + ) + ] + for parts in available_partitions: + l_2, p_2 = parts + if callable(match_how) and not match_how(l_1, l_2): + continue + if match_how == "exact" and l_1 != l_2: + continue + elif match_how == "prefix" and not l_2.startswith(l_1): + LOGGER.debug("rejecting (%s, %s)", l_1, l_2) + continue + + LOGGER.debug( + "Found alignment between partitions " "(%s, %s) and" "(%s, %s)", + first_dataset.uuid, + p_1.label, + second_dataset.uuid, + p_2.label, + ) + res.append( + MetaPartition.from_partition( + partition=p_2, metadata_version=metadata_version + ) + ) + + # In exact or prefix matching schemes, it is expected to only + # find one partition alignment. 
in this case reduce the size of + # the inner loop + if match_how in ["exact", "prefix"]: + partition_stack.remove((l_2, p_2)) + # Need to copy, otherwise remove will alter the loop iterator + available_partitions = partition_stack[:] + if len(res) == 1: + raise RuntimeError( + "No matching partition for {} in dataset {} " + "found".format(p_1, first_dataset) + ) + yield res diff --git a/kartothek/io_components/metapartition.py b/kartothek/io_components/metapartition.py index 871b4da2..7e4636db 100644 --- a/kartothek/io_components/metapartition.py +++ b/kartothek/io_components/metapartition.py @@ -4,7 +4,8 @@ import os import time import warnings -from collections import namedtuple +from collections import defaultdict, namedtuple +from copy import copy from functools import wraps from typing import ( Any, @@ -24,7 +25,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from simplekv import KeyValueStore from kartothek.core import naming from kartothek.core.common_metadata import ( @@ -33,6 +33,7 @@ normalize_column_order, read_schema_metadata, validate_compatible, + validate_shared_columns, ) from kartothek.core.docs import default_docs from kartothek.core.index import ExplicitSecondaryIndex, IndexBase @@ -47,7 +48,12 @@ verify_metadata_version, ) from kartothek.core.uuid import gen_uuid -from kartothek.io_components.utils import align_categories +from kartothek.io_components.utils import ( + InferredIndices, + _ensure_valid_indices, + align_categories, + combine_metadata, +) from kartothek.serialization import ( DataFrameSerializer, PredicatesType, @@ -73,7 +79,8 @@ "number_rows_per_row_group": np.dtype(int), } -MetaPartitionInput = Optional[Union[pd.DataFrame, Sequence, "MetaPartition"]] +_MULTI_TABLE_DICT_LIST = Dict[str, Iterable[str]] +MetaPartitionInput = Union[Dict, pd.DataFrame, Sequence, "MetaPartition"] def _predicates_to_named(predicates): @@ -142,7 +149,6 @@ def _impl(self, *method_args, **method_kwargs): method_args, method_kwargs = _initialize_store_for_metapartition( method, method_args, method_kwargs ) - method_return = None # declare for mypy if (len(self) == 1) and (self.label is None): result = method(self, *method_args, **method_kwargs) else: @@ -189,11 +195,11 @@ def __next__(self): mp_dict = current.metapartitions[self.position] # These are global attributes, i.e. 
the nested metapartitions do not carry these and need # to be added here + mp_dict["dataset_metadata"] = current.dataset_metadata mp_dict["metadata_version"] = current.metadata_version - mp_dict["schema"] = current.schema + mp_dict["table_meta"] = current.table_meta mp_dict["partition_keys"] = current.partition_keys mp_dict["logical_conjunction"] = current.logical_conjunction - mp_dict["table_name"] = current.table_name self.position += 1 return MetaPartition.from_dict(mp_dict) @@ -209,12 +215,13 @@ class MetaPartition(Iterable): def __init__( self, label: Optional[str], - file: Optional[str] = None, - table_name: str = SINGLE_TABLE, - data: Optional[pd.DataFrame] = None, + files: Optional[Dict[str, str]] = None, + metadata: Any = None, + data: Optional[Dict[str, pd.DataFrame]] = None, + dataset_metadata: Optional[Dict] = None, indices: Optional[Dict[Any, Any]] = None, metadata_version: Optional[int] = None, - schema: Optional[SchemaWrapper] = None, + table_meta: Optional[Dict[str, SchemaWrapper]] = None, partition_keys: Optional[Sequence[str]] = None, logical_conjunction: Optional[List[Tuple[Any, str, Any]]] = None, ): @@ -241,6 +248,8 @@ def __init__( data A dictionary including the materialized in-memory DataFrames corresponding to the file references in `files`. + dataset_metadata + The metadata of the original dataset indices Kartothek index dictionary, metadata_version @@ -259,13 +268,15 @@ def __init__( else: self.metadata_version = metadata_version verify_metadata_version(self.metadata_version) - self.schema = schema - self.table_name = table_name - if data is not None and schema is None: - self.schema = make_meta( - data, origin=f"{table_name}/{label}", partition_keys=partition_keys - ) - + self.table_meta = table_meta if table_meta else {} + if isinstance(data, dict) and (len(self.table_meta) == 0): + for table, df in data.items(): + if df is not None: + self.table_meta[table] = make_meta( + df, + origin="{}/{}".format(table, label), + partition_keys=partition_keys, + ) indices = indices or {} for column, index_dct in indices.items(): if isinstance(index_dct, dict): @@ -276,12 +287,13 @@ def __init__( self.metapartitions = [ { "label": label, - "data": data, - "file": file or None, + "data": data or {}, + "files": files or {}, "indices": indices, "logical_conjunction": logical_conjunction, } ] + self.dataset_metadata = dataset_metadata or {} self.partition_keys = partition_keys or [] def __repr__(self): @@ -289,8 +301,11 @@ def __repr__(self): label = "NESTED ({})".format(len(self.metapartitions)) else: label = self.label - return "<{_class} v{version} | {label} >".format( - version=self.metadata_version, _class=self.__class__.__name__, label=label + return "<{_class} v{version} | {label} | tables {tables} >".format( + version=self.metadata_version, + _class=self.__class__.__name__, + label=label, + tables=sorted(set(self.table_meta.keys())), ) def __len__(self): @@ -315,25 +330,25 @@ def data(self): return self.metapartitions[0]["data"] @property - def file(self) -> str: + def files(self): if len(self.metapartitions) > 1: raise AttributeError( "Accessing `files` attribute is not allowed while nested" ) - return cast(str, self.metapartitions[0]["file"]) + return self.metapartitions[0]["files"] @property - def is_sentinel(self) -> bool: + def is_sentinel(self): return len(self.metapartitions) == 1 and self.label is None @property - def label(self) -> str: + def label(self): if len(self.metapartitions) > 1: raise AttributeError( "Accessing `label` attribute is not allowed while 
nested" ) assert isinstance(self.metapartitions[0], dict), self.metapartitions[0] - return cast(str, self.metapartitions[0]["label"]) + return self.metapartitions[0]["label"] @property def indices(self): @@ -344,8 +359,12 @@ def indices(self): return self.metapartitions[0]["indices"] @property - def partition(self) -> Partition: - return Partition(label=self.label, files={self.table_name: self.file}) + def tables(self): + return list(set(self.data.keys()).union(set(self.files.keys()))) + + @property + def partition(self): + return Partition(label=self.label, files=self.files) def __eq__(self, other): if not isinstance(other, MetaPartition): @@ -354,7 +373,15 @@ def __eq__(self, other): if self.metadata_version != other.metadata_version: return False - if self.schema is not None and not self.schema.equals(other.schema): + for table, meta in self.table_meta.items(): + # https://issues.apache.org/jira/browse/ARROW-5873 + other_meta = other.table_meta.get(table, None) + if other_meta is None: + return False + if not meta.equals(other_meta): + return False + + if self.dataset_metadata != other.dataset_metadata: return False if len(self.metapartitions) != len(other.metapartitions): @@ -377,27 +404,33 @@ def __eq__(self, other): # This is unnested only - if self.label != other.label: + self_keys = set(self.data.keys()) + other_keys = set(other.data.keys()) + if not (self_keys == other_keys): return False - if self.file != other.file: + if self.label != other.label: return False - if self.data is not None and not self.data.equals(other.data): + if self.files != other.files: return False + for label, df in self.data.items(): + if not (df.equals(other.data[label])): + return False + return True @staticmethod def from_partition( partition: Partition, - data: Optional[pd.DataFrame] = None, - indices: Optional[Dict] = None, + data: Optional[Dict] = None, + dataset_metadata: Dict = None, + indices: Dict = None, metadata_version: Optional[int] = None, - schema: Optional[SchemaWrapper] = None, + table_meta: Optional[Dict] = None, partition_keys: Optional[List[str]] = None, logical_conjunction: Optional[List[Tuple[Any, str, Any]]] = None, - table_name: str = SINGLE_TABLE, ): """ Transform a kartothek :class:`~kartothek.core.partition.Partition` into a @@ -409,9 +442,11 @@ def from_partition( The kartothek partition to be wrapped data A dictionaries with materialised :class:`~pandas.DataFrame` - indices : dict + dataset_metadata + The metadata of the original dataset + indices The index dictionary of the dataset - schema + table_meta Type metadata for each table, optional metadata_version partition_keys @@ -423,32 +458,41 @@ def from_partition( """ return MetaPartition( label=partition.label, - file=partition.files[table_name], + files=partition.files, data=data, + dataset_metadata=dataset_metadata, indices=indices, metadata_version=metadata_version, - schema=schema, + table_meta=table_meta, partition_keys=partition_keys, logical_conjunction=logical_conjunction, - table_name=table_name, ) def add_metapartition( - self, metapartition: "MetaPartition", schema_validation: bool = True, + self, + metapartition: "MetaPartition", + metadata_merger: Optional[Callable] = None, + schema_validation: bool = True, ): """ Adds a metapartition to the internal list structure to enable batch processing. + The top level `dataset_metadata` dictionary is combined with the existing dict and + all other attributes are stored in the `metapartitions` list + Parameters ---------- metapartition The MetaPartition to be added. 
+ metadata_merger + A callable to perform the metadata merge. By default [kartothek.io_components.utils.combine_metadata] is used schema_validation If True (default), ensure that the `table_meta` of both `MetaPartition` objects are the same """ if self.is_sentinel: return metapartition + table_meta = metapartition.table_meta existing_label = [mp_["label"] for mp_ in self.metapartitions] if any( @@ -457,21 +501,28 @@ def add_metapartition( raise RuntimeError( "Duplicate labels for nested metapartitions are not allowed!" ) - schema = metapartition.schema - if schema_validation and schema: - # This ensures that only schema-compatible metapartitions can be nested - # The returned schema by validate_compatible is the reference schema with the most - # information, i.e. the fewest null columns - schema = validate_compatible([self.schema, metapartition.schema]) + if schema_validation: + table_meta = {} + for table, meta in self.table_meta.items(): + other = metapartition.table_meta.get(table, None) + # This ensures that only schema-compatible metapartitions can be nested + # The returned schema by validate_compatible is the reference schema with the most + # information, i.e. the fewest null columns + table_meta[table] = validate_compatible([meta, other]) + + metadata_merger = metadata_merger or combine_metadata + new_dataset_metadata = metadata_merger( + [self.dataset_metadata, metapartition.dataset_metadata] + ) new_object = MetaPartition( label="NestedMetaPartition", + dataset_metadata=new_dataset_metadata, metadata_version=metapartition.metadata_version, - schema=schema, + table_meta=table_meta, partition_keys=metapartition.partition_keys or None, logical_conjunction=metapartition.logical_conjunction or None, - table_name=metapartition.table_name, ) # Add metapartition information to the new object @@ -497,12 +548,13 @@ def from_dict(dct): """ return MetaPartition( label=dct["label"], - file=dct.get("file", None), - data=dct.get("data", None), - table_name=dct.get("table_name", SINGLE_TABLE), + files=dct.get("files", {}), + metadata=dct.get("metadata", {}), + data=dct.get("data", {}), indices=dct.get("indices", {}), metadata_version=dct.get("metadata_version", None), - schema=dct.get("schema", None), + dataset_metadata=dct.get("dataset_metadata", {}), + table_meta=dct.get("table_meta", {}), partition_keys=dct.get("partition_keys", None), logical_conjunction=dct.get("logical_conjunction", None), ) @@ -510,14 +562,14 @@ def from_dict(dct): def to_dict(self): return { "label": self.label, - "file": self.file, - "data": self.data, + "files": self.files or {}, + "data": self.data or {}, "indices": self.indices, "metadata_version": self.metadata_version, - "schema": self.schema, + "dataset_metadata": self.dataset_metadata, + "table_meta": self.table_meta, "partition_keys": self.partition_keys, "logical_conjunction": self.logical_conjunction, - "table_name": self.table_name, } @_apply_to_list @@ -525,7 +577,7 @@ def remove_dataframes(self): """ Remove all dataframes from the metapartition in memory. 
""" - return self.copy(data=None) + return self.copy(data={}) def _split_predicates_in_index_and_content(self, predicates): """ @@ -549,13 +601,13 @@ def _split_predicates_in_index_and_content(self, predicates): split_predicates.append(_SplitPredicate(key_part, content_part)) return split_predicates, has_index_condition - def _apply_partition_key_predicates(self, indices, split_predicates): + def _apply_partition_key_predicates(self, table, indices, split_predicates): """ Apply the predicates to the partition_key columns and return the remaining predicates that should be pushed to the DataFrame serialiser. """ # Construct a single line DF with the partition columns - schema = self.schema + schema = self.table_meta[table] index_df_dct = {} for column, value in indices: pa_dtype = schema[schema.get_field_index(column)].type @@ -595,11 +647,12 @@ def _apply_partition_key_predicates(self, indices, split_predicates): @_apply_to_list def load_dataframes( self, - store: KeyValueStore, - columns: Optional[Sequence[str]] = None, + store: StoreInput, + tables: _MULTI_TABLE_DICT_LIST = None, + columns: _MULTI_TABLE_DICT_LIST = None, predicate_pushdown_to_io: bool = True, - categoricals: Optional[Sequence[str]] = None, - dates_as_object: bool = True, + categoricals: _MULTI_TABLE_DICT_LIST = None, + dates_as_object: bool = False, predicates: PredicatesType = None, ) -> "MetaPartition": """ @@ -631,126 +684,149 @@ def load_dataframes( } """ + if columns is None: + columns = {} + elif set(columns).difference(self.tables): + raise ( + ValueError( + "You are trying to read columns from invalid table(s): {}".format( + set(columns).difference(self.tables) + ) + ) + ) if categoricals is None: - categoricals = [] - if not dates_as_object: - warnings.warn( - "The argument `date_as_object` is set to False. This argument will be deprecated and the future behaviour will be as if the paramere was set to `True`. Please migrate your code accordingly ahead of time.", - DeprecationWarning, - ) + categoricals = {} LOGGER.debug("Loading internal dataframes of %s", self.label) - if not self.file: + if len(self.files) == 0: # This used to raise, but the specs do not require this, so simply do a no op - LOGGER.debug("Partition %s is empty and has no data.", self.label) + LOGGER.debug("Partition %s is empty and has not tables/files", self.label) return self + new_data = copy(self.data) predicates = _combine_predicates(predicates, self.logical_conjunction) predicates = _predicates_to_named(predicates) - dataset_uuid, _, indices, _ = decode_key(self.file) - - # In case the columns only refer to the partition indices, we need to load at least a single column to - # determine the length of the required dataframe. - table_columns_to_io = columns - - filtered_predicates = predicates - - self = self.load_schema(dataset_uuid=dataset_uuid, store=store) - - # Filter predicates that would apply to this partition and remove the partition columns - if predicates: - # Check if there are predicates that match to the partition columns. - # For these we need to check if the partition columns already falsify - # the conditition. - # - # We separate these predicates into their index and their Parquet part. 
- ( - split_predicates, - has_index_condition, - ) = self._split_predicates_in_index_and_content(predicates) + for table, key in self.files.items(): + table_columns = columns.get(table, None) + categories = categoricals.get(table, None) + dataset_uuid, _, indices, file_name = decode_key(key) + if tables and table not in tables: + continue - filtered_predicates = [] - if has_index_condition: - filtered_predicates = self._apply_partition_key_predicates( - indices, split_predicates - ) + # In case the columns only refer to the partition indices, we need to load at least a single column to + # determine the length of the required dataframe. + if table_columns is None: + table_columns_to_io = None else: - filtered_predicates = [pred.content_part for pred in split_predicates] - - # Remove partition_keys from table_columns_to_io - if self.partition_keys and table_columns_to_io is not None: - keys_to_remove = set(self.partition_keys) & set(table_columns_to_io) - # This is done to not change the ordering of the list - table_columns_to_io = [ - c for c in table_columns_to_io if c not in keys_to_remove - ] - - start = time.time() - df = DataFrameSerializer.restore_dataframe( - key=self.file, - store=store, - columns=table_columns_to_io, - categories=categoricals, - predicate_pushdown_to_io=predicate_pushdown_to_io, - predicates=filtered_predicates, - date_as_object=dates_as_object, - ) - LOGGER.debug( - "Loaded dataframe %s in %s seconds.", self.file, time.time() - start - ) - # Metadata version >=4 parse the index columns and add them back to the dataframe - - df = self._reconstruct_index_columns( - df=df, - key_indices=indices, - columns=columns, - categories=categoricals, - date_as_object=dates_as_object, - ) - - df.columns = df.columns.map(ensure_string_type) - if columns is not None: - # TODO: When the write-path ensures that all partitions have the same column set, this check can be - # moved before `DataFrameSerializer.restore_dataframe`. At the position of the current check we - # may want to double check the columns of the loaded DF and raise an exception indicating an - # inconsistent dataset state instead. - missing_cols = set(columns).difference(df.columns) - if missing_cols: - raise ValueError( - "Columns cannot be found in stored dataframe: {}".format( - ", ".join(sorted(missing_cols)) + table_columns_to_io = table_columns + + filtered_predicates = predicates + + self._load_table_meta(dataset_uuid=dataset_uuid, table=table, store=store) + + # Filter predicates that would apply to this partition and remove the partition columns + if predicates: + # Check if there are predicates that match to the partition columns. + # For these we need to check if the partition columns already falsify + # the conditition. + # + # We separate these predicates into their index and their Parquet part. 
+ ( + split_predicates, + has_index_condition, + ) = self._split_predicates_in_index_and_content(predicates) + + filtered_predicates = [] + if has_index_condition: + filtered_predicates = self._apply_partition_key_predicates( + table, indices, split_predicates ) - ) + else: + filtered_predicates = [ + pred.content_part for pred in split_predicates + ] + + # Remove partition_keys from table_columns_to_io + if self.partition_keys and table_columns_to_io is not None: + keys_to_remove = set(self.partition_keys) & set(table_columns_to_io) + # This is done to not change the ordering of the list + table_columns_to_io = [ + c for c in table_columns_to_io if c not in keys_to_remove + ] + + start = time.time() + df = DataFrameSerializer.restore_dataframe( + key=key, + store=store, + columns=table_columns_to_io, + categories=categories, + predicate_pushdown_to_io=predicate_pushdown_to_io, + predicates=filtered_predicates, + date_as_object=dates_as_object, + ) + LOGGER.debug("Loaded dataframe %s in %s seconds.", key, time.time() - start) + # Metadata version >=4 parse the index columns and add them back to the dataframe + + df = self._reconstruct_index_columns( + df=df, + key_indices=indices, + table=table, + columns=table_columns, + categories=categories, + date_as_object=dates_as_object, + ) - if list(df.columns) != columns: - df = df.reindex(columns=columns, copy=False) + df.columns = df.columns.map(ensure_string_type) + if table_columns is not None: + # TODO: When the write-path ensures that all partitions have the same column set, this check can be + # moved before `DataFrameSerializer.restore_dataframe`. At the position of the current check we + # may want to double check the columns of the loaded DF and raise an exception indicating an + # inconsistent dataset state instead. 
+ missing_cols = set(table_columns).difference(df.columns) + if missing_cols: + raise ValueError( + "Columns cannot be found in stored dataframe: {}".format( + ", ".join(sorted(missing_cols)) + ) + ) - return self.copy(data=df) + if list(df.columns) != table_columns: + df = df.reindex(columns=table_columns, copy=False) + new_data[table] = df + return self.copy(data=new_data) @_apply_to_list - def load_schema(self, store: StoreInput, dataset_uuid: str) -> "MetaPartition": + def load_all_table_meta( + self, store: StoreInput, dataset_uuid: str + ) -> "MetaPartition": """ Loads all table metadata in memory and stores it under the `tables` attribute """ + for table in self.files: + self._load_table_meta(dataset_uuid, table, store) + return self - if self.schema is None: - store = ensure_store(store) - self.schema = read_schema_metadata( - dataset_uuid=dataset_uuid, store=store, table=self.table_name + def _load_table_meta( + self, dataset_uuid: str, table: str, store: StoreInput + ) -> "MetaPartition": + if table not in self.table_meta: + _common_metadata = read_schema_metadata( + dataset_uuid=dataset_uuid, store=store, table=table ) + self.table_meta[table] = _common_metadata return self def _reconstruct_index_columns( - self, df, key_indices, columns, categories, date_as_object + self, df, key_indices, table, columns, categories, date_as_object ): if len(key_indices) == 0: return df original_columns = list(df.columns) zeros = np.zeros(len(df), dtype=int) - schema = self.schema + schema = self.table_meta[table] # One of the few places `inplace=True` makes a signifcant difference df.reset_index(drop=True, inplace=True) @@ -767,8 +843,7 @@ def _reconstruct_index_columns( # indexer call is slow, so only do that if really necessary df = df.reindex(columns=cleaned_original_columns, copy=False) - pos = 0 - for primary_key, value in key_indices: + for pos, (primary_key, value) in enumerate(key_indices): # If there are predicates, don't reconstruct the index if it wasn't requested if columns is not None and primary_key not in columns: continue @@ -802,10 +877,86 @@ def _reconstruct_index_columns( if convert_to_date: value = pd.Timestamp(value).to_pydatetime().date() df.insert(pos, primary_key, value) - pos += 1 return df + @_apply_to_list + def merge_dataframes( + self, + left: str, + right: str, + output_label: str, + merge_func: Callable = pd.merge, + merge_kwargs: Optional[Dict] = None, + ): + """ + Merge internal dataframes. + + The two referenced dataframes are removed from the internal list and + the newly created dataframe is added. + + The merge itself can be completely customized by supplying a + callable `merge_func(left_df, right_df, **merge_kwargs)` which can + handle data pre-processing as well as the merge itself. + + The files attribute of the result will be empty since the in-memory + DataFrames are no longer representations of the referenced files. + + Parameters + ---------- + left + Category of the left dataframe. + right + Category of the right dataframe. + output_label + Category for the newly created dataframe + merge_func + The function to take care of the merge. By default: pandas.merge. 
+ The function should have the signature + `func(left_df, right_df, **kwargs)` + merge_kwargs + Keyword arguments which should be supplied to the merge function + + Returns + ------- + MetaPartition + + """ + # Shallow copy + new_data = copy(self.data) + if merge_kwargs is None: + merge_kwargs = {} + + left_df = new_data.pop(left) + right_df = new_data.pop(right) + + LOGGER.debug("Merging internal dataframes of %s", self.label) + + try: + df_merged = merge_func(left_df, right_df, **merge_kwargs) + except TypeError: + LOGGER.error( + "Tried to merge using %s with\n left:%s\nright:%s\n " "kwargs:%s", + merge_func.__name__, + left_df.head(), + right_df.head(), + merge_kwargs, + ) + raise + + new_data[output_label] = df_merged + new_table_meta = copy(self.table_meta) + # The tables are no longer part of the MetaPartition, thus also drop + # their schema. + del new_table_meta[left] + del new_table_meta[right] + new_table_meta[output_label] = make_meta( + df_merged, + origin="{}/{}".format(output_label, self.label), + partition_keys=self.partition_keys, + ) + return self.copy(files={}, data=new_data, table_meta=new_table_meta) + @_apply_to_list def validate_schema_compatible( self, store: StoreInput, dataset_uuid: str @@ -824,16 +975,25 @@ def validate_schema_compatible( # Load the reference meta of the existing dataset. Using the built-in # `load_all_table_meta` would not be helpful here as it would be a no-op # as we have already loaded the meta from the input DataFrame. - store = ensure_store(store) - reference_meta = read_schema_metadata( - dataset_uuid=dataset_uuid, store=store, table=self.table_name - ) - try: - validate_compatible([self.schema, reference_meta]) - except ValueError as e: - raise ValueError( - f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}" + reference_meta = {} + for table in self.table_meta: + _common_metadata = read_schema_metadata( + dataset_uuid=dataset_uuid, store=store, table=table ) + reference_meta[table] = _common_metadata + + result = {} + for table, schema in self.table_meta.items(): + try: + result[table] = validate_compatible([schema, reference_meta[table]]) + except ValueError as e: + raise ValueError( + "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}".format( + table=table, dataset_uuid=dataset_uuid, e=e + ) + ) + validate_shared_columns(list(result.values())) + return self @_apply_to_list @@ -842,6 +1002,8 @@ def store_dataframes( store: StoreInput, dataset_uuid: str, df_serializer: Optional[DataFrameSerializer] = None, + store_metadata: bool = False, + metadata_storage_format: Optional[str] = None, ) -> "MetaPartition": """ Stores all dataframes of the MetaPartitions and registers the saved @@ -862,17 +1024,18 @@ def store_dataframes( df_serializer = ( df_serializer if df_serializer is not None else default_serializer() ) + file_dct = {} - key = get_partition_file_prefix( - partition_label=self.label, - dataset_uuid=dataset_uuid, - metadata_version=self.metadata_version, - table=self.table_name, - ) - if self.data is not None: - df = self.data + for table, df in self.data.items(): + key = get_partition_file_prefix( + partition_label=self.label, + dataset_uuid=dataset_uuid, + table=table, + metadata_version=self.metadata_version, + ) + LOGGER.debug("Store dataframe for table `%s` to %s ...", table, key) try: - file = df_serializer.store(store, key, df) + file_dct[table] = df_serializer.store(store, key, df) except Exception as exc: try: if isinstance(df, pd.DataFrame): @@ -889,15 +1052,67 @@ def 
store_dataframes( pass finally: raise + LOGGER.debug("Storage of dataframe for table `%s` successful", table) + + new_metapartition = self.copy(files=file_dct, data={}) + + return new_metapartition + + @_apply_to_list + def concat_dataframes(self): + """ + Concatenates all dataframes with identical columns. - new_metapartition = self.copy(file=file, data=None) + In case of changes on the dataframes, the files attribute will be + emptied since the in-memory DataFrames are no longer representations + of the referenced files. - return new_metapartition + Returns + ------- + MetaPartition + A metapartition where common column dataframes are merged. The + file attribute will be empty since there is no direct relation + between the referenced files and the in-memory dataframes anymore + + """ + + count_cols = defaultdict(list) + for label, df in self.data.items(): + # List itself is not hashable + key = "".join(sorted(df.columns)) + count_cols[key].append((label, df)) + is_modified = False + new_data = {} + for _, tuple_list in count_cols.items(): + if len(tuple_list) > 1: + is_modified = True + data = [x[1] for x in tuple_list] + label = _unique_label([x[0] for x in tuple_list]) + new_data[label] = pd.concat(data).reset_index(drop=True) + else: + label, df = tuple_list[0] + new_data[label] = df + new_table_meta = { + label: make_meta( + df, + origin="{}/{}".format(self.label, label), + partition_keys=self.partition_keys, + ) + for (label, df) in new_data.items() + } + if is_modified: + return self.copy(files={}, data=new_data, table_meta=new_table_meta) else: return self @_apply_to_list - def apply(self, func: Callable, type_safe: bool = False,) -> "MetaPartition": + def apply( + self, + func: Union[Callable, Dict[str, Callable]], + tables: Optional[List[str]] = None, + metadata: Optional[Dict] = None, + type_safe: bool = False, + ) -> "MetaPartition": """ Applies a given function to all dataframes of the MetaPartition. @@ -905,25 +1120,52 @@ def apply(self, func: Callable, type_safe: bool = False,) -> "MetaPartition": ---------- func A callable accepting and returning a :class:`pandas.DataFrame` - uuid : + tables + Only apply and return the function on the given tables. + Note: behavior will change in future versions! + New behavior will be: + Only apply the provided function to the given tables + uuid The changed dataset is assigned a new UUID. type_safe If the transformation is type-safe, optimizations can be applied """ - - new_data = func(self.data) - - if type_safe: - new_schema = self.schema + if tables is None: + tables = self.data.keys() else: - new_schema = make_meta( - new_data, origin=self.label, partition_keys=self.partition_keys, + warnings.warn( + "The behavior for passing ``table`` parameter to ``MetaPartition.apply`` will " + "change in the next major version. The future behavior will be to return all " + "data and only apply the function to the selected tables. 
All other tables " + "will be left untouched.", + FutureWarning, + ) + if callable(func): + new_data = {k: func(v) for k, v in self.data.items() if k in tables} + elif isinstance(func, dict): + new_data = {k: func[k](v) for k, v in self.data.items() if k in tables} + if metadata: + warnings.warn( + "The keyword argument ``metadata`` doesn't have any effect and will be removed soon.", + DeprecationWarning, ) - return self.copy(data=new_data, schema=new_schema) + if type_safe: + new_table_meta = self.table_meta + else: + new_table_meta = { + table: make_meta( + df, + origin="{}/{}".format(self.label, table), + partition_keys=self.partition_keys, + ) + for table, df in new_data.items() + } + return self.copy(data=new_data, table_meta=new_table_meta) def as_sentinel(self): - """""" + """ + """ return MetaPartition( None, metadata_version=self.metadata_version, @@ -936,9 +1178,12 @@ def copy(self, **kwargs): """ def _renormalize_meta(meta): - if "partition_keys" in kwargs and meta is not None: + if "partition_keys" in kwargs: pk = kwargs["partition_keys"] - return normalize_column_order(meta, pk) + return { + table: normalize_column_order(schema, pk) + for table, schema in meta.items() + } else: return meta @@ -948,33 +1193,39 @@ def _renormalize_meta(meta): first_mp = metapartitions.pop() mp_parent = MetaPartition( label=first_mp.get("label"), - file=first_mp.get("file"), + files=first_mp.get("files"), + metadata=first_mp.get("metadata"), data=first_mp.get("data"), + dataset_metadata=kwargs.get("dataset_metadata", self.dataset_metadata), indices=first_mp.get("indices"), metadata_version=self.metadata_version, - schema=_renormalize_meta(kwargs.get("schema", self.schema)), + table_meta=_renormalize_meta(kwargs.get("table_meta", self.table_meta)), partition_keys=kwargs.get("partition_keys", self.partition_keys), logical_conjunction=kwargs.get( "logical_conjunction", self.logical_conjunction ), - table_name=kwargs.get("table_name", self.table_name), ) for mp in metapartitions: mp_parent = mp_parent.add_metapartition( MetaPartition( label=mp.get("label"), - file=mp.get("file"), + files=mp.get("files"), + metadata=mp.get("metadata"), data=mp.get("data"), + dataset_metadata=mp.get( + "dataset_metadata", self.dataset_metadata + ), indices=mp.get("indices"), metadata_version=self.metadata_version, - schema=_renormalize_meta(kwargs.get("schema", self.schema)), + table_meta=_renormalize_meta( + kwargs.get("table_meta", self.table_meta) + ), partition_keys=kwargs.get( "partition_keys", self.partition_keys ), logical_conjunction=kwargs.get( "logical_conjunction", self.logical_conjunction ), - table_name=kwargs.get("table_name", self.table_name), ), schema_validation=False, ) @@ -982,16 +1233,16 @@ def _renormalize_meta(meta): else: mp = MetaPartition( label=kwargs.get("label", self.label), - file=kwargs.get("file", self.file), + files=kwargs.get("files", self.files), data=kwargs.get("data", self.data), + dataset_metadata=kwargs.get("dataset_metadata", self.dataset_metadata), indices=kwargs.get("indices", self.indices), metadata_version=kwargs.get("metadata_version", self.metadata_version), - schema=_renormalize_meta(kwargs.get("schema", self.schema)), + table_meta=_renormalize_meta(kwargs.get("table_meta", self.table_meta)), partition_keys=kwargs.get("partition_keys", self.partition_keys), logical_conjunction=kwargs.get( "logical_conjunction", self.logical_conjunction ), - table_name=kwargs.get("table_name", self.table_name), ) return mp @@ -1010,24 +1261,31 @@ def build_indices(self, columns: 
Iterable[str]): new_indices = {} for col in columns: - possible_values: Set[str] = set() + col_in_partition = False + for df in self.data.values(): + + if col in df: + possible_values = possible_values | set(df[col].dropna().unique()) + col_in_partition = True - df = self.data - if not self.is_sentinel and col not in df: + if (self.label is not None) and (not col_in_partition): raise RuntimeError( - "Column `{corrupt_col}` could not be found in the partition `{partition_label}` Please check for any typos and validate your dataset.".format( - corrupt_col=col, partition_label=self.label + "Column `{corrupt_col}` could not be found in the partition `{partition_label}` " + "with tables `{tables}`. Please check for any typos and validate your dataset.".format( + corrupt_col=col, + partition_label=self.label, + tables=sorted(self.data.keys()), ) ) - possible_values = possible_values | set(df[col].dropna().unique()) - - if self.schema is not None: - dtype = self.schema.field(col).type - else: - dtype = None - + # There is at least one table with this column (see check above), so we can get the dtype from there. Also, + # shared dtypes are ensured to be compatible. + dtype = list( + meta.field(col).type + for meta in self.table_meta.values() + if col in meta.names + )[0] new_index = ExplicitSecondaryIndex( column=col, index_dct={value: [self.label] for value in possible_values}, @@ -1087,13 +1345,13 @@ def partition_on(self, partition_on: Union[str, Sequence[str]]): raise ValueError( "Trying to `partition_on` on a column with an explicit index!" ) - if self.is_sentinel: - return self.copy(partition_keys=partition_on) - else: - new_mp = self.as_sentinel().copy( - partition_keys=partition_on, - schema=normalize_column_order(self.schema, partition_on), - ) + new_mp = self.as_sentinel().copy( + partition_keys=partition_on, + table_meta={ + table: normalize_column_order(schema, partition_on) + for table, schema in self.table_meta.items() + }, + ) if isinstance(partition_on, str): partition_on = [partition_on] @@ -1101,18 +1359,21 @@ def partition_on(self, partition_on: Union[str, Sequence[str]]): new_data = self._partition_data(partition_on) - for label, data in new_data.items(): + for label, data_dct in new_data.items(): tmp_mp = MetaPartition( label=label, - file=self.file, - data=data, + files=self.files, + data=data_dct, + dataset_metadata=self.dataset_metadata, metadata_version=self.metadata_version, indices={}, - schema=normalize_column_order(self.schema, partition_on).with_origin( - f"{label}" - ), + table_meta={ + table: normalize_column_order(schema, partition_on).with_origin( + "{}/{}".format(table, label) + ) + for table, schema in self.table_meta.items() + }, partition_keys=partition_on, - table_name=self.table_name, ) new_mp = new_mp.add_metapartition(tmp_mp, schema_validation=False) if self.indices: @@ -1142,58 +1403,64 @@ def _partition_data(self, partition_on): 2: ] dct = dict() - df = self.data + empty_tables = [] - # Check that data sizes do not change. This might happen if the - # groupby below drops data, e.g. nulls - size_after = 0 - size_before = len(df) + for table, df in self.data.items(): + # Check that data sizes do not change. This might happen if the + # groupby below drops data, e.g. 
nulls + size_after = 0 + size_before = len(df) - # Implementation from pyarrow - # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030 # noqa: E501 + # Implementation from pyarrow + # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030 # noqa: E501 - # column sanity checks - data_cols = set(df.columns).difference(partition_on) - missing_po_cols = set(partition_on).difference(df.columns) - if missing_po_cols: - raise ValueError( - "Partition column(s) missing: {}".format( - ", ".join(sorted(missing_po_cols)) + # column sanity checks + data_cols = set(df.columns).difference(partition_on) + missing_po_cols = set(partition_on).difference(df.columns) + if missing_po_cols: + raise ValueError( + "Partition column(s) missing: {}".format( + ", ".join(sorted(missing_po_cols)) + ) ) - ) - if len(data_cols) == 0: - raise ValueError("No data left to save outside partition columns") - - # To be aligned with open source tooling we drop the index columns and recreate - # them upon reading as it is done by fastparquet and pyarrow - partition_keys = [df[col] for col in partition_on] - - # # The handling of empty dfs is not part of the arrow implementation - # if df.empty: - # return {} - - data_df = df.drop(partition_on, axis="columns") - for value, group in data_df.groupby(by=partition_keys, sort=False): - partitioning_info = [] - if pd.api.types.is_scalar(value): - value = [value] - if existing_indices: - partitioning_info.extend(quote_indices(existing_indices)) - partitioning_info.extend(quote_indices(zip(partition_on, value))) - partitioning_info.append(base_label) - new_label = "/".join(partitioning_info) - - if new_label not in dct: - dct[new_label] = {} - dct[new_label] = group - size_after += len(group) - - if size_before != size_after: - raise ValueError( - f"Original dataframe size ({size_before} rows) does not " - f"match new dataframe size ({size_after} rows). " - f"Hint: you may see this if you are trying to use `partition_on` on a column with null values." - ) + if len(data_cols) == 0: + raise ValueError("No data left to save outside partition columns") + + # To be aligned with open source tooling we drop the index columns and recreate + # them upon reading as it is done by fastparquet and pyarrow + partition_keys = [df[col] for col in partition_on] + + # The handling of empty dfs is not part of the arrow implementation + if df.empty: + empty_tables.append((table, df)) + + data_df = df.drop(partition_on, axis="columns") + for value, group in data_df.groupby(by=partition_keys, sort=False): + partitioning_info = [] + if pd.api.types.is_scalar(value): + value = [value] + if existing_indices: + partitioning_info.extend(quote_indices(existing_indices)) + partitioning_info.extend(quote_indices(zip(partition_on, value))) + partitioning_info.append(base_label) + new_label = "/".join(partitioning_info) + + if new_label not in dct: + dct[new_label] = {} + dct[new_label][table] = group + size_after += len(group) + + if size_before != size_after: + raise ValueError( + f"Original dataframe size ({size_before} rows) does not " + f"match new dataframe size ({size_after} rows) for table {table}. " + f"Hint: you may see this if you are trying to use `partition_on` on a column with null values." 
+ ) + + for label, table_dct in dct.items(): + for empty_table, df in empty_tables: + if empty_table not in table_dct: + table_dct[empty_table] = df.drop(labels=partition_on, axis=1) return dct @@ -1224,38 +1491,96 @@ def _merge_labels(metapartitions, label_merger=None): return new_label @staticmethod - def concat_metapartitions(metapartitions, label_merger=None): - LOGGER.debug("Concatenating metapartitions") + def _merge_metadata(metapartitions, metadata_merger=None): + if metadata_merger is None: + metadata_merger = combine_metadata + + new_ds_meta = metadata_merger([mp.dataset_metadata for mp in metapartitions]) + + return new_ds_meta + @staticmethod + def merge_metapartitions(metapartitions, label_merger=None, metadata_merger=None): + LOGGER.debug("Merging metapartitions") + data = defaultdict(list) new_metadata_version = -1 - data = [] - schema = [] + logical_conjunction = None + for mp in metapartitions: new_metadata_version = max(new_metadata_version, mp.metadata_version) - data.append(mp.data) - schema.append(mp.schema) + for label, df in mp.data.items(): + data[label].append(df) + if mp.logical_conjunction or logical_conjunction: + if logical_conjunction != mp.logical_conjunction: + raise TypeError( + "Can only merge metapartitions belonging to the same logical partition." + ) + else: + logical_conjunction = mp.logical_conjunction + + new_data = {} + for label in data: + if len(data[label]) == 1: + new_data[label] = data[label][0] + else: + for ix, idf in enumerate(data[label]): + new_label = "{}_{}".format(label, ix) + new_data[new_label] = idf + + new_label = MetaPartition._merge_labels(metapartitions, label_merger) + new_ds_meta = MetaPartition._merge_metadata(metapartitions, metadata_merger) + + new_mp = MetaPartition( + label=new_label, + data=new_data, + dataset_metadata=new_ds_meta, + metadata_version=new_metadata_version, + logical_conjunction=logical_conjunction, + ) + + return new_mp + + @staticmethod + def concat_metapartitions(metapartitions, label_merger=None, metadata_merger=None): + LOGGER.debug("Concatenating metapartitions") + data = defaultdict(list) + schema = defaultdict(list) + new_metadata_version = -1 + for mp in metapartitions: + new_metadata_version = max(new_metadata_version, mp.metadata_version) + for table in mp.data: + data[table].append(mp.data[table]) + schema[table].append(mp.table_meta[table]) # Don't care about the partition_keys. If we try to merge # MetaPartitions without alignment the schemas won't match. 
partition_keys = mp.partition_keys - categoricals = [ - col - for col, dtype in data[0].items() - if pd.api.types.is_categorical_dtype(dtype) - ] - if categoricals: - data = align_categories(data, categoricals) - new_df = pd.concat(data) + new_data = {} + new_schema = {} + + for table in data: + if len(data[table]) == 1: + new_data[table] = data[table][0] + else: + categoricals = [ + col + for col, dtype in data[table][0].items() + if pd.api.types.is_categorical_dtype(dtype) + ] + data[table] = align_categories(data[table], categoricals) + new_data[table] = pd.concat(data[table]) - new_schema = validate_compatible(schema) + new_schema[table] = validate_compatible(schema[table]) new_label = MetaPartition._merge_labels(metapartitions, label_merger) + new_ds_meta = MetaPartition._merge_metadata(metapartitions, metadata_merger) new_mp = MetaPartition( label=new_label, - data=new_df, + data=new_data, + dataset_metadata=new_ds_meta, metadata_version=new_metadata_version, - schema=new_schema, + table_meta=new_schema, partition_keys=partition_keys, ) @@ -1267,10 +1592,11 @@ def delete_from_store( ) -> "MetaPartition": store = ensure_store(store) # Delete data first - store.delete(self.file) - return self.copy(file=None, data=None) + for file_key in self.files.values(): + store.delete(file_key) + return self.copy(files={}, data={}, metadata={}) - def get_parquet_metadata(self, store: StoreInput) -> pd.DataFrame: + def get_parquet_metadata(self, store: StoreInput, table_name: str) -> pd.DataFrame: """ Retrieve the parquet metadata for the MetaPartition. Especially relevant for calculating dataset statistics. @@ -1287,33 +1613,37 @@ def get_parquet_metadata(self, store: StoreInput) -> pd.DataFrame: pd.DataFrame A DataFrame with relevant parquet metadata """ + if not isinstance(table_name, str): + raise TypeError("Expecting a string for parameter `table_name`.") + store = ensure_store(store) data = {} - with store.open(self.file) as fd: # type: ignore - pq_metadata = pa.parquet.ParquetFile(fd).metadata - - data = { - "partition_label": self.label, - "serialized_size": pq_metadata.serialized_size, - "number_rows_total": pq_metadata.num_rows, - "number_row_groups": pq_metadata.num_row_groups, - "row_group_id": [], - "number_rows_per_row_group": [], - "row_group_compressed_size": [], - "row_group_uncompressed_size": [], - } - for rg_ix in range(pq_metadata.num_row_groups): - rg = pq_metadata.row_group(rg_ix) - data["row_group_id"].append(rg_ix) - data["number_rows_per_row_group"].append(rg.num_rows) - data["row_group_compressed_size"].append(rg.total_byte_size) - data["row_group_uncompressed_size"].append( - sum( - rg.column(col_ix).total_uncompressed_size - for col_ix in range(rg.num_columns) + if table_name in self.files: + with store.open(self.files[table_name]) as fd: # type: ignore + pq_metadata = pa.parquet.ParquetFile(fd).metadata + + data = { + "partition_label": self.label, + "serialized_size": pq_metadata.serialized_size, + "number_rows_total": pq_metadata.num_rows, + "number_row_groups": pq_metadata.num_row_groups, + "row_group_id": [], + "number_rows_per_row_group": [], + "row_group_compressed_size": [], + "row_group_uncompressed_size": [], + } + for rg_ix in range(pq_metadata.num_row_groups): + rg = pq_metadata.row_group(rg_ix) + data["row_group_id"].append(rg_ix) + data["number_rows_per_row_group"].append(rg.num_rows) + data["row_group_compressed_size"].append(rg.total_byte_size) + data["row_group_uncompressed_size"].append( + sum( + rg.column(col_ix).total_uncompressed_size + for col_ix in 
range(rg.num_columns) + ) ) - ) df = pd.DataFrame(data=data, columns=_METADATA_SCHEMA.keys()) df = df.astype(_METADATA_SCHEMA) @@ -1348,25 +1678,89 @@ def partition_labels_from_mps(mps: List[MetaPartition]) -> List[str]: def parse_input_to_metapartition( obj: MetaPartitionInput, - table_name: str = SINGLE_TABLE, metadata_version: Optional[int] = None, + expected_secondary_indices: Optional[InferredIndices] = False, ) -> MetaPartition: """ - Parses given user input and return a MetaPartition + Parses given user input and returns a MetaPartition + + The format specification supports multiple input modes as following: + + 1. Mode - Dictionary with partition information + + In this case, a dictionary is supplied where the keys describe the partition. + + * **label** - (optional) Unique partition label. If None is given, a UUID \ + is generated using :func:`kartothek.core.uuid.gen_uuid`. + * **data** - A dict or list of tuples. The keys represent the table name \ + and the values are the actual payload data as a pandas.DataFrame. + * **indices** - Deprecated, see the keyword argument `secondary_indices` to create indices. + A dictionary to describe the dataset indices. All \ + partition level indices are finally merged using \ + :func:`kartothek.io_components.metapartition.MetaPartition.merge_indices` \ + into a single dataset index + + Examples:: + + # A partition with explicit label, no metadata, one table and index information + input_obj = { + 'label': 'partition_label', + 'data': [('table', pd.DataFrame([{'column_1':values_1, 'column_2':values_2}]))], + 'indices': { + "column_1": { + value: ['partition_label'] + } + } + } + # If no label is given, a UUID will be generated using :func:`kartothek.core.uuid.gen_uuid` + simple_input = { + 'data': [('table', pd.DataFrame())], + } + + 2. Mode - `pandas.DataFrame` + + If only a DataFrame is provided, a UUID is generated and the dataframe is stored + for the table name `SINGLE_TABLE` + + 3. Mode - :class:`~kartothek.io_components.metapartition.MetaPartition` + + If a MetaPartition is passed directly, it is simply passed through. - The expected input is a :class:`pandas.DataFrame` or a list of - :class:`pandas.DataFrame`. + 4. Mode - List of tuples - Every element of the list will be treated as a dedicated user input and will - result in a physical file, if not specified otherwise. + The first item represents the table name and the second is the actual payload data \ + as a pandas.DataFrame. + + Example:: + + # A partition with no explicit label, no metadata and one table + input_obj = [('table', pd.DataFrame())] + + Nested MetaPartitions: + + The input may also be provided as a list to ease batch processing. The returned MetaPartition + will be nested and each list element represents a single physical partition. For details on + nested MetaPartitions, see :class:`~kartothek.io_components.metapartition.MetaPartition` Parameters ---------- obj - table_name - The table name assigned to the partitions metadata_version The kartothek dataset specification version + expected_secondary_indices + Iterable of strings containing expected columns on which indices are created. An empty iterable indicates no + indices are expected. + The default is `False`, which, indicates no checking will be done (`None` behaves the same way). + This is only used in mode "Dictionary with partition information". 
+ + Raises + ------ + ValueError + In case the given input is not understood + + Returns + ------- + MetaPartition """ if obj is None: @@ -1375,21 +1769,58 @@ def parse_input_to_metapartition( if len(obj) == 0: return MetaPartition(label=None, metadata_version=metadata_version) first_element = obj[0] + if isinstance(first_element, tuple): + data = {"data": [df] for df in obj} + return parse_input_to_metapartition( + obj=data, + metadata_version=metadata_version, + expected_secondary_indices=expected_secondary_indices, + ) mp = parse_input_to_metapartition( - obj=first_element, metadata_version=metadata_version, table_name=table_name, + obj=first_element, + metadata_version=metadata_version, + expected_secondary_indices=expected_secondary_indices, ) for mp_in in obj[1:]: mp = mp.add_metapartition( parse_input_to_metapartition( - obj=mp_in, metadata_version=metadata_version, table_name=table_name, + obj=mp_in, + metadata_version=metadata_version, + expected_secondary_indices=expected_secondary_indices, ) ) + elif isinstance(obj, dict): + if not obj.get("data"): + data = obj + elif isinstance(obj["data"], list): + data = dict(obj["data"]) + else: + data = obj["data"] + + indices = obj.get("indices", {}) + if indices: + warnings.warn( + "The explicit input of indices using the `indices` key is deprecated." + 'Use the `secondary_indices` keyword argument of "write" and "update" functions instead.', + DeprecationWarning, + ) + indices = {k: v for k, v in indices.items() if v} + _ensure_valid_indices( + mp_indices=indices, secondary_indices=expected_secondary_indices, data=data + ) + + mp = MetaPartition( + # TODO: Deterministic hash for the input? + label=obj.get("label", gen_uuid()), + data=data, + indices=indices, + metadata_version=metadata_version, + ) elif isinstance(obj, pd.DataFrame): mp = MetaPartition( label=gen_uuid(), - data=obj, + data={SINGLE_TABLE: obj}, metadata_version=metadata_version, - table_name=table_name, ) elif isinstance(obj, MetaPartition): return obj diff --git a/kartothek/io_components/read.py b/kartothek/io_components/read.py index 775b6500..b4b36c81 100644 --- a/kartothek/io_components/read.py +++ b/kartothek/io_components/read.py @@ -1,8 +1,10 @@ -from typing import Iterator, List, Optional, Set, Union, cast, overload +import warnings +from typing import Callable, Iterator, List, Optional, Set, Union, cast, overload import pandas as pd from kartothek.core.factory import DatasetFactory +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.core.typing import StoreInput from kartothek.io_components.metapartition import MetaPartition from kartothek.io_components.utils import normalize_args @@ -16,15 +18,25 @@ @overload def dispatch_metapartitions_from_factory( dataset_factory: DatasetFactory, + label_filter: Optional[Callable] = None, + concat_partitions_on_primary_index: bool = False, predicates: PredicatesType = None, + store: Optional[StoreInput] = None, dispatch_by: None = None, + dispatch_metadata: bool = False, ) -> Iterator[MetaPartition]: ... @overload def dispatch_metapartitions_from_factory( - dataset_factory: DatasetFactory, predicates: PredicatesType, dispatch_by: List[str], + dataset_factory: DatasetFactory, + label_filter: Optional[Callable], + concat_partitions_on_primary_index: bool, + predicates: PredicatesType, + store: Optional[StoreInput], + dispatch_by: List[str], + dispatch_metadata: bool, ) -> Iterator[List[MetaPartition]]: ... 
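Editorial aside (not part of the patch): the ``parse_input_to_metapartition`` contract restored above accepts several input shapes. A hedged sketch of the two most common ones, the partition dictionary and the list of ``(table, DataFrame)`` tuples; ``metadata_version=4`` is the usual kartothek spec version and ``table`` is the default single-table name assumed here::

    import pandas as pd

    from kartothek.io_components.metapartition import parse_input_to_metapartition

    df = pd.DataFrame({"column_1": [1, 2], "column_2": ["a", "b"]})

    # Mode 1: dictionary with partition information (the label is optional)
    mp1 = parse_input_to_metapartition(
        {"label": "partition_label", "data": [("table", df)]}, metadata_version=4
    )

    # Mode 4: list of (table_name, DataFrame) tuples; the label becomes a generated UUID
    mp2 = parse_input_to_metapartition([("table", df)], metadata_version=4)

    print(mp1.label, sorted(mp1.data))  # partition_label ['table']
    print(sorted(mp2.data))             # ['table']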
@@ -32,13 +44,38 @@ def dispatch_metapartitions_from_factory( @normalize_args def dispatch_metapartitions_from_factory( dataset_factory: DatasetFactory, + label_filter: Optional[Callable] = None, + concat_partitions_on_primary_index: bool = False, predicates: PredicatesType = None, + store: Optional[StoreInput] = None, dispatch_by: Optional[List[str]] = None, + dispatch_metadata: bool = False, ) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]: """ :meta private: """ + if dispatch_metadata: + + warnings.warn( + "The dispatch of metadata and index information as part of the MetaPartition instance is deprecated. " + "The future behaviour will be that this metadata is not dispatched. To set the future behaviour, " + "specifiy ``dispatch_metadata=False``", + DeprecationWarning, + ) + + if dispatch_by is not None and concat_partitions_on_primary_index: + raise ValueError( + "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, " + "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. " + "Please only provide the `dispatch_by` argument. " + ) + if concat_partitions_on_primary_index: + warnings.warn( + "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. Use `dispatch_by=dataset_factory.partition_keys` to achieve the same behavior instead.", + DeprecationWarning, + ) + dispatch_by = dataset_factory.partition_keys if dispatch_by is not None and not set(dispatch_by).issubset( set(dataset_factory.index_columns) @@ -65,6 +102,15 @@ def dispatch_metapartitions_from_factory( list(index_cols), predicates=predicates ) + if label_filter: + base_df = base_df[base_df.index.map(label_filter)] + + indices_to_dispatch = { + name: ix.unload() + for name, ix in dataset_factory.indices.items() + if isinstance(ix, ExplicitSecondaryIndex) + } + if dispatch_by is not None: base_df = cast(pd.DataFrame, base_df) @@ -87,11 +133,14 @@ def dispatch_metapartitions_from_factory( mps.append( MetaPartition.from_partition( partition=dataset_factory.partitions[label], + dataset_metadata=dataset_factory.metadata + if dispatch_metadata + else None, + indices=indices_to_dispatch if dispatch_metadata else None, metadata_version=dataset_factory.metadata_version, - schema=dataset_factory.schema, + table_meta=dataset_factory.table_meta, partition_keys=dataset_factory.partition_keys, logical_conjunction=logical_conjunction, - table_name=dataset_factory.table_name, ) ) yield mps @@ -101,26 +150,42 @@ def dispatch_metapartitions_from_factory( yield MetaPartition.from_partition( partition=part, + dataset_metadata=dataset_factory.metadata + if dispatch_metadata + else None, + indices=indices_to_dispatch if dispatch_metadata else None, metadata_version=dataset_factory.metadata_version, - schema=dataset_factory.schema, + table_meta=dataset_factory.table_meta, partition_keys=dataset_factory.partition_keys, - table_name=dataset_factory.table_name, ) def dispatch_metapartitions( dataset_uuid: str, store: StoreInput, + load_dataset_metadata: bool = True, + keep_indices: bool = True, + keep_table_meta: bool = True, + label_filter: Optional[Callable] = None, + concat_partitions_on_primary_index: bool = False, predicates: PredicatesType = None, dispatch_by: Optional[List[str]] = None, + dispatch_metadata: bool = False, ) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]: dataset_factory = DatasetFactory( dataset_uuid=dataset_uuid, store_factory=store, load_schema=True, load_all_indices=False, + 
load_dataset_metadata=load_dataset_metadata, ) return dispatch_metapartitions_from_factory( - dataset_factory=dataset_factory, predicates=predicates, dispatch_by=dispatch_by, + dataset_factory=dataset_factory, + store=None, + label_filter=label_filter, + predicates=predicates, + dispatch_by=dispatch_by, + concat_partitions_on_primary_index=concat_partitions_on_primary_index, + dispatch_metadata=dispatch_metadata, ) diff --git a/kartothek/io_components/utils.py b/kartothek/io_components/utils.py index 211f0591..6dd61324 100644 --- a/kartothek/io_components/utils.py +++ b/kartothek/io_components/utils.py @@ -4,7 +4,7 @@ import collections import inspect import logging -from typing import Dict, Iterable, List, Optional, Union, overload +from typing import Dict, Iterable, List, Optional, TypeVar, Union, overload import decorator import pandas as pd @@ -19,6 +19,9 @@ except ImportError: from typing import Literal # type: ignore +# Literal false is sentinel, see function body of `_ensure_compatible_indices` for details +InferredIndices = Union[Literal[False], List[str]] + signature = inspect.signature @@ -110,10 +113,10 @@ def _combine_metadata(dataset_metadata, append_to_list): def _ensure_compatible_indices( - dataset: Optional[DatasetMetadataBase], secondary_indices: Iterable[str], -) -> List[str]: + dataset: Optional[DatasetMetadataBase], secondary_indices: Optional[Iterable[str]], +) -> InferredIndices: if dataset: - ds_secondary_indices = sorted(dataset.secondary_indices.keys()) + ds_secondary_indices = list(dataset.secondary_indices.keys()) if secondary_indices and not set(secondary_indices).issubset( ds_secondary_indices @@ -123,17 +126,53 @@ def _ensure_compatible_indices( f"Expected: {ds_secondary_indices}\n" f"But got: {secondary_indices}" ) - return ds_secondary_indices - return sorted(secondary_indices) + else: + # We return `False` if there is no dataset in storage and `secondary_indices` is undefined + # (`secondary_indices` is normalized to `[]` by default). + # In consequence, `parse_input_to_metapartition` will not check indices at the partition level. + if secondary_indices: + return list(secondary_indices) + else: + return False + + +def _ensure_valid_indices(mp_indices, secondary_indices=None, data=None): + # TODO (Kshitij68): Behavior is closely matches `_ensure_compatible_indices`. Refactoring can prove to be helpful + if data: + for table_name in data: + for index in mp_indices.keys(): + if index not in data[table_name].columns: + raise ValueError( + f"In table {table_name}, no column corresponding to index {index}" + ) + if secondary_indices not in (False, None): + secondary_indices = set(secondary_indices) + # If the dataset has `secondary_indices` defined, then these indices will be build later so there is no need to + # ensure that they are also defined here (on a partition level). + # Hence, we just check that no new indices are defined on the partition level. 
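Editorial aside (not part of the patch): the ``kartothek/io_components/read.py`` hunk above restores ``load_dataset_metadata``, ``label_filter``, ``concat_partitions_on_primary_index`` and ``dispatch_metadata`` around the existing ``dispatch_by`` machinery. A hedged usage sketch of the restored ``dispatch_metapartitions`` entry point; it assumes a dataset named ``dataset_uuid``, partitioned on ``P``, already lives in the store, and the store URL is illustrative::

    from functools import partial

    from storefact import get_store_from_url

    from kartothek.io_components.read import dispatch_metapartitions

    # assumption: this store already holds "dataset_uuid", partitioned on "P"
    store_factory = partial(get_store_from_url, "hfs:///data/kartothek_store")

    mps = dispatch_metapartitions(
        dataset_uuid="dataset_uuid",
        store=store_factory,
        load_dataset_metadata=False,    # skip user metadata for plain reads
        predicates=[[("P", "==", 1)]],  # predicate DNF: list of AND-connected condition lists
        dispatch_by=None,               # e.g. ["P"] would yield one list of MetaPartitions per value
    )

    for mp in mps:  # lazily yields MetaPartition objects (lists when dispatch_by is set)
        print(mp.label, sorted(mp.files))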
+ if not secondary_indices.issuperset(mp_indices.keys()): + raise ValueError( + "Incorrect indices provided for dataset.\n" + f"Expected index columns: {secondary_indices}" + f"Provided index: {mp_indices}" + ) def validate_partition_keys( - dataset_uuid, store, ds_factory, default_metadata_version, partition_on, + dataset_uuid, + store, + ds_factory, + default_metadata_version, + partition_on, + load_dataset_metadata=True, ): if ds_factory or DatasetMetadata.exists(dataset_uuid, ensure_store(store)): ds_factory = _ensure_factory( - dataset_uuid=dataset_uuid, store=store, factory=ds_factory, + dataset_uuid=dataset_uuid, + store=store, + factory=ds_factory, + load_dataset_metadata=load_dataset_metadata, ) ds_metadata_version = ds_factory.metadata_version @@ -165,20 +204,7 @@ def validate_partition_keys( _NORMALIZE_ARGS = _NORMALIZE_ARGS_LIST + ["store", "dispatch_by"] - -@overload -def normalize_arg( - arg_name: Literal[ - "partition_on", - "delete_scope", - "secondary_indices", - "bucket_by", - "sort_partitions_by", - "dispatch_by", - ], - old_value: None, -) -> None: - ... +T = TypeVar("T") @overload @@ -191,8 +217,8 @@ def normalize_arg( "sort_partitions_by", "dispatch_by", ], - old_value: Union[str, List[str]], -) -> List[str]: + old_value: Optional[Union[T, List[T]]], +) -> List[T]: ... @@ -380,6 +406,32 @@ def sort_values_categorical( return df.sort_values(by=columns).reset_index(drop=True) +def check_single_table_dataset(dataset, expected_table=None): + """ + Raise if the given dataset is not a single-table dataset. + + Parameters + ---------- + dataset: kartothek.core.dataset.DatasetMetadata + The dataset to be validated + expected_table: Optional[str] + Ensure that the table in the dataset is the same as the given one. + """ + + if len(dataset.tables) > 1: + raise TypeError( + "Expected single table dataset but found dataset with tables: `{}`".format( + dataset.tables + ) + ) + if expected_table and dataset.tables != [expected_table]: + raise TypeError( + "Unexpected table in dataset:\nFound:\t{}\nExpected:\t{}".format( + dataset.tables, expected_table + ) + ) + + def raise_if_indices_overlap(partition_on, secondary_indices): partition_secondary_overlap = set(partition_on) & set(secondary_indices) if partition_secondary_overlap: diff --git a/kartothek/io_components/write.py b/kartothek/io_components/write.py index 874766a0..8aebdc0d 100644 --- a/kartothek/io_components/write.py +++ b/kartothek/io_components/write.py @@ -1,18 +1,19 @@ +from collections import defaultdict from functools import partial -from typing import Dict, Iterable, List, Optional, cast +from typing import Dict, Optional, Sequence, Union, cast -from simplekv import KeyValueStore +import pandas as pd from kartothek.core import naming from kartothek.core.common_metadata import ( - SchemaWrapper, read_schema_metadata, store_schema_metadata, validate_compatible, + validate_shared_columns, ) from kartothek.core.dataset import DatasetMetadataBuilder -from kartothek.core.factory import DatasetFactory from kartothek.core.index import ExplicitSecondaryIndex, IndexBase, PartitionIndex +from kartothek.core.partition import Partition from kartothek.core.typing import StoreFactory, StoreInput from kartothek.core.utils import ensure_store from kartothek.io_components.metapartition import ( @@ -23,6 +24,7 @@ partition_labels_from_mps, ) from kartothek.io_components.utils import ( + InferredIndices, combine_metadata, extract_duplicates, sort_values_categorical, @@ -34,24 +36,32 @@ def write_partition( partition_df: 
MetaPartitionInput, - secondary_indices: List[str], - sort_partitions_by: List[str], + secondary_indices: Optional[InferredIndices], + sort_partitions_by: Optional[Union[str, Sequence[str]]], dataset_uuid: str, - partition_on: List[str], + partition_on: Optional[Union[str, Sequence[str]]], store_factory: StoreFactory, df_serializer: Optional[DataFrameSerializer], metadata_version: int, - dataset_table_name: str = SINGLE_TABLE, + dataset_table_name: Optional[str] = None, ) -> MetaPartition: """ Write a dataframe to store, performing all necessary preprocessing tasks like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order. """ store = ensure_store(store_factory) - + parse_input: MetaPartitionInput + if isinstance(partition_df, pd.DataFrame) and dataset_table_name: + parse_input = [{"data": {dataset_table_name: partition_df}}] + else: + parse_input = partition_df + # delete reference to enable release after partition_on; before index build + del partition_df # I don't have access to the group values mps = parse_input_to_metapartition( - partition_df, metadata_version=metadata_version, table_name=dataset_table_name, + parse_input, + metadata_version=metadata_version, + expected_secondary_indices=secondary_indices, ) if sort_partitions_by: mps = mps.apply(partial(sort_values_categorical, columns=sort_partitions_by)) @@ -87,39 +97,49 @@ def persist_indices( return output_filenames -def persist_common_metadata( - schemas: Iterable[SchemaWrapper], - update_dataset: Optional[DatasetFactory], - store: KeyValueStore, - dataset_uuid: str, - table_name: str, -): - - if not schemas: - return None - schemas_set = set(schemas) - del schemas +def persist_common_metadata(partition_list, update_dataset, store, dataset_uuid): + # hash the schemas for quick equality check with possible false negatives + # (e.g. 
other pandas version or null schemas) + tm_dct = defaultdict(set) + for mp in partition_list: + for tab, tm in mp.table_meta.items(): + tm_dct[tab].add(tm) if update_dataset: - schemas_set.add( - read_schema_metadata( - dataset_uuid=dataset_uuid, store=store, table=table_name + if set(tm_dct.keys()) and set(update_dataset.tables) != set(tm_dct.keys()): + raise ValueError( + ( + "Input partitions for update have different tables than dataset:\n" + "Input partition tables: {}\n" + "Tables of existing dataset: {}" + ).format(set(tm_dct.keys()), update_dataset.tables) + ) + for table in update_dataset.tables: + tm_dct[table].add( + read_schema_metadata( + dataset_uuid=dataset_uuid, store=store, table=table + ) ) - ) - schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin)) + result = {} - try: - result = validate_compatible(schemas_sorted) - except ValueError as e: - raise ValueError( - "Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}".format( - dataset_uuid=dataset_uuid, e=e + # sort tables and schemas to have reproducible error messages + for table in sorted(tm_dct.keys()): + schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin)) + try: + result[table] = validate_compatible(schemas) + except ValueError as e: + raise ValueError( + "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}".format( + table=table, dataset_uuid=dataset_uuid, e=e + ) ) - ) - if result: + + validate_shared_columns(list(result.values())) + + for table, schema in result.items(): store_schema_metadata( - schema=result, dataset_uuid=dataset_uuid, store=store, table=table_name + schema=schema, dataset_uuid=dataset_uuid, store=store, table=table ) return result @@ -136,20 +156,16 @@ def store_dataset_from_partitions( ): store = ensure_store(store) - schemas = set() if update_dataset: dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset) metadata_version = dataset_builder.metadata_version - table_name = update_dataset.table_name - schemas.add(update_dataset.schema) else: mp = next(iter(partition_list), None) - if mp is None: raise ValueError( "Cannot store empty datasets, partition_list must not be empty if in store mode." ) - table_name = mp.table_name + metadata_version = mp.metadata_version dataset_builder = DatasetMetadataBuilder( uuid=dataset_uuid, @@ -157,25 +173,16 @@ def store_dataset_from_partitions( partition_keys=mp.partition_keys, ) - for mp in partition_list: - if mp.schema: - schemas.add(mp.schema) - - dataset_builder.schema = persist_common_metadata( - schemas=schemas, - update_dataset=update_dataset, - store=store, - dataset_uuid=dataset_uuid, - table_name=table_name, + dataset_builder.explicit_partitions = True + + dataset_builder.table_meta = persist_common_metadata( + partition_list, update_dataset, store, dataset_uuid ) # We can only check for non unique partition labels here and if they occur we will # fail hard. The resulting dataset may be corrupted or file may be left in the store # without dataset metadata partition_labels = partition_labels_from_mps(partition_list) - - # This could be safely removed since we do not allow to set this by the user - # anymore. 
It has implications on tests if mocks are used non_unique_labels = extract_duplicates(partition_labels) if non_unique_labels: @@ -192,7 +199,7 @@ def store_dataset_from_partitions( metadata_merger = combine_metadata dataset_builder = update_metadata( - dataset_builder, metadata_merger, dataset_metadata + dataset_builder, metadata_merger, partition_list, dataset_metadata ) dataset_builder = update_partitions( dataset_builder, partition_list, remove_partitions @@ -214,9 +221,10 @@ def store_dataset_from_partitions( return dataset -def update_metadata(dataset_builder, metadata_merger, dataset_metadata): +def update_metadata(dataset_builder, metadata_merger, add_partitions, dataset_metadata): metadata_list = [dataset_builder.metadata] + metadata_list += [mp.dataset_metadata for mp in add_partitions] new_dataset_metadata = metadata_merger(metadata_list) dataset_metadata = dataset_metadata or {} @@ -233,10 +241,13 @@ def update_metadata(dataset_builder, metadata_merger, dataset_metadata): def update_partitions(dataset_builder, add_partitions, remove_partitions): for mp in add_partitions: - for mmp in mp: - if mmp.label is not None: - dataset_builder.explicit_partitions = True - dataset_builder.add_partition(mmp.label, mmp.partition) + for sub_mp_dct in mp.metapartitions: + # label is None in case of an empty partition + if sub_mp_dct["label"] is not None: + partition = Partition( + label=sub_mp_dct["label"], files=sub_mp_dct["files"] + ) + dataset_builder.add_partition(sub_mp_dct["label"], partition) for partition_name in remove_partitions: del dataset_builder.partitions[partition_name] diff --git a/kartothek/utils/ktk_adapters.py b/kartothek/utils/ktk_adapters.py index 38246119..3f6657f2 100644 --- a/kartothek/utils/ktk_adapters.py +++ b/kartothek/utils/ktk_adapters.py @@ -14,18 +14,37 @@ METADATA_FORMAT_JSON, TABLE_METADATA_FILE, ) +from kartothek.io_components.metapartition import SINGLE_TABLE from kartothek.serialization._io_buffer import BlockBuffer from kartothek.utils.converters import converter_str __all__ = ( "get_dataset_columns", "get_dataset_keys", + "get_dataset_schema", "get_partition_dataframe", "get_physical_partition_stats", "metadata_factory_from_dataset", ) +def get_dataset_schema(dataset): + """ + Get schema from a Kartothek_Cube-compatible Kartothek dataset. + + Parameters + ---------- + dataset: kartothek.core.dataset.DatasetMetadata + Dataset to get the schema from. + + Returns + ------- + schema: pyarrow.Schema + Schema data. + """ + return dataset.table_meta[SINGLE_TABLE] + + def get_dataset_columns(dataset): """ Get columns present in a Kartothek_Cube-compatible Kartothek dataset. @@ -42,7 +61,7 @@ def get_dataset_columns(dataset): """ return { converter_str(col) - for col in dataset.schema.names + for col in get_dataset_schema(dataset).names if not col.startswith("__") and col != "KLEE_TS" } @@ -130,6 +149,9 @@ def get_physical_partition_stats(metapartitions, store): """ Get statistics for partition. + .. hint:: + To get the metapartitions pre-aligned, use ``concat_partitions_on_primary_index=True`` during dispatch. 
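Editorial aside (not part of the patch): the adapter hunk above reintroduces ``get_dataset_schema`` next to ``get_dataset_columns`` and moves the stats helper to the multi-file ``mp.files`` layout. A hedged sketch of how the schema/column helpers are typically used on a single-table, cube-compatible dataset; the dataset name and store URL below are assumptions::

    from functools import partial

    from storefact import get_store_from_url

    from kartothek.core.dataset import DatasetMetadata
    from kartothek.utils.ktk_adapters import get_dataset_columns, get_dataset_schema

    # assumption: this store holds "my_dataset" with a single table named "table"
    store_factory = partial(get_store_from_url, "hfs:///data/kartothek_store")

    dataset = DatasetMetadata.load_from_store("my_dataset", store_factory())

    schema = get_dataset_schema(dataset)    # schema of the SINGLE_TABLE table
    columns = get_dataset_columns(dataset)  # user columns; technical "__..." columns are stripped
    print(schema.names, sorted(columns))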
+ Parameters ---------- metapartitions: Iterable[kartothek.io_components.metapartition.MetaPartition] @@ -149,14 +171,15 @@ def get_physical_partition_stats(metapartitions, store): blobsize = 0 rows = 0 for mp in metapartitions: - files += 1 - fp = BlockBuffer(store.open(mp.file)) - try: - fp_parquet = pq.ParquetFile(fp) - rows += fp_parquet.metadata.num_rows - blobsize += fp.size - finally: - fp.close() + for f in mp.files.values(): + files += 1 + fp = BlockBuffer(store.open(f)) + try: + fp_parquet = pq.ParquetFile(fp) + rows += fp_parquet.metadata.num_rows + blobsize += fp.size + finally: + fp.close() return {"blobsize": blobsize, "files": files, "partitions": 1, "rows": rows} diff --git a/tests/api/test_discover.py b/tests/api/test_discover.py index 7320a71a..c3fd9f68 100644 --- a/tests/api/test_discover.py +++ b/tests/api/test_discover.py @@ -25,7 +25,7 @@ store_dataframes_as_dataset, update_dataset_from_dataframes, ) -from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.metapartition import SINGLE_TABLE, MetaPartition @pytest.fixture @@ -56,7 +56,9 @@ def store_data( partition_on = cube.partition_columns if isinstance(df, pd.DataFrame): - mp = MetaPartition(label=gen_uuid(), data=df, metadata_version=metadata_version) + mp = MetaPartition( + label=gen_uuid(), data={SINGLE_TABLE: df}, metadata_version=metadata_version + ) indices_to_build = set(cube.index_columns) & set(df.columns) if name == cube.seed_dataset: @@ -440,6 +442,47 @@ def test_raises_wrong_metadata_version(self, cube, function_store): partition_on=None, ) + def test_raises_wrong_table(self, cube, function_store): + store_data( + cube=cube, + function_store=function_store, + df=MetaPartition( + label=gen_uuid(), + data={"foo": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})}, + metadata_version=KTK_CUBE_METADATA_VERSION, + ), + name=cube.seed_dataset, + ) + with pytest.raises(ValueError) as exc: + discover_datasets(cube, function_store) + assert ( + str(exc.value) + == "Invalid datasets because table is wrong. Expected table: myseed (foo)" + ) + + def test_raises_extra_table(self, cube, function_store): + store_data( + cube=cube, + function_store=function_store, + df=MetaPartition( + label=gen_uuid(), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "y": [0], "p": [0], "q": [0]} + ), + "foo": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}), + }, + metadata_version=KTK_CUBE_METADATA_VERSION, + ).build_indices(["x", "y"]), + name=cube.seed_dataset, + ) + with pytest.raises(ValueError) as exc: + discover_datasets(cube, function_store) + assert ( + str(exc.value) + == "Invalid datasets because table is wrong. 
Expected table: myseed (foo, table)" + ) + def test_raises_dtypes(self, cube, function_store): store_data( cube=cube, @@ -498,7 +541,7 @@ def test_raises_missing_dimension_columns(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"x": [0], "p": [0], "q": [0]}), + data={SINGLE_TABLE: pd.DataFrame({"x": [0], "p": [0], "q": [0]})}, metadata_version=KTK_CUBE_METADATA_VERSION, ).build_indices(["x"]), name=cube.seed_dataset, @@ -535,7 +578,9 @@ def test_raises_dimension_index_missing(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}), + data={ + SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ), name=cube.seed_dataset, @@ -553,7 +598,9 @@ def test_raises_other_index_missing(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}), + data={ + SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ).build_indices(["x", "y"]), name=cube.seed_dataset, @@ -563,9 +610,11 @@ def test_raises_other_index_missing(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame( - {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]} - ), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ), name="enrich", @@ -584,9 +633,11 @@ def test_accepts_addional_indices(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame( - {"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]} - ), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ).build_indices(["x", "y", "v1"]), name=cube.seed_dataset, @@ -596,16 +647,18 @@ def test_accepts_addional_indices(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame( - { - "x": [0], - "y": [0], - "p": [0], - "q": [0], - "i1": [1337], - "v2": [42], - } - ), + data={ + SINGLE_TABLE: pd.DataFrame( + { + "x": [0], + "y": [0], + "p": [0], + "q": [0], + "i1": [1337], + "v2": [42], + } + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ).build_indices(["i1", "x", "v2"]), name="enrich", @@ -627,7 +680,11 @@ def test_accepts_partition_index_for_index(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"x": [0], "y": [0], "i1": [1337], "v2": [42]}), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "y": [0], "i1": [1337], "v2": [42]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ), name="enrich", @@ -665,7 +722,11 @@ def test_accepts_projected_datasets(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "y": [0], "p": [0], "q": [0]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ).build_indices(["x", "y"]), name=cube.seed_dataset, @@ -675,7 +736,11 @@ def test_accepts_projected_datasets(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - 
data=pd.DataFrame({"x": [0], "p": [0], "q": [0], "v1": [42]}), + data={ + SINGLE_TABLE: pd.DataFrame( + {"x": [0], "p": [0], "q": [0], "v1": [42]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ), name="x", @@ -685,7 +750,11 @@ def test_accepts_projected_datasets(self, cube, function_store): function_store=function_store, df=MetaPartition( label=gen_uuid(), - data=pd.DataFrame({"y": [0], "p": [0], "q": [0], "v2": [42]}), + data={ + SINGLE_TABLE: pd.DataFrame( + {"y": [0], "p": [0], "q": [0], "v2": [42]} + ) + }, metadata_version=KTK_CUBE_METADATA_VERSION, ), name="y", diff --git a/tests/conftest.py b/tests/conftest.py index be0d9c19..21b58b19 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,7 @@ get_dataframe_not_nested, ) from kartothek.io_components.metapartition import ( # noqa: E402 + SINGLE_TABLE, MetaPartition, gen_uuid, parse_input_to_metapartition, @@ -206,19 +207,15 @@ def patched__verify_metadata_version(metadata_version): # Mock `kartothek.io_components.metapartition.parse_input_to_metapartition` def patched__parse_input_to_metapartition( - obj, table_name, metadata_version=None, *args, **kwargs + obj, metadata_version=None, *args, **kwargs ): if metadata_version == mock_metadata_version: + table, data = obj # Tuple return MetaPartition( - label=gen_uuid(), - data=obj, - metadata_version=metadata_version, - table_name=table_name, + label=gen_uuid(), data={table: data}, metadata_version=metadata_version ) try: - return parse_input_to_metapartition( - obj, table_name, metadata_version, *args, **kwargs - ) + return parse_input_to_metapartition(obj, metadata_version, *args, **kwargs) except ValueError as e: # Raise a "custom" error to distinguish this error from the error raised # by `parse_input_to_metapartition` when the object has not previously @@ -293,8 +290,13 @@ def _get_meta_partitions_with_dataframe(metadata_version): ] ) ) - mp = MetaPartition(label="cluster_1", data=df, metadata_version=metadata_version) - df_2 = pd.DataFrame( + df_2 = pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])])) + mp = MetaPartition( + label="cluster_1", + data={SINGLE_TABLE: df, "helper": df_2}, + metadata_version=metadata_version, + ) + df_3 = pd.DataFrame( OrderedDict( [ ("P", [2]), @@ -304,7 +306,12 @@ def _get_meta_partitions_with_dataframe(metadata_version): ] ) ) - mp2 = MetaPartition(label="cluster_2", data=df_2, metadata_version=metadata_version) + df_4 = pd.DataFrame(OrderedDict([("P", [2]), ("info", ["b"])])) + mp2 = MetaPartition( + label="cluster_2", + data={SINGLE_TABLE: df_3, "helper": df_4}, + metadata_version=metadata_version, + ) return [mp, mp2] @@ -342,24 +349,26 @@ def meta_partitions_evaluation_dataframe(metadata_version): df = pd.DataFrame( OrderedDict([("P", [1]), ("L", [1]), ("HORIZON", [1]), ("PRED", [10])]) ) - mp = MetaPartition(label="cluster_1_1", data=df, metadata_version=metadata_version) + mp = MetaPartition( + label="cluster_1_1", data={"PRED": df}, metadata_version=metadata_version + ) df_2 = pd.DataFrame( OrderedDict([("P", [1]), ("L", [1]), ("HORIZON", [2]), ("PRED", [20])]) ) mp2 = MetaPartition( - label="cluster_1_2", data=df_2, metadata_version=metadata_version + label="cluster_1_2", data={"PRED": df_2}, metadata_version=metadata_version ) df_3 = pd.DataFrame( OrderedDict([("P", [2]), ("L", [2]), ("HORIZON", [1]), ("PRED", [10])]) ) mp3 = MetaPartition( - label="cluster_2_1", data=df_3, metadata_version=metadata_version + label="cluster_2_1", data={"PRED": df_3}, metadata_version=metadata_version ) df_4 = pd.DataFrame( 
OrderedDict([("P", [2]), ("L", [2]), ("HORIZON", [2]), ("PRED", [20])]) ) mp4 = MetaPartition( - label="cluster_2_2", data=df_4, metadata_version=metadata_version + label="cluster_2_2", data={"PRED": df_4}, metadata_version=metadata_version ) return [mp, mp2, mp3, mp4] @@ -367,7 +376,9 @@ def meta_partitions_evaluation_dataframe(metadata_version): def _store_metapartitions(meta_partitions_dataframe, store): result = [] for mp in meta_partitions_dataframe: - mp = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid") + mp = mp.store_dataframes( + store=store, dataset_uuid="dataset_uuid", store_metadata=True + ) result.append(mp) return result @@ -449,6 +460,7 @@ def dataset_factory(dataset, store_session_factory): store_factory=store_session_factory, load_schema=True, load_all_indices=False, + load_dataset_metadata=True, ) @@ -479,6 +491,7 @@ def dataset_partition_keys_factory(dataset_partition_keys, store_session_factory store_factory=store_session_factory, load_schema=True, load_all_indices=False, + load_dataset_metadata=True, ) @@ -513,6 +526,7 @@ def dataset_with_index_factory(dataset_with_index, store_session_factory): store_factory=store_session_factory, load_schema=True, load_all_indices=False, + load_dataset_metadata=True, ) diff --git a/tests/core/cube/test_constants.py b/tests/core/cube/test_constants.py index e2fcf174..e067318c 100644 --- a/tests/core/cube/test_constants.py +++ b/tests/core/cube/test_constants.py @@ -1,6 +1,6 @@ -from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPARATOR +from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPERATOR from kartothek.core.dataset import _validate_uuid def test_uuid_seperator_valid(): - assert _validate_uuid(KTK_CUBE_UUID_SEPARATOR) + assert _validate_uuid(KTK_CUBE_UUID_SEPERATOR) diff --git a/tests/core/test_builder.py b/tests/core/test_builder.py index 0c511458..57501b8f 100644 --- a/tests/core/test_builder.py +++ b/tests/core/test_builder.py @@ -75,7 +75,10 @@ def test_builder_full(metadata_version, frozen_time): "dataset_metadata_version": metadata_version, "partitions": { "run_id=1/L=1/P=1/part_1": { - "files": {"core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet"} + "files": { + "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet", + "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet", + } } }, "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO}, @@ -94,7 +97,10 @@ def test_builder_full(metadata_version, frozen_time): ) part_2 = Partition( label="run_id=1/L=1/P=1/part_1", - files={"core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet"}, + files={ + "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet", + "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet", + }, ) builder.add_partition("run_id=1/L=1/P=1/part_1", part_2) builder.add_metadata("key", "value") diff --git a/tests/core/test_dataset_dyn_part.py b/tests/core/test_dataset_dyn_part.py index a4dbb4c6..c33e395e 100644 --- a/tests/core/test_dataset_dyn_part.py +++ b/tests/core/test_dataset_dyn_part.py @@ -67,13 +67,25 @@ def test_dynamic_partitions(store): [("location", "L-1")], "{}.parquet".format(partition_suffix), ) + partition0_ext = create_partition_key( + dataset_uuid, + "extension", + [("location", "L-0")], + "{}.parquet".format(partition_suffix), + ) + partition1_ext = create_partition_key( + dataset_uuid, + "extension", + [("location", "L-1")], + "{}.parquet".format(partition_suffix), + ) metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid} expected_partitions = { "location=L-0/{}".format(partition_suffix): { - 
"files": {"core": partition0_core} + "files": {"core": partition0_core, "extension": partition0_ext} }, "location=L-1/{}".format(partition_suffix): { - "files": {"core": partition1_core} + "files": {"core": partition1_core, "extension": partition1_ext} }, } expected_indices = { @@ -90,6 +102,8 @@ def test_dynamic_partitions(store): ) store.put(partition0_core, b"test") store.put(partition1_core, b"test") + store.put(partition0_ext, b"test") + store.put(partition1_ext, b"test") store_schema_metadata( make_meta( pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}), @@ -111,8 +125,20 @@ def test_dynamic_partitions(store): ), origin="core", ) + extension_schema = make_meta( + pd.DataFrame( + { + "column_77": pd.Series([1], dtype=int), + "column_78": pd.Series([1], dtype=int), + "location": pd.Series(["str"]), + } + ), + origin="extension", + ) store_schema_metadata(core_schema, dataset_uuid, store, "core") - + store_schema_metadata(extension_schema, dataset_uuid, store, "extension") + dmd = DatasetMetadata.load_from_store(dataset_uuid, store) + # reload metadata to use table metadata dmd = DatasetMetadata.load_from_store(dataset_uuid, store) dmd = dmd.load_partition_indices() diff --git a/tests/core/test_dataset_explicit_part.py b/tests/core/test_dataset_explicit_part.py index 94eccf9d..fa895e95 100644 --- a/tests/core/test_dataset_explicit_part.py +++ b/tests/core/test_dataset_explicit_part.py @@ -85,24 +85,6 @@ def test_roundtrip_json(metadata_version): assert expected == result -def test_raise_multitable(metadata_version): - expected = { - "dataset_metadata_version": metadata_version, - "dataset_uuid": "uuid", - "metadata": {}, - "partitions": { - "part_1": {"files": {"tableA": "file.parquet", "tableB": "file.parquet"}} - }, - "partition_keys": [], - } - - with pytest.raises( - RuntimeError, - match=r"Dataset uuid has tables.*but read support for multi tabled dataset was dropped with kartothek 4\.0\.", - ): - DatasetMetadata.from_dict(expected) - - def test_roundtrip_msgpack(): expected = { "dataset_metadata_version": 4, @@ -154,19 +136,29 @@ def test_read_table_meta(store): "dataset_uuid": "dataset_uuid", "partitions": { "location_id=1/part_1": { - "files": {"table1": "dataset_uuid/table1/location_id=1/part_1.parquet"} + "files": { + "table1": "dataset_uuid/table1/location_id=1/part_1.parquet", + "table2": "dataset_uuid/table2/location_id=1/part_1.parquet", + } } }, } df1 = pd.DataFrame( {"location_id": pd.Series([1], dtype=int), "x": pd.Series([True], dtype=bool)} ) + df2 = pd.DataFrame( + {"location_id": pd.Series([1], dtype=int), "y": pd.Series([1.0], dtype=float)} + ) schema1 = make_meta(df1, origin="1") + schema2 = make_meta(df2, origin="2") store_schema_metadata(schema1, "dataset_uuid", store, "table1") + store_schema_metadata(schema2, "dataset_uuid", store, "table2") dmd = DatasetMetadata.load_from_dict(meta_dct, store) - assert dmd.schema == schema1 + actual = dmd.table_meta + expected = {"table1": schema1, "table2": schema2} + assert actual == expected def test_load_indices_embedded(metadata_version): @@ -214,7 +206,7 @@ def test_load_all_indices(store, metadata_version): }, } dmd = DatasetMetadata.from_dict(meta_dct) - dmd.schema = make_meta( + dmd.table_meta["core_data"] = make_meta( pd.DataFrame({"location_id": pd.Series([1], dtype=int)}), origin="core" ) diff --git a/tests/core/test_docs.py b/tests/core/test_docs.py index 07f499b2..81c22a5d 100644 --- a/tests/core/test_docs.py +++ b/tests/core/test_docs.py @@ -17,8 +17,10 @@ ) from kartothek.io.dask.delayed import ( 
delete_dataset__delayed, + merge_datasets_as_delayed, read_dataset_as_delayed, read_dataset_as_delayed_metapartitions, + read_table_as_delayed, store_delayed_as_dataset, update_dataset_from_delayed, ) @@ -54,8 +56,10 @@ store_dataset_from_ddf, update_dataset_from_ddf, delete_dataset__delayed, + merge_datasets_as_delayed, read_dataset_as_delayed_metapartitions, read_dataset_as_delayed, + read_table_as_delayed, update_dataset_from_delayed, store_delayed_as_dataset, delete_dataset, @@ -89,12 +93,7 @@ def test_docs(function): f"Wrong or missing docstrings for parameters {valid_docs[False]}.\n\n{docstrings}" ) - sorted_arguments = sorted(arguments) - args_in_docs = [argument in docstrings for argument in sorted_arguments] - - assert all(args_in_docs), [ - sorted_arguments[ix] for ix, val in enumerate(args_in_docs) if val is False - ] + assert all([argument in docstrings for argument in arguments]) def test_docs_duplicity(): diff --git a/tests/io/dask/bag/test_read.py b/tests/io/dask/bag/test_read.py index 9083e2a6..23d9af07 100644 --- a/tests/io/dask/bag/test_read.py +++ b/tests/io/dask/bag/test_read.py @@ -12,7 +12,7 @@ from kartothek.io.testing.read import * # noqa -@pytest.fixture(params=["dataframe"]) +@pytest.fixture(params=["dataframe", "metapartition"]) def output_type(request): return request.param @@ -47,7 +47,8 @@ def test_read_dataset_as_dataframes_partition_size(store_factory, metadata_versi cluster4 = pd.DataFrame( {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]} ) - partitions = [cluster1, cluster2, cluster3, cluster4] + clusters = [cluster1, cluster2, cluster3, cluster4] + partitions = [{"data": [("data", c)]} for c in clusters] store_dataframes_as_dataset__iter( df_generator=partitions, diff --git a/tests/io/dask/dataframe/test_read.py b/tests/io/dask/dataframe/test_read.py index 176e4b7c..a6807730 100644 --- a/tests/io/dask/dataframe/test_read.py +++ b/tests/io/dask/dataframe/test_read.py @@ -32,7 +32,8 @@ def _read_as_ddf( **kwargs, ): table = tables or SINGLE_TABLE - + if categoricals: + categoricals = categoricals[table] ddf = read_dataset_as_ddf( dataset_uuid=dataset_uuid, store=store, @@ -84,6 +85,7 @@ def test_load_dataframe_categoricals_with_index(dataset_with_index_factory): bound_load_dataframes=func, use_categoricals=True, output_type="table", + label_filter=None, dates_as_object=False, ) diff --git a/tests/io/dask/dataframe/test_shuffle.py b/tests/io/dask/dataframe/test_shuffle.py index 9c949299..2b281722 100644 --- a/tests/io/dask/dataframe/test_shuffle.py +++ b/tests/io/dask/dataframe/test_shuffle.py @@ -183,7 +183,7 @@ def test_update_shuffle_buckets( range(unique_secondaries) ) - assert set(dataset.schema.names) == { + assert set(dataset.table_meta["core"].names) == { "primary", "secondary", "sorted_column", @@ -197,8 +197,9 @@ def test_update_shuffle_buckets( assert not ind_df.duplicated().any() - for df in read_dataset_as_dataframes__iterator( + for data_dct in read_dataset_as_dataframes__iterator( dataset_uuid=dataset.uuid, store=store_factory ): + df = data_dct["core"] assert len(df.primary.unique()) == 1 assert df.sorted_column.is_monotonic diff --git a/tests/io/dask/dataframe/test_stats.py b/tests/io/dask/dataframe/test_stats.py index 751c6cde..0a69044e 100644 --- a/tests/io/dask/dataframe/test_stats.py +++ b/tests/io/dask/dataframe/test_stats.py @@ -6,7 +6,8 @@ store_dataframes_as_dataset, update_dataset_from_dataframes, ) -from kartothek.io_components.metapartition import _METADATA_SCHEMA +from 
kartothek.io_components.metapartition import _METADATA_SCHEMA, MetaPartition +from kartothek.io_components.write import store_dataset_from_partitions from kartothek.serialization import ParquetSerializer @@ -14,6 +15,7 @@ def test_collect_dataset_metadata(store_session_factory, dataset): df_stats = collect_dataset_metadata( store=store_session_factory, dataset_uuid="dataset_uuid", + table_name="table", predicates=None, frac=1, ).compute() @@ -47,6 +49,7 @@ def test_collect_dataset_metadata_predicates(store_session_factory, dataset): df_stats = collect_dataset_metadata( store=store_session_factory, dataset_uuid="dataset_uuid", + table_name="table", predicates=predicates, frac=1, ).compute() @@ -85,7 +88,11 @@ def test_collect_dataset_metadata_predicates_on_index(store_factory): predicates = [[("L", "==", "b")]] df_stats = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid", predicates=predicates, frac=1, + store=store_factory, + dataset_uuid="dataset_uuid", + table_name="table", + predicates=predicates, + frac=1, ).compute() assert "L=b" in df_stats["partition_label"].values[0] @@ -129,7 +136,11 @@ def test_collect_dataset_metadata_predicates_row_group_size(store_factory): predicates = [[("L", "==", "a")]] df_stats = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid", predicates=predicates, frac=1, + store=store_factory, + dataset_uuid="dataset_uuid", + table_name="table", + predicates=predicates, + frac=1, ).compute() for part_label in df_stats["partition_label"]: @@ -160,7 +171,10 @@ def test_collect_dataset_metadata_predicates_row_group_size(store_factory): def test_collect_dataset_metadata_frac_smoke(store_session_factory, dataset): df_stats = collect_dataset_metadata( - store=store_session_factory, dataset_uuid="dataset_uuid", frac=0.8, + store=store_session_factory, + dataset_uuid="dataset_uuid", + table_name="table", + frac=0.8, ).compute() columns = { "partition_label", @@ -176,13 +190,28 @@ def test_collect_dataset_metadata_frac_smoke(store_session_factory, dataset): assert set(df_stats.columns) == columns +def test_collect_dataset_metadata_empty_dataset_mp(store_factory): + mp = MetaPartition(label="cluster_1") + store_dataset_from_partitions( + partition_list=[mp], store=store_factory, dataset_uuid="dataset_uuid" + ) + + df_stats = collect_dataset_metadata( + store=store_factory, dataset_uuid="dataset_uuid", table_name="table" + ).compute() + + expected = pd.DataFrame(columns=_METADATA_SCHEMA.keys()) + expected = expected.astype(_METADATA_SCHEMA) + pd.testing.assert_frame_equal(expected, df_stats, check_index_type=False) + + def test_collect_dataset_metadata_empty_dataset(store_factory): df = pd.DataFrame(columns=["A", "b"], index=pd.RangeIndex(start=0, stop=0)) store_dataframes_as_dataset( store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"] ) df_stats = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid" + store=store_factory, dataset_uuid="dataset_uuid", table_name="table", ).compute() expected = pd.DataFrame(columns=_METADATA_SCHEMA.keys()) expected = expected.astype(_METADATA_SCHEMA) @@ -196,7 +225,7 @@ def test_collect_dataset_metadata_concat(store_factory): store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"] ) df_stats1 = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid", + store=store_factory, dataset_uuid="dataset_uuid", table_name="table", ).compute() # Remove all partitions of the dataset @@ -205,7 +234,7 @@ def 
test_collect_dataset_metadata_concat(store_factory): ) df_stats2 = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid", + store=store_factory, dataset_uuid="dataset_uuid", table_name="table", ).compute() pd.concat([df_stats1, df_stats2]) @@ -221,7 +250,7 @@ def test_collect_dataset_metadata_delete_dataset(store_factory): ) df_stats = collect_dataset_metadata( - store=store_factory, dataset_uuid="dataset_uuid", + store=store_factory, dataset_uuid="dataset_uuid", table_name="table", ).compute() expected = pd.DataFrame(columns=_METADATA_SCHEMA) expected = expected.astype(_METADATA_SCHEMA) @@ -257,13 +286,59 @@ def test_collect_dataset_metadata_at_least_one_partition(store_factory): assert len(df_stats) == 1 +def test_collect_dataset_metadata_table_without_partition(store_factory): + """ + df2 doesn't have files for all partition (specifically `A==2`). + Make sure that we still collect the right metadata + """ + df1 = pd.DataFrame(data={"A": [1, 1, 2, 2], "b": [1, 1, 2, 2]}) + df2 = pd.DataFrame(data={"A": [1, 1], "b": [1, 1]}) + + store_dataframes_as_dataset( + store=store_factory, + dataset_uuid="dataset_uuid", + dfs=[{"table1": df1, "table2": df2}], + partition_on=["A"], + ) + + df_stats = collect_dataset_metadata( + store=store_factory, dataset_uuid="dataset_uuid", table_name="table2", + ).compute() + actual = df_stats.drop( + columns=[ + "partition_label", + "row_group_compressed_size", + "row_group_uncompressed_size", + "serialized_size", + ], + axis=1, + ) + expected = pd.DataFrame( + data={ + "row_group_id": [0], + "number_rows_total": [2], + "number_row_groups": [1], + "number_rows_per_row_group": [2], + } + ) + pd.testing.assert_frame_equal(actual, expected) + assert len(df_stats) == 1 + assert df_stats.iloc[0]["partition_label"].startswith("A=1/") + + def test_collect_dataset_metadata_invalid_frac(store_session_factory, dataset): with pytest.raises(ValueError, match="Invalid value for parameter `frac`"): collect_dataset_metadata( - store=store_session_factory, dataset_uuid="dataset_uuid", frac=1.1, + store=store_session_factory, + dataset_uuid="dataset_uuid", + table_name="table", + frac=1.1, ) with pytest.raises(ValueError, match="Invalid value for parameter `frac`"): collect_dataset_metadata( - store=store_session_factory, dataset_uuid="dataset_uuid", frac=0.0, + store=store_session_factory, + dataset_uuid="dataset_uuid", + table_name="table", + frac=0.0, ) diff --git a/tests/io/dask/dataframe/test_update.py b/tests/io/dask/dataframe/test_update.py index 712f8d0d..4d884ad6 100644 --- a/tests/io/dask/dataframe/test_update.py +++ b/tests/io/dask/dataframe/test_update.py @@ -14,31 +14,25 @@ def bound_update_dataset(): return _update_dataset -def _id(part): - if isinstance(part, pd.DataFrame): - return part - else: - return part[0] +def _unwrap_partition(part): + return next(iter(dict(part["data"]).values())) def _update_dataset(partitions, *args, **kwargs): # TODO: Simplify once parse_input_to_metapartition is removed / obsolete - if isinstance(partitions, pd.DataFrame): + table_name = "core" partitions = dd.from_pandas(partitions, npartitions=1) - elif partitions is not None: - delayed_partitions = [dask.delayed(_id)(part) for part in partitions] + elif any(partitions): + table_name = next(iter(dict(partitions[0]["data"]).keys())) + delayed_partitions = [ + dask.delayed(_unwrap_partition)(part) for part in partitions + ] partitions = dd.from_delayed(delayed_partitions) else: + table_name = "core" partitions = None - - # Replace `table_name` with `table` 
keyword argument to enable shared test code - # via `bound_update_dataset` fixture - if "table_name" in kwargs: - kwargs["table"] = kwargs["table_name"] - del kwargs["table_name"] - - ddf = update_dataset_from_ddf(partitions, *args, **kwargs) + ddf = update_dataset_from_ddf(partitions, *args, table=table_name, **kwargs) s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL) ddf = pickle.loads(s) @@ -50,6 +44,22 @@ def _return_none(): return None +def test_delayed_as_delete_scope(store_factory, df_all_types): + # Check that delayed objects are allowed as delete scope. + tasks = update_dataset_from_ddf( + dd.from_pandas(df_all_types, npartitions=1), + store_factory, + dataset_uuid="output_dataset_uuid", + table="core", + delete_scope=dask.delayed(_return_none)(), + ) + + s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL) + tasks = pickle.loads(s) + + tasks.compute() + + @pytest.mark.parametrize("shuffle", [True, False]) def test_update_dataset_from_ddf_empty(store_factory, shuffle): with pytest.raises(ValueError, match="Cannot store empty datasets"): diff --git a/tests/io/dask/delayed/test_merge.py b/tests/io/dask/delayed/test_merge.py new file mode 100644 index 00000000..750dac26 --- /dev/null +++ b/tests/io/dask/delayed/test_merge.py @@ -0,0 +1,19 @@ +import pickle + +import dask +import pytest + +from kartothek.io.dask.delayed import merge_datasets_as_delayed +from kartothek.io.testing.merge import * # noqa + + +def _merge_datasets(*args, **kwargs): + df_list = merge_datasets_as_delayed(*args, **kwargs) + s = pickle.dumps(df_list, pickle.HIGHEST_PROTOCOL) + df_list = pickle.loads(s) + return dask.compute(df_list)[0] + + +@pytest.fixture +def bound_merge_datasets(request): + return _merge_datasets diff --git a/tests/io/dask/delayed/test_read.py b/tests/io/dask/delayed/test_read.py index 3ead1b15..30c02891 100644 --- a/tests/io/dask/delayed/test_read.py +++ b/tests/io/dask/delayed/test_read.py @@ -3,20 +3,29 @@ import pytest -from kartothek.io.dask.delayed import read_dataset_as_delayed +from kartothek.io.dask.delayed import ( + read_dataset_as_delayed, + read_dataset_as_delayed_metapartitions, + read_table_as_delayed, +) from kartothek.io.testing.read import * # noqa -@pytest.fixture(params=["table"]) +@pytest.fixture(params=["dataframe", "metapartition", "table"]) def output_type(request): return request.param def _load_dataframes(output_type, *args, **kwargs): - if "tables" in kwargs: - param_tables = kwargs.pop("tables") - kwargs["table"] = param_tables - func = partial(read_dataset_as_delayed) + if output_type == "dataframe": + func = read_dataset_as_delayed + elif output_type == "metapartition": + func = read_dataset_as_delayed_metapartitions + elif output_type == "table": + if "tables" in kwargs: + param_tables = kwargs.pop("tables") + kwargs["table"] = param_tables + func = partial(read_table_as_delayed) tasks = func(*args, **kwargs) s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL) diff --git a/tests/io/dask/test_common_cube.py b/tests/io/dask/test_common_cube.py index fd835ad6..118a2180 100644 --- a/tests/io/dask/test_common_cube.py +++ b/tests/io/dask/test_common_cube.py @@ -21,7 +21,7 @@ def test_cube_with_valid_indices_is_not_modified_by_validation(): { "dataset_uuid": "source", "dataset_metadata_version": 4, - "schema": FakeSeedTableMetadata(), + "table_meta": {"table": FakeSeedTableMetadata()}, "partition_keys": ["p"], "indices": { "d1": {"1": ["part_1"]}, @@ -34,7 +34,7 @@ def test_cube_with_valid_indices_is_not_modified_by_validation(): { "dataset_uuid": "extra", 
"dataset_metadata_version": 4, - "schema": FakeExtraTableMetadata(), + "table_meta": {"table": FakeExtraTableMetadata()}, "partition_keys": ["p"], "indices": {"i1": {"1": ["part_1"]}}, } @@ -62,7 +62,7 @@ def test_existing_indices_are_added_when_missing_in_cube(): { "dataset_uuid": "source", "dataset_metadata_version": 4, - "schema": FakeExtraTableMetadata(), + "table_meta": {"table": FakeSeedTableMetadata()}, "partition_keys": ["p"], "indices": { "d1": {"1": ["part_1"]}, @@ -76,7 +76,7 @@ def test_existing_indices_are_added_when_missing_in_cube(): { "dataset_uuid": "extra", "dataset_metadata_version": 4, - "schema": FakeExtraTableMetadata(), + "table_meta": {"table": FakeExtraTableMetadata()}, "partition_keys": ["p"], "indices": {"i1": {"1": ["part_1"]}}, } @@ -104,7 +104,7 @@ def test_raises_when_cube_defines_index_not_in_dataset(): { "dataset_uuid": "source", "dataset_metadata_version": 4, - "schema": FakeSeedTableMetadata(), + "table_meta": {"table": FakeSeedTableMetadata()}, "partition_keys": ["p"], "indices": { "d1": {"1": ["part_1"]}, @@ -117,7 +117,7 @@ def test_raises_when_cube_defines_index_not_in_dataset(): { "dataset_uuid": "extra", "dataset_metadata_version": 4, - "schema": FakeExtraTableMetadata(), + "table_meta": {"table": FakeExtraTableMetadata()}, "partition_keys": ["p"], "indices": {"i1": {"1": ["part_1"]}}, } @@ -145,7 +145,7 @@ def test_no_indices_are_suppressed_when_they_already_exist(): { "dataset_uuid": "source", "dataset_metadata_version": 4, - "schema": FakeSeedTableMetadata(), + "table_meta": {"table": FakeSeedTableMetadata()}, "partition_keys": ["p"], "indices": { "d1": {"1": ["part_1"]}, @@ -158,7 +158,7 @@ def test_no_indices_are_suppressed_when_they_already_exist(): { "dataset_uuid": "extra", "dataset_metadata_version": 4, - "schema": FakeExtraTableMetadata(), + "table_meta": {"table": FakeExtraTableMetadata()}, "partition_keys": ["p"], "indices": {"i1": {"1": ["part_1"]}}, } diff --git a/tests/io/eager/test_commit.py b/tests/io/eager/test_commit.py index ee749dce..d711d7fe 100644 --- a/tests/io/eager/test_commit.py +++ b/tests/io/eager/test_commit.py @@ -7,6 +7,7 @@ from kartothek.core.common_metadata import make_meta from kartothek.core.dataset import DatasetMetadata +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.io.eager import ( commit_dataset, create_empty_dataset_header, @@ -14,21 +15,25 @@ store_dataframes_as_dataset, write_single_partition, ) +from kartothek.io_components.metapartition import SINGLE_TABLE def test_commit_dataset_from_metapartition(dataset_function, store): - new_data = [ - pd.DataFrame( - OrderedDict( - [ - ("P", [5]), - ("L", [5]), - ("TARGET", [5]), - ("DATE", [datetime.date(2016, 3, 23)]), - ] - ) - ) - ] + new_data = { + "data": { + SINGLE_TABLE: pd.DataFrame( + OrderedDict( + [ + ("P", [5]), + ("L", [5]), + ("TARGET", [5]), + ("DATE", [datetime.date(2016, 3, 23)]), + ] + ) + ), + "helper": pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])])), + } + } new_partition = write_single_partition( store=store, dataset_uuid=dataset_function.uuid, data=new_data ) @@ -59,17 +64,102 @@ def test_commit_dataset_from_metapartition(dataset_function, store): # Read the data and check whether the rows above are included. # This checks whether all necessary informations were updated in the header # (e.g. 
files attributes of the partitions) - actual = read_table(store=store, dataset_uuid=dataset_function.uuid) + actual = read_table( + store=store, table=SINGLE_TABLE, dataset_uuid=dataset_function.uuid + ) df_expected = pd.DataFrame( OrderedDict( [ ( "DATE", + pd.to_datetime( + [ + datetime.date(2016, 3, 23), + datetime.date(2010, 1, 1), + datetime.date(2009, 12, 31), + ] + ), + ), + ("L", [5, 1, 2]), + ("P", [5, 1, 2]), + ("TARGET", [5, 1, 2]), + ] + ) + ) + actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True) + + assert_frame_equal(df_expected, actual) + + +def test_commit_dataset_from_dict(dataset_function, store): + + new_data = { + "data": { + SINGLE_TABLE: pd.DataFrame( + OrderedDict( [ - datetime.date(2016, 3, 23), - datetime.date(2010, 1, 1), - datetime.date(2009, 12, 31), - ], + ("P", [5]), + ("L", [5]), + ("TARGET", [5]), + ("DATE", [datetime.date(2016, 3, 23)]), + ] + ) + ), + "helper": pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])])), + } + } + new_metapartition = write_single_partition( + store=store, dataset_uuid=dataset_function.uuid, data=new_data + ) + new_partition = [ + { + "label": new_metapartition.label, + "data": [(SINGLE_TABLE, None), ("helper", None)], + } + ] + pre_commit_dataset = DatasetMetadata.load_from_store( + uuid=dataset_function.uuid, store=store + ) + # Cannot assert equal since the metadata is differently ordered + assert pre_commit_dataset == dataset_function + + updated_dataset = commit_dataset( + store=store, + dataset_uuid=dataset_function.uuid, + new_partitions=new_partition, + delete_scope=None, + partition_on=None, + ) + assert updated_dataset != dataset_function + assert updated_dataset.explicit_partitions is True + + assert updated_dataset.uuid == dataset_function.uuid + assert len(updated_dataset.partitions) == len(dataset_function.partitions) + 1 + + # ensure that the new dataset is actually the one on disc + loaded_dataset = DatasetMetadata.load_from_store( + uuid=updated_dataset.uuid, store=store + ) + assert loaded_dataset == updated_dataset + + # Read the data and check whether the rows above are included. + # This checks whether all necessary informations were updated in the header + # (e.g. files attributes of the partitions) + actual = read_table( + store=store, table=SINGLE_TABLE, dataset_uuid=dataset_function.uuid + ) + df_expected = pd.DataFrame( + OrderedDict( + [ + ( + "DATE", + pd.to_datetime( + [ + datetime.date(2016, 3, 23), + datetime.date(2010, 1, 1), + datetime.date(2009, 12, 31), + ] + ), ), ("L", [5, 1, 2]), ("P", [5, 1, 2]), @@ -82,7 +172,6 @@ def test_commit_dataset_from_metapartition(dataset_function, store): assert_frame_equal(df_expected, actual) -# TODO: document changes for write_single_partition / commit workflow def test_commit_dataset_from_nested_metapartition(store): """ Check it is possible to use `commit_dataset` with nested metapartitions as input. 
@@ -94,7 +183,7 @@ def test_commit_dataset_from_nested_metapartition(store): create_empty_dataset_header( store=store, dataset_uuid="uuid", - schema=make_meta(df, "table", ["a"]), + table_meta={"table": make_meta(df, "table", ["a"])}, partition_on=["a"], ) @@ -102,7 +191,10 @@ def test_commit_dataset_from_nested_metapartition(store): for x in range(2): partitions.append( write_single_partition( - store=store, dataset_uuid="uuid", data=df, partition_on=["a"], + store=store, + dataset_uuid="uuid", + data={"data": {"table": df}}, + partition_on=["a"], ) ) @@ -118,25 +210,26 @@ def test_initial_commit(store): df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])])) dataset = create_empty_dataset_header( store=store, - schema=make_meta(df, origin="1"), + table_meta={"core": make_meta(df, origin="1")}, dataset_uuid=dataset_uuid, metadata_version=4, ) assert dataset.explicit_partitions is False + new_data = {"data": {"core": df}} new_metapartition = write_single_partition( - store=store, dataset_uuid=dataset.uuid, data=df + store=store, dataset_uuid=dataset.uuid, data=new_data ) + new_partition = [{"label": new_metapartition.label, "data": [("core", None)]}] updated_dataset = commit_dataset( store=store, dataset_uuid=dataset.uuid, - # FIXME: is this breaking and if so, is it expected? - new_partitions=[new_metapartition], + new_partitions=new_partition, delete_scope=None, partition_on=None, ) assert updated_dataset.explicit_partitions is True - actual = read_table(store=store, dataset_uuid=updated_dataset.uuid) + actual = read_table(store=store, table="core", dataset_uuid=updated_dataset.uuid) df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])])) assert_frame_equal(df_expected, actual) @@ -144,15 +237,22 @@ def test_initial_commit(store): def test_commit_dataset_only_delete(store, metadata_version): partitions = [ - pd.DataFrame({"p": [1]}), - pd.DataFrame({"p": [2]}), + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, + }, + { + "label": "cluster_2", + "data": [("core", pd.DataFrame({"p": [2]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})}, + }, ] dataset = store_dataframes_as_dataset( dfs=partitions, store=lambda: store, metadata={"dataset": "metadata"}, dataset_uuid="dataset_uuid", - secondary_indices="p", metadata_version=metadata_version, ) dataset = dataset.load_index("p", store) @@ -167,18 +267,23 @@ def test_commit_dataset_only_delete(store, metadata_version): partition_on=None, ) assert len(updated_dataset.partitions) == 1 + assert list(updated_dataset.partitions.keys()) == ["cluster_2"] assert updated_dataset.explicit_partitions is True def test_commit_dataset_delete_all(store, metadata_version): - partitions = [pd.DataFrame({"p": [1]})] - + partitions = [ + { + "label": "cluster_1", + "data": [("core", pd.DataFrame({"p": [1]}))], + "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, + } + ] dataset = store_dataframes_as_dataset( dfs=partitions, store=lambda: store, metadata={"dataset": "metadata"}, dataset_uuid="dataset_uuid", - secondary_indices="p", metadata_version=metadata_version, ) dataset = dataset.load_index("p", store) diff --git a/tests/io/eager/test_read.py b/tests/io/eager/test_read.py index 01278221..98418734 100644 --- a/tests/io/eager/test_read.py +++ b/tests/io/eager/test_read.py @@ -7,14 +7,17 @@ from kartothek.io.eager import ( read_dataset_as_dataframes, + 
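
The commit workflow restored by the test_commit.py changes above (create an empty header with table_meta, write a partition, then commit it) boils down to the following sketch. The dataset uuid "manual_commit" is a placeholder, store stands for the key-value store used by the fixtures, and the table name "core" mirrors test_initial_commit:

    import pandas as pd

    from kartothek.core.common_metadata import make_meta
    from kartothek.io.eager import (
        commit_dataset,
        create_empty_dataset_header,
        read_table,
        write_single_partition,
    )

    df = pd.DataFrame({"P": [5], "L": [5], "TARGET": [5]})

    create_empty_dataset_header(
        store=store,                      # placeholder KV store
        dataset_uuid="manual_commit",
        table_meta={"core": make_meta(df, origin="1")},
        metadata_version=4,
    )
    mp = write_single_partition(
        store=store, dataset_uuid="manual_commit", data={"data": {"core": df}}
    )
    commit_dataset(
        store=store,
        dataset_uuid="manual_commit",
        new_partitions=[{"label": mp.label, "data": [("core", None)]}],
        delete_scope=None,
        partition_on=None,
    )
    print(read_table(store=store, dataset_uuid="manual_commit", table="core"))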
read_dataset_as_metapartitions, read_table, store_dataframes_as_dataset, ) from kartothek.io.testing.read import * # noqa +from kartothek.io_components.metapartition import SINGLE_TABLE @pytest.fixture( - params=["dataframe", "table"], ids=["dataframe", "table"], + params=["dataframe", "metapartition", "table"], + ids=["dataframe", "metapartition", "table"], ) def output_type(request): # TODO: get rid of this parametrization and split properly into two functions @@ -22,7 +25,9 @@ def output_type(request): def _read_table(*args, **kwargs): - kwargs.pop("dispatch_by", None) + if "tables" in kwargs: + param_tables = kwargs.pop("tables") + kwargs["table"] = param_tables res = read_table(*args, **kwargs) if len(res): @@ -32,13 +37,13 @@ def _read_table(*args, **kwargs): return [res] -# FIXME: handle removal of metparittion function properly. -# FIXME: consolidate read_Dataset_as_dataframes (replaced by iter) def _read_dataset(output_type, *args, **kwargs): if output_type == "table": return _read_table elif output_type == "dataframe": return read_dataset_as_dataframes + elif output_type == "metapartition": + return read_dataset_as_metapartitions else: raise NotImplementedError() @@ -55,19 +60,24 @@ def backend_identifier(): def test_read_table_eager(dataset, store_session, use_categoricals): if use_categoricals: - categories = ["P"] + categories = {SINGLE_TABLE: ["P"]} else: categories = None df = read_table( - store=store_session, dataset_uuid="dataset_uuid", categoricals=categories, + store=store_session, + dataset_uuid="dataset_uuid", + table=SINGLE_TABLE, + categoricals=categories, ) expected_df = pd.DataFrame( { "P": [1, 2], "L": [1, 2], "TARGET": [1, 2], - "DATE": [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)], + "DATE": pd.to_datetime( + [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)] + ), } ) if categories: @@ -78,10 +88,46 @@ def test_read_table_eager(dataset, store_session, use_categoricals): pdt.assert_frame_equal(df, expected_df, check_dtype=True, check_like=True) + df_2 = read_table(store=store_session, dataset_uuid="dataset_uuid", table="helper") + expected_df_2 = pd.DataFrame({"P": [1, 2], "info": ["a", "b"]}) + + assert isinstance(df_2, pd.DataFrame) + + # No stability of partitions + df_2 = df_2.sort_values(by="P").reset_index(drop=True) + + pdt.assert_frame_equal(df_2, expected_df_2, check_dtype=True, check_like=True) + + df_3 = read_table( + store=store_session, + dataset_uuid="dataset_uuid", + table="helper", + predicates=[[("P", "==", 2)]], + ) + expected_df_3 = pd.DataFrame({"P": [2], "info": ["b"]}) + + assert isinstance(df_3, pd.DataFrame) + pdt.assert_frame_equal(df_3, expected_df_3, check_dtype=True, check_like=True) + + df_4 = read_table( + store=store_session, + dataset_uuid="dataset_uuid", + table="helper", + predicates=[[("info", "==", "a")]], + ) + expected_df_4 = pd.DataFrame({"P": [1], "info": ["a"]}) + + assert isinstance(df_4, pd.DataFrame) + + pdt.assert_frame_equal(df_4, expected_df_4, check_dtype=True, check_like=True) + def test_read_table_with_columns(dataset, store_session): df = read_table( - store=store_session, dataset_uuid="dataset_uuid", columns=["P", "L"], + store=store_session, + dataset_uuid="dataset_uuid", + table=SINGLE_TABLE, + columns=["P", "L"], ) expected_df = pd.DataFrame({"P": [1, 2], "L": [1, 2]}) @@ -97,6 +143,7 @@ def test_read_table_simple_list_for_cols_cats(dataset, store_session): df = read_table( store=store_session, dataset_uuid="dataset_uuid", + table=SINGLE_TABLE, columns=["P", "L"], categoricals=["P", "L"], ) 
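
The eager read tests above exercise the restored multi-table read_table signature with an explicit table argument. A condensed sketch, assuming the session store fixture and the "helper" table from these tests (all names illustrative):

    from kartothek.io.eager import read_table

    # Predicates are a list of AND-connected conjunctions that are OR-ed
    # together; here only rows of table "helper" with P == 2 are read.
    df = read_table(
        store=store_session,          # placeholder store fixture
        dataset_uuid="dataset_uuid",
        table="helper",
        predicates=[[("P", "==", 2)]],
    )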
@@ -121,7 +168,7 @@ def test_write_and_read_default_table_name(store_session): df_read_as_dfs = read_dataset_as_dataframes( "test_default_table_name", store_session ) - pd.testing.assert_frame_equal(df_write, df_read_as_dfs[0]) + pd.testing.assert_frame_equal(df_write, df_read_as_dfs[0]["table"]) @pytest.mark.parametrize("partition_on", [None, ["A", "B"]]) diff --git a/tests/io/eager/test_update.py b/tests/io/eager/test_update.py index 77a68c6c..86228292 100644 --- a/tests/io/eager/test_update.py +++ b/tests/io/eager/test_update.py @@ -28,7 +28,7 @@ def test_create_empty_header_from_pyarrow_schema(store_factory): dm = create_empty_dataset_header( store=store_factory, dataset_uuid=dataset_uuid, - schema=schema, + table_meta={"table": schema}, partition_on=["part"], ) @@ -36,7 +36,7 @@ def test_create_empty_header_from_pyarrow_schema(store_factory): write_single_partition( store=store_factory, dataset_uuid=dataset_uuid, - data=[df.loc[df["part"] == 1]], + data=[{"table": df.loc[df["part"] == 1]}], partition_on=["part"], ) ] diff --git a/tests/io/eager/test_write.py b/tests/io/eager/test_write.py index faacb56d..36c6098d 100644 --- a/tests/io/eager/test_write.py +++ b/tests/io/eager/test_write.py @@ -7,6 +7,7 @@ from kartothek.core.common_metadata import make_meta, read_schema_metadata from kartothek.core.dataset import DatasetMetadata +from kartothek.core.index import ExplicitSecondaryIndex from kartothek.core.uuid import gen_uuid from kartothek.io.eager import ( create_empty_dataset_header, @@ -34,40 +35,50 @@ def bound_store_dataframes(): def test_write_single_partition(store_factory, mock_uuid, metadata_version): create_empty_dataset_header( store=store_factory(), - schema=pd.DataFrame({"col": [1]}), + table_meta={ + "table1": pd.DataFrame({"col": [1]}), + "table2": pd.DataFrame({"other_col": ["a"]}), + }, dataset_uuid="some_dataset", metadata_version=metadata_version, - table_name="table1", ) - new_data = pd.DataFrame({"col": [1, 2]}) + new_data = { + "data": { + "table1": pd.DataFrame({"col": [1, 2]}), + "table2": pd.DataFrame({"other_col": ["a", "b"]}), + } + } keys_in_store = set(store_factory().keys()) new_mp = write_single_partition( - store=store_factory, - dataset_uuid="some_dataset", - data=new_data, - table_name="table1", + store=store_factory, dataset_uuid="some_dataset", data=new_data ) keys_in_store.add("some_dataset/table1/auto_dataset_uuid.parquet") + keys_in_store.add("some_dataset/table2/auto_dataset_uuid.parquet") assert set(store_factory().keys()) == keys_in_store expected_mp = MetaPartition( label="auto_dataset_uuid", # this will be a hash of the input - file="some_dataset/table1/auto_dataset_uuid.parquet", + files={ + "table1": "some_dataset/table1/auto_dataset_uuid.parquet", + "table2": "some_dataset/table2/auto_dataset_uuid.parquet", + }, metadata_version=4, - schema=make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"), + table_meta={ + "table1": make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"), + "table2": make_meta( + pd.DataFrame({"other_col": ["a", "b"]}), origin="table2" + ), + }, ) assert new_mp == expected_mp with pytest.raises(ValueError): # col is an integer column so this is incompatible. 
- new_data = pd.DataFrame({"col": [datetime.date(2010, 1, 1)]}) + new_data["data"]["table1"] = pd.DataFrame({"col": [datetime.date(2010, 1, 1)]}) write_single_partition( - store=store_factory, - dataset_uuid="some_dataset", - data=new_data, - table_name="table1", + store=store_factory, dataset_uuid="some_dataset", data=new_data ) @@ -75,14 +86,14 @@ def test_create_dataset_header_minimal_version(store, metadata_storage_format): with pytest.raises(NotImplementedError): create_empty_dataset_header( store=store, - schema=pd.DataFrame({"col": [1]}), + table_meta={"table": pd.DataFrame({"col": [1]})}, dataset_uuid="new_dataset_uuid", metadata_storage_format=metadata_storage_format, metadata_version=3, ) create_empty_dataset_header( store=store, - schema=pd.DataFrame({"col": [1]}), + table_meta={"table": pd.DataFrame({"col": [1]})}, dataset_uuid="new_dataset_uuid", metadata_storage_format=metadata_storage_format, metadata_version=4, @@ -90,10 +101,10 @@ def test_create_dataset_header_minimal_version(store, metadata_storage_format): def test_create_dataset_header(store, metadata_storage_format, frozen_time): - schema = make_meta(pd.DataFrame({"col": [1]}), origin="1") + table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")} new_dataset = create_empty_dataset_header( store=store, - schema=schema, + table_meta=table_meta, dataset_uuid="new_dataset_uuid", metadata_storage_format=metadata_storage_format, metadata_version=4, @@ -103,7 +114,7 @@ def test_create_dataset_header(store, metadata_storage_format, frozen_time): uuid="new_dataset_uuid", metadata_version=4, explicit_partitions=False, - schema=schema, + table_meta=table_meta, ) assert new_dataset == expected_dataset @@ -114,7 +125,7 @@ def test_create_dataset_header(store, metadata_storage_format, frozen_time): assert loaded == expected_dataset # If the read succeeds, the schema is written - read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store) + read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store, table="table") # TODO: move `store_dataframes_as_dataset` tests to generic tests or remove if redundant @@ -123,10 +134,12 @@ def test_store_dataframes_as_dataset_no_pipeline_partition_on(store): {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) + df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)}) + dataset = store_dataframes_as_dataset( store=store, dataset_uuid="dataset_uuid", - dfs=[df], + dfs=[{"core": df, "helper": df2}], partition_on="P", metadata_version=4, ) @@ -139,20 +152,23 @@ def test_store_dataframes_as_dataset_no_pipeline_partition_on(store): assert dataset == stored_dataset -def test_store_dataframes_as_dataset_partition_on_inconsistent(store): +@pytest.mark.parametrize("test_input", ["NOT_IN_COLUMNS"]) +def test_store_dataframes_as_dataset_partition_on_inconsistent(test_input, store): df = pd.DataFrame( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) + df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)}) + with pytest.raises(ValueError) as excinfo: store_dataframes_as_dataset( store=store, dataset_uuid="dataset_uuid", - dfs=[df], - partition_on=["NOT_IN_COLUMNS"], + dfs=[{"core": df, "helper": df2}], + partition_on=[test_input], metadata_version=4, ) - assert str(excinfo.value) == "Partition column(s) missing: NOT_IN_COLUMNS" + assert str(excinfo.value) == "Partition column(s) missing: {}".format(test_input) def test_store_dataframes_as_dataset_no_pipeline(metadata_version, store): @@ -160,10 
+176,12 @@ def test_store_dataframes_as_dataset_no_pipeline(metadata_version, store): {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) + df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)}) + dataset = store_dataframes_as_dataset( store=store, dataset_uuid="dataset_uuid", - dfs=[df], + dfs=[{"core": df, "helper": df2}], metadata_version=metadata_version, ) @@ -176,12 +194,35 @@ def test_store_dataframes_as_dataset_no_pipeline(metadata_version, store): assert dataset == stored_dataset +def test_store_dataframes_as_dataset_dfs_input_formats(store): + df1 = pd.DataFrame({"B": [pd.Timestamp("2019")]}) + df2 = pd.DataFrame({"A": [1.4]}) + formats = [ + {"data": {"D": df1, "S": df2}}, + {"D": df1, "S": df2}, + {"data": [("D", df1), ("S", df2)]}, + [("D", df1), ("S", df2)], + ] + for input_format in formats: + dataset = store_dataframes_as_dataset( + store=store, dataset_uuid="dataset_uuid", dfs=[input_format], overwrite=True + ) + stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store) + assert dataset == stored_dataset + + def test_store_dataframes_as_dataset_mp(metadata_version, store): df = pd.DataFrame( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - mp = MetaPartition(label=gen_uuid(), data=df, metadata_version=metadata_version,) + df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)}) + + mp = MetaPartition( + label=gen_uuid(), + data={"core": df, "helper": df2}, + metadata_version=metadata_version, + ) dataset = store_dataframes_as_dataset( store=store, @@ -204,8 +245,13 @@ def test_write_single_partition_different_partitioning(store_factory): OrderedDict([("location", ["0", "1", "2"]), ("other", ["a", "a", "a"])]) ) - input_ = [df] - + input_ = [ + { + "label": "label", + "data": [("order_proposals", df)], + "indices": {"location": {k: ["label"] for k in df["location"].unique()}}, + } + ] dataset = store_dataframes_as_dataset( dfs=input_, store=store_factory, @@ -214,11 +260,13 @@ def test_write_single_partition_different_partitioning(store_factory): partition_on=["other"], ) - new_data = [ - pd.DataFrame( - OrderedDict([("other", ["b", "b", "b"]), ("location", ["0", "1", "2"])]) - ) - ] + new_data = { + "data": { + "order_proposals": pd.DataFrame( + OrderedDict([("other", ["b", "b", "b"]), ("location", ["0", "1", "2"])]) + ) + } + } initial_keys = len(list(store_factory().keys())) with pytest.raises(ValueError): write_single_partition( @@ -236,8 +284,29 @@ def test_write_single_partition_different_partitioning(store_factory): ) assert initial_keys + 1 == len(list(store_factory().keys())) + new_data["label"] = "some_other_label" # If no partitioning is given, it will be determined based on the existing dataset write_single_partition( store=store_factory, dataset_uuid=dataset.uuid, data=new_data ) assert initial_keys + 2 == len(list(store_factory().keys())) + + +def test_store_dataframes_as_dataset_does_not_allow_invalid_indices(store_factory): + partitions = [ + { + "label": "part1", + "data": [("core", pd.DataFrame({"p": [1, 2]}))], + "indices": {"x": ExplicitSecondaryIndex("x", {1: ["part1"], 2: ["part2"]})}, + } + ] + + with pytest.raises( + ValueError, match="In table core, no column corresponding to index x" + ): + store_dataframes_as_dataset( + dfs=partitions, + store=store_factory, + metadata={"dataset": "metadata"}, + dataset_uuid="dataset_uuid", + ) diff --git a/tests/io/iter/test_read.py b/tests/io/iter/test_read.py index ccf6f804..ef0838d9 100644 --- 
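
The accepted per-partition input shapes restored by test_store_dataframes_as_dataset_dfs_input_formats above can be summarised in a short sketch (store and dataset uuid are placeholders; the table names "D" and "S" come from the test):

    import pandas as pd

    from kartothek.io.eager import store_dataframes_as_dataset

    df1 = pd.DataFrame({"B": [pd.Timestamp("2019")]})
    df2 = pd.DataFrame({"A": [1.4]})

    # All four shapes describe the same partition with tables "D" and "S".
    equivalent_inputs = [
        {"data": {"D": df1, "S": df2}},
        {"D": df1, "S": df2},
        {"data": [("D", df1), ("S", df2)]},
        [("D", df1), ("S", df2)],
    ]
    for dfs_input in equivalent_inputs:
        store_dataframes_as_dataset(
            store=store,                  # placeholder KV store
            dataset_uuid="dataset_uuid",
            dfs=[dfs_input],
            overwrite=True,
        )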
a/tests/io/iter/test_read.py +++ b/tests/io/iter/test_read.py @@ -9,7 +9,7 @@ from kartothek.io.testing.read import * # noqa -@pytest.fixture(params=["dataframe"]) +@pytest.fixture(params=["dataframe", "metapartition"]) def output_type(request): return request.param @@ -17,6 +17,8 @@ def output_type(request): def _load_dataframes(output_type, *args, **kwargs): if output_type == "dataframe": func = read_dataset_as_dataframes__iterator + elif output_type == "metapartition": + func = read_dataset_as_metapartitions__iterator else: raise ValueError("Unknown output type {}".format(output_type)) return list(func(*args, **kwargs)) diff --git a/tests/io_components/test_dataset.py b/tests/io_components/test_dataset.py index 3975f9f1..d4cfd7a8 100644 --- a/tests/io_components/test_dataset.py +++ b/tests/io_components/test_dataset.py @@ -16,7 +16,7 @@ def test_dataset_get_indices_as_dataframe_partition_keys_only( dataset_with_index, store_session ): expected = pd.DataFrame( - {"P": [1, 2]}, + OrderedDict([("P", [1, 2])]), index=pd.Index(["P=1/cluster_1", "P=2/cluster_2"], name="partition"), ) ds = dataset_with_index.load_partition_indices() diff --git a/tests/io_components/test_metapartition.py b/tests/io_components/test_metapartition.py index 0d552132..daea10d1 100644 --- a/tests/io_components/test_metapartition.py +++ b/tests/io_components/test_metapartition.py @@ -1,3 +1,7 @@ +# -*- coding: utf-8 -*- + + +import string from collections import OrderedDict from datetime import date, datetime @@ -20,21 +24,29 @@ from kartothek.serialization import DataFrameSerializer, ParquetSerializer -def test_store_single_dataframe_as_partition(store, metadata_version): +def test_store_single_dataframe_as_partition( + store, metadata_storage_format, metadata_version +): df = pd.DataFrame( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) - mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version) + mp = MetaPartition( + label="test_label", data={"core": df}, metadata_version=metadata_version + ) meta_partition = mp.store_dataframes( - store=store, df_serializer=ParquetSerializer(), dataset_uuid="dataset_uuid" + store=store, + df_serializer=ParquetSerializer(), + dataset_uuid="dataset_uuid", + store_metadata=True, + metadata_storage_format=metadata_storage_format, ) - assert meta_partition.data is None + assert len(meta_partition.data) == 0 - expected_key = "dataset_uuid/table/test_label.parquet" + expected_key = "dataset_uuid/core/test_label.parquet" - assert meta_partition.file == expected_key + assert meta_partition.files == {"core": expected_key} assert meta_partition.label == "test_label" files_in_store = list(store.keys()) @@ -47,37 +59,123 @@ def test_store_single_dataframe_as_partition(store, metadata_version): assert len(files_in_store) == expected_num_files - 1 -def test_load_dataframe_logical_conjunction(store, metadata_version): +def test_store_single_dataframe_as_partition_no_metadata(store, metadata_version): + df = pd.DataFrame( + {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} + ) + mp = MetaPartition( + label="test_label", data={"core": df}, metadata_version=metadata_version + ) + partition = mp.store_dataframes( + store=store, + df_serializer=ParquetSerializer(), + dataset_uuid="dataset_uuid", + store_metadata=False, + ) + + assert len(partition.data) == 0 + + expected_file = "dataset_uuid/core/test_label.parquet" + + assert partition.files == {"core": expected_file} + assert partition.label == "test_label" + + # One 
meta one actual file + files_in_store = list(store.keys()) + assert len(files_in_store) == 1 + + stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_file) + pdt.assert_frame_equal(df, stored_df) + + +def test_load_dataframe_logical_conjunction( + store, meta_partitions_files_only, metadata_version, metadata_storage_format +): df = pd.DataFrame( {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} ) mp = MetaPartition( label="cluster_1", - data=df, + data={"core": df}, metadata_version=metadata_version, logical_conjunction=[("P", ">", 4)], ) meta_partition = mp.store_dataframes( - store=store, df_serializer=None, dataset_uuid="dataset_uuid" + store=store, + df_serializer=None, + dataset_uuid="dataset_uuid", + store_metadata=True, + metadata_storage_format=metadata_storage_format, ) predicates = None loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates) - data = pd.DataFrame( - {"P": [5, 6, 7, 8, 9], "L": [5, 6, 7, 8, 9], "TARGET": [15, 16, 17, 18, 19]} - ).set_index(np.arange(5, 10)) - pdt.assert_frame_equal(loaded_mp.data, data) + data = { + "core": pd.DataFrame( + {"P": [5, 6, 7, 8, 9], "L": [5, 6, 7, 8, 9], "TARGET": [15, 16, 17, 18, 19]} + ).set_index(np.arange(5, 10)) + } + pdt.assert_frame_equal(loaded_mp.data["core"], data["core"]) predicates = [[("L", ">", 6), ("TARGET", "<", 18)]] loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates) - data = pd.DataFrame({"P": [7], "L": [7], "TARGET": [17]}).set_index(np.array([7])) - pdt.assert_frame_equal(loaded_mp.data, data) + data = { + "core": pd.DataFrame({"P": [7], "L": [7], "TARGET": [17]}).set_index( + np.array([7]) + ) + } + pdt.assert_frame_equal(loaded_mp.data["core"], data["core"]) predicates = [[("L", ">", 2), ("TARGET", "<", 17)], [("TARGET", "==", 19)]] loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates) - data = pd.DataFrame( - {"P": [5, 6, 9], "L": [5, 6, 9], "TARGET": [15, 16, 19]} - ).set_index(np.array([5, 6, 9])) - pdt.assert_frame_equal(loaded_mp.data, data) + data = { + "core": pd.DataFrame( + {"P": [5, 6, 9], "L": [5, 6, 9], "TARGET": [15, 16, 19]} + ).set_index(np.array([5, 6, 9])) + } + pdt.assert_frame_equal(loaded_mp.data["core"], data["core"]) + + +def test_store_multiple_dataframes_as_partition( + store, metadata_storage_format, metadata_version +): + df = pd.DataFrame( + {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)} + ) + df_2 = pd.DataFrame({"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]}) + mp = MetaPartition( + label="cluster_1", + data={"core": df, "helper": df_2}, + metadata_version=metadata_version, + ) + meta_partition = mp.store_dataframes( + store=store, + df_serializer=None, + dataset_uuid="dataset_uuid", + store_metadata=True, + metadata_storage_format=metadata_storage_format, + ) + + expected_file = "dataset_uuid/core/cluster_1.parquet" + expected_file_helper = "dataset_uuid/helper/cluster_1.parquet" + + assert meta_partition.files == { + "core": expected_file, + "helper": expected_file_helper, + } + assert meta_partition.label == "cluster_1" + + files_in_store = list(store.keys()) + assert len(files_in_store) == 2 + + stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_file) + pdt.assert_frame_equal(df, stored_df) + files_in_store.remove(expected_file) + + stored_df = DataFrameSerializer.restore_dataframe( + store=store, key=expected_file_helper + ) + pdt.assert_frame_equal(df_2, stored_df) + 
files_in_store.remove(expected_file_helper) @pytest.mark.parametrize("predicate_pushdown_to_io", [True, False]) @@ -86,48 +184,62 @@ def test_load_dataframes( ): expected_df = pd.DataFrame( OrderedDict( - [("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])] + [ + ("P", [1]), + ("L", [1]), + ("TARGET", [1]), + ("DATE", pd.to_datetime([date(2010, 1, 1)])), + ] ) ) + expected_df_2 = pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])])) mp = meta_partitions_files_only[0] - assert mp.file - assert mp.data is not None + assert len(mp.files) > 0 + assert len(mp.data) == 0 mp = meta_partitions_files_only[0].load_dataframes( store=store_session, predicate_pushdown_to_io=predicate_pushdown_to_io ) - assert mp.data is not None + assert len(mp.data) == 2 data = mp.data - pdt.assert_frame_equal(data, expected_df, check_dtype=False) + pdt.assert_frame_equal(data[SINGLE_TABLE], expected_df, check_dtype=False) + pdt.assert_frame_equal(data["helper"], expected_df_2, check_dtype=False) empty_mp = MetaPartition("empty_mp", metadata_version=mp.metadata_version) empty_mp.load_dataframes( store_session, predicate_pushdown_to_io=predicate_pushdown_to_io ) - assert empty_mp.data is None + assert empty_mp.data == {} def test_remove_dataframes(meta_partitions_files_only, store_session): mp = meta_partitions_files_only[0].load_dataframes(store=store_session) - assert mp.data is not None + assert len(mp.data) == 2 mp = mp.remove_dataframes() - assert mp.data is None + assert mp.data == {} def test_load_dataframes_selective(meta_partitions_files_only, store_session): expected_df = pd.DataFrame( OrderedDict( - [("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])] + [ + ("P", [1]), + ("L", [1]), + ("TARGET", [1]), + ("DATE", pd.to_datetime([date(2010, 1, 1)])), + ] ) ) mp = meta_partitions_files_only[0] - assert mp.file is not None - assert mp.data is not None - mp = meta_partitions_files_only[0].load_dataframes(store=store_session) - + assert len(mp.files) > 0 + assert len(mp.data) == 0 + mp = meta_partitions_files_only[0].load_dataframes( + store=store_session, tables=[SINGLE_TABLE] + ) + assert len(mp.data) == 1 data = mp.data - pdt.assert_frame_equal(data, expected_df, check_dtype=False) + pdt.assert_frame_equal(data[SINGLE_TABLE], expected_df, check_dtype=False) def test_load_dataframes_columns_projection( @@ -135,36 +247,60 @@ def test_load_dataframes_columns_projection( ): expected_df = pd.DataFrame(OrderedDict([("P", [1]), ("L", [1]), ("HORIZON", [1])])) mp = meta_partitions_evaluation_files_only[0] - assert mp.file is not None - assert mp.data is not None + assert len(mp.files) > 0 + assert len(mp.data) == 0 mp = meta_partitions_evaluation_files_only[0].load_dataframes( - store=store_session, columns=["P", "L", "HORIZON"] + store=store_session, tables=["PRED"], columns={"PRED": ["P", "L", "HORIZON"]} ) - + assert len(mp.data) == 1 data = mp.data - pdt.assert_frame_equal(data, expected_df, check_dtype=False) + pdt.assert_frame_equal(data["PRED"], expected_df, check_dtype=False) def test_load_dataframes_columns_raises_missing( meta_partitions_evaluation_files_only, store_session ): mp = meta_partitions_evaluation_files_only[0] - assert mp.file is not None - assert mp.data is not None + assert len(mp.files) > 0 + assert len(mp.data) == 0 with pytest.raises(ValueError) as e: meta_partitions_evaluation_files_only[0].load_dataframes( - store=store_session, columns=["P", "L", "HORIZON", "foo", "bar"] + store=store_session, + tables=["PRED"], + columns={"PRED": ["P", "L", 
"HORIZON", "foo", "bar"]}, ) assert str(e.value) == "Columns cannot be found in stored dataframe: bar, foo" +def test_load_dataframes_columns_table_missing( + meta_partitions_evaluation_files_only, store_session +): + # test behavior of load_dataframes for columns argument given + # specifying table that doesn't exist + mp = meta_partitions_evaluation_files_only[0] + assert len(mp.files) > 0 + assert len(mp.data) == 0 + with pytest.raises( + ValueError, + match=r"You are trying to read columns from invalid table\(s\). .*PRED_typo.*", + ): + mp.load_dataframes( + store=store_session, + columns={"PRED_typo": ["P", "L", "HORIZON", "foo", "bar"]}, + ) + + # ensure typo in tables argument doesn't raise, as specified in docstring + dfs = mp.load_dataframes(store=store_session, tables=["PRED_typo"]) + assert len(dfs) > 0 + + def test_from_dict(): df = pd.DataFrame({"a": [1]}) - dct = {"data": df, "label": "test_label"} + dct = {"data": {"core": df}, "label": "test_label"} meta_partition = MetaPartition.from_dict(dct) - pdt.assert_frame_equal(meta_partition.data, df) + pdt.assert_frame_equal(meta_partition.data["core"], df) assert meta_partition.metadata_version == DEFAULT_METADATA_VERSION @@ -175,74 +311,91 @@ def test_eq(): df_diff_col = pd.DataFrame({"b": [1]}) df_diff_type = pd.DataFrame({"b": [1.0]}) - meta_partition = MetaPartition.from_dict({"label": "test_label", "data": df}) + meta_partition = MetaPartition.from_dict( + {"label": "test_label", "data": {"core": df}} + ) assert meta_partition == meta_partition meta_partition_same = MetaPartition.from_dict( - {"label": "test_label", "data": df_same} + {"label": "test_label", "data": {"core": df_same}} ) assert meta_partition == meta_partition_same meta_partition_diff_label = MetaPartition.from_dict( - {"label": "another_label", "data": df} + {"label": "another_label", "data": {"core": df}} ) assert meta_partition != meta_partition_diff_label assert meta_partition_diff_label != meta_partition meta_partition_diff_files = MetaPartition.from_dict( - {"label": "another_label", "data": df, "file": "something"} + {"label": "another_label", "data": {"core": df}, "files": {"core": "something"}} ) assert meta_partition != meta_partition_diff_files assert meta_partition_diff_files != meta_partition meta_partition_diff_col = MetaPartition.from_dict( - {"label": "test_label", "data": df_diff_col} + {"label": "test_label", "data": {"core": df_diff_col}} ) assert meta_partition != meta_partition_diff_col assert meta_partition_diff_col != meta_partition meta_partition_diff_type = MetaPartition.from_dict( - {"label": "test_label", "data": df_diff_type} + {"label": "test_label", "data": {"core": df_diff_type}} ) assert meta_partition != meta_partition_diff_type assert meta_partition_diff_type != meta_partition meta_partition_diff_metadata = MetaPartition.from_dict( - {"label": "test_label", "data": df_diff_type} + { + "label": "test_label", + "data": {"core": df_diff_type}, + "dataset_metadata": {"some": "metadata"}, + } ) assert meta_partition != meta_partition_diff_metadata assert meta_partition_diff_metadata != meta_partition meta_partition_different_df = MetaPartition.from_dict( - {"label": "test_label", "data": df_other} + {"label": "test_label", "data": {"core": df_other}} ) assert not meta_partition == meta_partition_different_df + meta_partition_different_label = MetaPartition.from_dict( + {"label": "test_label", "data": {"not_core": df_same}} + ) + assert not meta_partition == meta_partition_different_label + meta_partition_empty_data = 
MetaPartition.from_dict( - {"label": "test_label", "data": None} + {"label": "test_label", "data": {}} ) assert meta_partition_empty_data == meta_partition_empty_data + meta_partition_more_data = MetaPartition.from_dict( + {"label": "test_label", "data": {"core": df, "not_core": df}} + ) + assert not (meta_partition == meta_partition_more_data) + assert not meta_partition == "abc" def test_add_nested_to_plain(): mp = MetaPartition( label="label_1", - file="file", - data=pd.DataFrame({"test": [1, 2, 3]}), + files={"core": "file"}, + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": [1, 2, 3]}, + dataset_metadata={"dataset": "metadata"}, ) to_nest = [ MetaPartition( label="label_2", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ), MetaPartition( label="label_22", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ), ] @@ -258,15 +411,17 @@ def test_add_nested_to_nested(): mps1 = [ MetaPartition( label="label_1", - file="file", - data=pd.DataFrame({"test": [1, 2, 3]}), + files={"core": "file"}, + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": [1, 2, 3]}, + dataset_metadata={"dataset": "metadata"}, ), MetaPartition( label="label_33", - file="file", - data=pd.DataFrame({"test": [1, 2, 3]}), + files={"core": "file"}, + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": [1, 2, 3]}, + dataset_metadata={"dataset": "metadata"}, ), ] @@ -275,12 +430,12 @@ def test_add_nested_to_nested(): mps2 = [ MetaPartition( label="label_2", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ), MetaPartition( label="label_22", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ), ] @@ -298,14 +453,15 @@ def test_add_nested_to_nested(): def test_eq_nested(): mp_1 = MetaPartition( label="label_1", - file="file", - data=pd.DataFrame({"test": [1, 2, 3]}), + files={"core": "file"}, + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": [1, 2, 3]}, + dataset_metadata={"dataset": "metadata"}, ) mp_2 = MetaPartition( label="label_2", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ) @@ -315,7 +471,9 @@ def test_eq_nested(): assert mp != mp_2 assert mp_2 != mp - mp_other = MetaPartition(label="label_3", data=pd.DataFrame({"test": [4, 5, 6]})) + mp_other = MetaPartition( + label="label_3", data={"core": pd.DataFrame({"test": [4, 5, 6]})} + ) mp_other = mp_1.add_metapartition(mp_other) assert mp != mp_other assert mp_other != mp @@ -324,19 +482,68 @@ def test_eq_nested(): def test_nested_incompatible_meta(): mp = MetaPartition( label="label_1", - data=pd.DataFrame({"test": np.array([1, 2, 3], dtype=np.int8)}), + data={"core": pd.DataFrame({"test": np.array([1, 2, 3], dtype=np.int8)})}, metadata_version=4, ) mp_2 = MetaPartition( label="label_2", - data=pd.DataFrame({"test": np.array([4, 5, 6], dtype=np.float64)}), + data={"core": pd.DataFrame({"test": np.array([4, 5, 6], dtype=np.float64)})}, metadata_version=4, ) with pytest.raises(ValueError): mp.add_metapartition(mp_2) +def test_concatenate_no_change(): + input_dct = { + "first_0": pd.DataFrame({"A": [1], "B": [1]}), + "second": pd.DataFrame({"A": [3], "B": [3], "C": [3]}), + } + dct = {"label": "test_label", "data": input_dct} + meta_partition 
= MetaPartition.from_dict(dct) + result = meta_partition.concat_dataframes() + assert result == meta_partition + + +def test_concatenate_identical_col_df(): + input_dct = { + "first_0": pd.DataFrame({"A": [1], "B": [1]}), + "first_1": pd.DataFrame({"A": [2], "B": [2]}), + "second": pd.DataFrame({"A": [3], "B": [3], "C": [3]}), + } + dct = {"label": "test_label", "data": input_dct} + meta_partition = MetaPartition.from_dict(dct) + result = meta_partition.concat_dataframes().data + + assert len(result) == 2 + assert "first" in result + first_expected = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + pdt.assert_frame_equal(result["first"], first_expected) + assert "second" in result + first_expected = pd.DataFrame({"A": [3], "B": [3], "C": [3]}) + pdt.assert_frame_equal(result["second"], first_expected) + + +def test_concatenate_identical_col_df_naming(): + input_dct = { + "some": pd.DataFrame({"A": [1], "B": [1]}), + "name": pd.DataFrame({"A": [2], "B": [2]}), + "second": pd.DataFrame({"A": [3], "B": [3], "C": [3]}), + } + dct = {"label": "test_label", "data": input_dct} + meta_partition = MetaPartition.from_dict(dct) + result = meta_partition.concat_dataframes().data + + assert len(result) == 2 + assert "some_name" in result + first_expected = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + pdt.assert_frame_equal(result["some_name"], first_expected) + assert "second" in result + first_expected = pd.DataFrame({"A": [3], "B": [3], "C": [3]}) + pdt.assert_frame_equal(result["second"], first_expected) + + def test_unique_label(): label_list = ["first_0", "first_1"] @@ -355,6 +562,88 @@ def test_unique_label(): assert _unique_label(label_list) == "something_else" +def test_merge_dataframes(): + df_core = pd.DataFrame( + { + "P": [1, 1, 1, 1, 3], + "L": [1, 2, 1, 2, 3], + "C": [1, 1, 2, 2, 3], + "TARGET": [1, 2, 3, 4, -1], + "info": ["a", "b", "c", "d", "e"], + } + ) + df_preds = pd.DataFrame( + { + "P": [1, 1, 1, 1], + "L": [1, 2, 1, 2], + "C": [1, 1, 2, 2], + "PRED": [11, 22, 33, 44], + "HORIZONS": [1, 1, 2, 2], + } + ) + mp = MetaPartition(label="part_label", data={"core": df_core, "pred": df_preds}) + + mp = mp.merge_dataframes(left="core", right="pred", output_label="merged") + + assert len(mp.data) == 1 + df_result = mp.data["merged"] + + df_expected = pd.DataFrame( + { + "P": [1, 1, 1, 1], + "L": [1, 2, 1, 2], + "C": [1, 1, 2, 2], + "PRED": [11, 22, 33, 44], + "TARGET": [1, 2, 3, 4], + "info": ["a", "b", "c", "d"], + "HORIZONS": [1, 1, 2, 2], + } + ) + pdt.assert_frame_equal(df_expected, df_result, check_like=True) + + +def test_merge_dataframes_kwargs(): + df_core = pd.DataFrame( + { + "P": [1, 1, 1, 1, 3], + "L": [1, 2, 1, 2, 3], + "C": [1, 1, 2, 2, 3], + "TARGET": [1, 2, 3, 4, -1], + "info": ["a", "b", "c", "d", "e"], + } + ) + df_preds = pd.DataFrame( + { + "P": [1, 1, 1, 1], + "L": [1, 2, 1, 2], + "C": [1, 1, 2, 2], + "PRED": [11, 22, 33, 44], + "HORIZONS": [1, 1, 2, 2], + } + ) + mp = MetaPartition(label="part_label", data={"core": df_core, "pred": df_preds}) + + mp = mp.merge_dataframes( + left="core", right="pred", output_label="merged", merge_kwargs={"how": "left"} + ) + + assert len(mp.data) == 1 + df_result = mp.data["merged"] + + df_expected = pd.DataFrame( + { + "P": [1, 1, 1, 1, 3], + "L": [1, 2, 1, 2, 3], + "C": [1, 1, 2, 2, 3], + "TARGET": [1, 2, 3, 4, -1], + "info": ["a", "b", "c", "d", "e"], + "PRED": [11, 22, 33, 44, np.NaN], + "HORIZONS": [1, 1, 2, 2, np.NaN], + } + ) + pdt.assert_frame_equal(df_expected, df_result, check_like=True) + + def test_merge_indices(): indices = [ 
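
The merge_dataframes tests above reduce to the following usage sketch; the label, table names and the left join via merge_kwargs are taken from the tests and are purely illustrative:

    import pandas as pd

    from kartothek.io_components.metapartition import MetaPartition

    mp = MetaPartition(
        label="part_label",
        data={
            "core": pd.DataFrame({"P": [1, 2], "TARGET": [10, 20]}),
            "pred": pd.DataFrame({"P": [1, 2], "PRED": [11, 22]}),
        },
    )
    # Merge the two in-memory tables into a single output table; additional
    # keyword arguments are forwarded to the underlying pandas merge.
    mp = mp.merge_dataframes(
        left="core", right="pred", output_label="merged", merge_kwargs={"how": "left"}
    )
    print(mp.data["merged"])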
MetaPartition( @@ -389,7 +678,7 @@ def test_build_indices(): [("location", ["Loc1", "Loc2"]), ("product", ["Product1", "Product2"])] ) ) - mp = MetaPartition(label="partition_label", data=df) + mp = MetaPartition(label="partition_label", data={"core": df}) result_mp = mp.build_indices(columns) result = result_mp.indices loc_index = ExplicitSecondaryIndex( @@ -406,13 +695,13 @@ def test_build_indices(): def test_add_metapartition(): mp = MetaPartition( label="label_1", - data=pd.DataFrame({"test": [1, 2, 3]}), + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": [1, 2, 3]}, ) mp_2 = MetaPartition( label="label_2", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ) @@ -426,7 +715,7 @@ def test_add_metapartition(): with pytest.raises(AttributeError): new_mp.data with pytest.raises(AttributeError): - new_mp.file + new_mp.files with pytest.raises(AttributeError): new_mp.indices with pytest.raises(AttributeError): @@ -447,7 +736,7 @@ def test_add_metapartition(): # This tests whether it is possible to add to an already nested MetaPartition mp_3 = MetaPartition( label="label_3", - data=pd.DataFrame({"test": [7, 8, 9]}), + data={"core": pd.DataFrame({"test": [7, 8, 9]})}, indices={"test": [7, 8, 9]}, ) new_mp = new_mp.add_metapartition(mp_3) @@ -470,27 +759,25 @@ def test_add_metapartition(): def test_to_dict(metadata_version): - df = pd.DataFrame({"A": [1]}) - schema = make_meta(df, origin="test") mp = MetaPartition( label="label_1", - file="file", - data=df, + files={"core": "file"}, + data={"core": "placeholder"}, indices={"test": [1, 2, 3]}, metadata_version=metadata_version, - schema=schema, + table_meta={"core": {"test": "int8"}}, ) mp_dct = mp.to_dict() assert mp_dct == { "label": "label_1", - "data": df, - "file": "file", + "data": {"core": "placeholder"}, + "files": {"core": "file"}, "indices": {"test": [1, 2, 3]}, + "dataset_metadata": {}, "metadata_version": metadata_version, - "schema": schema, + "table_meta": {"core": {"test": "int8"}}, "partition_keys": [], "logical_conjunction": None, - "table_name": SINGLE_TABLE, } @@ -504,7 +791,11 @@ def test_add_metapartition_duplicate_labels(): def test_copy(): mp = MetaPartition( - label="label_1", file="file", data=pd.DataFrame(), indices={"test": [1, 2, 3]} + label="label_1", + files={"core": "file"}, + data={"core": pd.DataFrame()}, + indices={"test": [1, 2, 3]}, + dataset_metadata={"dataset": "metadata"}, ) new_mp = mp.copy() @@ -513,32 +804,38 @@ def test_copy(): # ... 
but not the same object assert id(new_mp) != id(mp) - new_mp = mp.copy(file="new_file") + new_mp = mp.copy(files={"new": "file"}) assert id(new_mp) != id(mp) - assert new_mp.file == "new_file" + assert new_mp.files == {"new": "file"} new_mp = mp.copy(indices={"new": [1, 2, 3]}) assert id(new_mp) != id(mp) assert new_mp.indices == {"new": [1, 2, 3]} + new_mp = mp.copy(dataset_metadata={"new": "metadata"}) + assert id(new_mp) != id(mp) + assert new_mp.dataset_metadata == {"new": "metadata"} + def test_nested_copy(): mp = MetaPartition( label="label_1", - file="file", - data=pd.DataFrame({"test": [1, 2, 3]}), + files={"core": "file"}, + data={"core": pd.DataFrame({"test": [1, 2, 3]})}, indices={"test": {1: "label_1", 2: "label_2", 3: "label_3"}}, + dataset_metadata={"dataset": "metadata"}, ) mp_2 = MetaPartition( label="label_2", - data=pd.DataFrame({"test": [4, 5, 6]}), + data={"core": pd.DataFrame({"test": [4, 5, 6]})}, indices={"test": [4, 5, 6]}, ) mp = mp.add_metapartition(mp_2) assert len(mp.metapartitions) == 2 new_mp = mp.copy() + assert new_mp.dataset_metadata == mp.dataset_metadata # Check if the copy is identical assert len(new_mp.metapartitions) == len(mp.metapartitions) assert new_mp == mp @@ -549,7 +846,11 @@ def test_nested_copy(): def test_partition_on_one_level(): original_df = pd.DataFrame({"test": [1, 2, 3], "some_values": [1, 2, 3]}) mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) new_mp = mp.partition_on(["test"]) @@ -560,8 +861,8 @@ def test_partition_on_one_level(): for mp in new_mp: labels.add(mp.label) assert len(mp.data) == 1 - assert mp.data is not None - df = mp.data + assert "core" in mp.data + df = mp.data["core"] assert df._is_view # try to be agnostic about the order @@ -583,7 +884,11 @@ def test_partition_on_one_level_ts(): } ) mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) new_mp = mp.partition_on(["test"]) @@ -594,8 +899,8 @@ def test_partition_on_one_level_ts(): for mp in new_mp: labels.add(mp.label) assert len(mp.data) == 1 - assert mp.data is not None - df = mp.data + assert "core" in mp.data + df = mp.data["core"] assert df._is_view # try to be agnostic about the order @@ -615,30 +920,35 @@ def test_partition_on_roundtrip(store): original_df = pd.DataFrame( OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])]) ) - mp = MetaPartition(label="label_1", data=original_df, metadata_version=4) + mp = MetaPartition( + label="label_1", + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, + ) new_mp = mp.partition_on(["test"]) new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid") - store_schema_metadata(new_mp.schema, "some_uuid", store) + store_schema_metadata(new_mp.table_meta["core"], "some_uuid", store, "core") # Test immediately after dropping and later once with new metapartition to check table meta reloading new_mp = new_mp.load_dataframes(store=store) assert len(new_mp.metapartitions) == 3 dfs = [] for internal_mp in new_mp: - dfs.append(internal_mp.data) + dfs.append(internal_mp.data["core"]) actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True) pdt.assert_frame_equal(original_df, actual_df) for i in 
range(1, 4): # Check with fresh metapartitions new_mp = MetaPartition( - label=f"test={i}/label_1", - file=f"some_uuid/table/test={i}/label_1.parquet", + label="test={}/label_1".format(i), + files={"core": "some_uuid/core/test={}/label_1.parquet".format(i)}, metadata_version=4, ) new_mp = new_mp.load_dataframes(store=store) - actual_df = new_mp.data + actual_df = new_mp.data["core"] expected_df = pd.DataFrame(OrderedDict([("test", [i]), ("some_values", [i])])) pdt.assert_frame_equal(expected_df, actual_df) @@ -650,7 +960,11 @@ def test_partition_on_raises_no_cols_left(empty): if empty: original_df = original_df.loc[[]] mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) with pytest.raises(ValueError) as e: mp.partition_on(["test"]) @@ -663,7 +977,11 @@ def test_partition_on_raises_pocols_missing(empty): if empty: original_df = original_df.loc[[]] mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) with pytest.raises(ValueError) as e: mp.partition_on(["test", "foo", "bar"]) @@ -672,7 +990,7 @@ def test_partition_on_raises_pocols_missing(empty): def test_partition_urlencode(): original_df = pd.DataFrame({"ÖŒå": [1, 2, 3], "some_values": [1, 2, 3]}) - mp = MetaPartition(label="label_1", data=original_df, metadata_version=4) + mp = MetaPartition(label="label_1", data={"core": original_df}, metadata_version=4) new_mp = mp.partition_on(["ÖŒå"]) @@ -682,8 +1000,8 @@ def test_partition_urlencode(): for mp in new_mp: labels.add(mp.label) assert len(mp.data) == 1 - assert mp.data is not None - df = mp.data + assert "core" in mp.data + df = mp.data["core"] assert df._is_view # try to be agnostic about the order @@ -708,7 +1026,11 @@ def test_partition_two_level(): } ) mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) new_mp = mp.partition_on(["level1", "level2"]) @@ -718,8 +1040,8 @@ def test_partition_two_level(): for mp in new_mp: labels.append(mp.label) assert len(mp.data) == 1 - assert mp.data is not None - df = mp.data + assert "core" in mp.data + df = mp.data["core"] assert df._is_view # try to be agnostic about the order @@ -747,10 +1069,18 @@ def test_partition_on_nested(): } ) mp = MetaPartition( - label="label_1", file="file", data=original_df, metadata_version=4 + label="label_1", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) mp2 = MetaPartition( - label="label_2", file="file", data=original_df, metadata_version=4 + label="label_2", + files={"core": "file"}, + data={"core": original_df}, + dataset_metadata={"dataset": "metadata"}, + metadata_version=4, ) mp = mp.add_metapartition(mp2) new_mp = mp.partition_on(["level1", "level2"]) @@ -760,8 +1090,8 @@ def test_partition_on_nested(): for mp in new_mp: labels.append(mp.label) assert len(mp.data) == 1 - assert mp.data is not None - df = mp.data + assert "core" in mp.data + df = mp.data["core"] assert df._is_view # try to be agnostic about the order @@ -786,6 +1116,28 @@ def test_partition_on_nested(): assert sorted(labels) == 
sorted(expected_labels) +def test_partition_on_multiple_tables_empty_table(): + original_df = pd.DataFrame({"level1": [1, 2, 3], "no_index_col": np.arange(0, 3)}) + mp = MetaPartition( + label="label_1", + data=OrderedDict( + [ + ("core", original_df), + ("empty_table", pd.DataFrame(columns=["level1", "another_col"])), + ] + ), + metadata_version=4, + ) + new_mp = mp.partition_on("level1") + + labels = [] + for mp in new_mp: + labels.append(mp.label) + assert "empty_table" in mp.data + assert mp.data["empty_table"].empty + assert set(mp.data["empty_table"].columns) == {"another_col"} + + def test_partition_on_stable_order(): """ Assert that the partition_on algo is stable wrt to row ordering @@ -799,38 +1151,43 @@ def test_partition_on_stable_order(): df = pd.DataFrame( {"partition_key": random_index, "sorted_col": range(total_values)} ) - mp = MetaPartition(label="label_1", data=df, metadata_version=4) + mp = MetaPartition( + label="label_1", data=OrderedDict([("table", df)]), metadata_version=4 + ) new_mp = mp.partition_on("partition_key") for sub_mp in new_mp: - sub_df = sub_mp.data + sub_df = sub_mp.data["table"] assert sub_df.sorted_col.is_monotonic def test_table_meta(store): mp = MetaPartition( label="label_1", - data=pd.DataFrame( - { - "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"), - "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"), - } - ), + data={ + "core": pd.DataFrame( + { + "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"), + "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"), + } + ) + }, metadata_version=4, ) - assert mp.schema is not None + assert len(mp.table_meta) == 1 + assert "core" in mp.table_meta expected_meta = make_meta( pd.DataFrame( {"i32": np.array([], dtype="int32"), "float": np.array([], dtype="float64")} ), origin="1", ) - actual_meta = mp.schema + actual_meta = mp.table_meta["core"] assert actual_meta == expected_meta mp = mp.store_dataframes(store, "dataset_uuid") - actual_meta = mp.schema + actual_meta = mp.table_meta["core"] assert actual_meta == expected_meta @@ -844,8 +1201,8 @@ def test_partition_on_explicit_index(): ) mp = MetaPartition( label="label_1", - file="file", - data=original_df, + files={"core": "file"}, + data={"core": original_df}, indices={ "explicit_index_col": {value: ["label_1"] for value in np.arange(0, 6)} }, @@ -894,17 +1251,17 @@ def test_reconstruct_index_duplicates(store): key = ser.store(store, key_prefix, df) schema = make_meta(df, origin="1", partition_keys="index_col") - store_schema_metadata(schema, "uuid", store) + store_schema_metadata(schema, "uuid", store, "table") mp = MetaPartition( label="dontcare", - file=key, + files={"table": key}, metadata_version=4, - schema=schema, + table_meta={"table": schema}, partition_keys=["index_col"], ) mp = mp.load_dataframes(store) - df_actual = mp.data + df_actual = mp.data["table"] df_expected = pd.DataFrame( OrderedDict([("index_col", [2, 2]), ("column", list("ab"))]) ) @@ -922,18 +1279,18 @@ def test_reconstruct_index_categories(store): key = ser.store(store, key_prefix, df) schema = make_meta(df, origin="1", partition_keys="index_col") - store_schema_metadata(schema, "uuid", store) + store_schema_metadata(schema, "uuid", store, "table") mp = MetaPartition( label="index_col=2/dontcare", - file=key, + files={"table": key}, metadata_version=4, - schema=schema, + table_meta={"table": schema}, partition_keys=["index_col", "second_index_col"], ) categories = ["second_index_col", "index_col"] - mp = mp.load_dataframes(store, categoricals=categories) - df_actual = 
mp.data + mp = mp.load_dataframes(store, categoricals={"table": categories}) + df_actual = mp.data["table"] df_expected = pd.DataFrame( OrderedDict( [ @@ -958,20 +1315,20 @@ def test_reconstruct_index_empty_df(store, categoricals): key = ser.store(store, key_prefix, df) schema = make_meta(df, origin="1", partition_keys="index_col") - store_schema_metadata(schema, "uuid", store) + store_schema_metadata(schema, "uuid", store, "table") mp = MetaPartition( label="index_col=2/dontcare", - file=key, + files={"table": key}, metadata_version=4, - schema=schema, + table_meta={"table": schema}, partition_keys=["index_col"], ) categoricals = None if categoricals: - categoricals = ["index_col"] + categoricals = {"table": ["index_col"]} mp = mp.load_dataframes(store, categoricals=categoricals) - df_actual = mp.data + df_actual = mp.data["table"] df_expected = pd.DataFrame( OrderedDict([("index_col", [2, 2]), ("column", list("ab"))]) ) @@ -994,18 +1351,18 @@ def test_reconstruct_date_index(store, metadata_version, dates_as_object): key = ser.store(store, key_prefix, df) schema = make_meta(df, origin="1", partition_keys="index_col") - store_schema_metadata(schema, "uuid", store) + store_schema_metadata(schema, "uuid", store, "table") mp = MetaPartition( label="dontcare", - file=key, + files={"table": key}, metadata_version=metadata_version, - schema=schema, + table_meta={"table": schema}, partition_keys=["index_col"], ) mp = mp.load_dataframes(store, dates_as_object=dates_as_object) - df_actual = mp.data + df_actual = mp.data["table"] if dates_as_object: dt_constructor = date else: @@ -1029,23 +1386,26 @@ def test_iter_empty_metapartition(): def test_concat_metapartition(df_all_types): - mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4) - mp2 = MetaPartition(label="second", data=df_all_types, metadata_version=4) + mp1 = MetaPartition(label="first", data={"table": df_all_types}, metadata_version=4) + mp2 = MetaPartition( + label="second", data={"table": df_all_types}, metadata_version=4 + ) new_mp = MetaPartition.concat_metapartitions([mp1, mp2]) # what the label actually is, doesn't matter so much assert new_mp.label is not None + assert new_mp.tables == ["table"] df_expected = pd.concat([df_all_types, df_all_types]) - df_actual = new_mp.data + df_actual = new_mp.data["table"] pdt.assert_frame_equal(df_actual, df_expected) def test_concat_metapartition_wrong_types(df_all_types): - mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4) + mp1 = MetaPartition(label="first", data={"table": df_all_types}, metadata_version=4) df_corrupt = df_all_types.copy() df_corrupt["int8"] = "NoInteger" - mp2 = MetaPartition(label="second", data=df_corrupt, metadata_version=4) + mp2 = MetaPartition(label="second", data={"table": df_corrupt}, metadata_version=4) with pytest.raises(ValueError, match="Schema violation"): MetaPartition.concat_metapartitions([mp1, mp2]) @@ -1054,21 +1414,22 @@ def test_concat_metapartition_wrong_types(df_all_types): def test_concat_metapartition_partitioned(df_all_types): mp1 = MetaPartition( label="int8=1/1234", - data=df_all_types, + data={"table": df_all_types}, metadata_version=4, partition_keys=["int8"], ) mp2 = MetaPartition( label="int8=1/4321", - data=df_all_types, + data={"table": df_all_types}, metadata_version=4, partition_keys=["int8"], ) new_mp = MetaPartition.concat_metapartitions([mp1, mp2]) + assert new_mp.tables == ["table"] df_expected = pd.concat([df_all_types, df_all_types]) - df_actual = new_mp.data + df_actual = 
new_mp.data["table"] pdt.assert_frame_equal(df_actual, df_expected) assert new_mp.partition_keys == ["int8"] @@ -1077,13 +1438,13 @@ def test_concat_metapartition_partitioned(df_all_types): def test_concat_metapartition_different_partitioning(df_all_types): mp1 = MetaPartition( label="int8=1/1234", - data=df_all_types, + data={"table": df_all_types}, metadata_version=4, partition_keys=["int8"], ) mp2 = MetaPartition( label="float8=1.0/4321", - data=df_all_types, + data={"table": df_all_types}, metadata_version=4, partition_keys=["float8"], ) @@ -1095,21 +1456,21 @@ def test_concat_metapartition_different_partitioning(df_all_types): def test_concat_metapartition_categoricals(df_all_types): mp1 = MetaPartition( label="first", - data=pd.DataFrame({"a": [0, 0], "b": ["a", "a"]}, dtype="category"), + data={"table": pd.DataFrame({"a": [0, 0], "b": ["a", "a"]}, dtype="category")}, metadata_version=4, partition_keys=["a"], ) mp2 = MetaPartition( label="second", - data=pd.DataFrame({"a": [1, 1], "b": ["a", "b"]}, dtype="category"), + data={"table": pd.DataFrame({"a": [1, 1], "b": ["a", "b"]}, dtype="category")}, metadata_version=4, partition_keys=["a"], ) new_mp = MetaPartition.concat_metapartitions([mp1, mp2]) - assert new_mp.table_name == "table" - assert pd.api.types.is_categorical_dtype(new_mp.data["b"].dtype) + assert new_mp.tables == ["table"] + assert pd.api.types.is_categorical_dtype(new_mp.data["table"]["b"].dtype) # We can't partition on null columns (gh-262) @@ -1121,7 +1482,9 @@ def test_partition_on_scalar_intermediate(df_not_nested, col): Test against a bug where grouping leaves a scalar value """ assert len(df_not_nested) == 1 - mp = MetaPartition(label="somelabel", data=df_not_nested, metadata_version=4) + mp = MetaPartition( + label="somelabel", data={"table": df_not_nested}, metadata_version=4 + ) new_mp = mp.partition_on(col) assert len(new_mp) == 1 @@ -1129,7 +1492,7 @@ def test_partition_on_scalar_intermediate(df_not_nested, col): def test_partition_on_with_primary_index_invalid(df_not_nested): mp = MetaPartition( label="pkey=1/pkey2=2/base_label", - data=df_not_nested, + data={"table": df_not_nested}, partition_keys=["pkey", "pkey2"], metadata_version=4, ) @@ -1152,7 +1515,7 @@ def test_partition_on_with_primary_index_invalid(df_not_nested): def test_partition_on_with_primary_index(df_not_nested): mp = MetaPartition( label="pkey=1/base_label", - data=df_not_nested, + data={"table": df_not_nested}, partition_keys=["pkey"], metadata_version=4, ) @@ -1200,12 +1563,12 @@ def test_column_string_cast(df_all_types, store, metadata_version): key = ser.store(store, "uuid/table/something", df_all_types) mp = MetaPartition( label="something", - file=key, - schema=make_meta(df_all_types, origin="table"), + files={"table": key}, + table_meta={"table": make_meta(df_all_types, origin="table")}, metadata_version=metadata_version, ) mp = mp.load_dataframes(store) - df = mp.data + df = mp.data["table"] assert all(original_columns == df.columns) @@ -1215,11 +1578,18 @@ def test_partition_on_valid_schemas(): sub partitions may be different """ df = pd.DataFrame({"partition_col": [0, 1], "values": [None, "str"]}) - mp = MetaPartition(label="base_label", data=df, metadata_version=4) + mp = MetaPartition(label="base_label", data={"table": df}, metadata_version=4) mp = mp.partition_on(["partition_col"]) assert len(mp) == 2 expected_meta = make_meta(df, origin="1", partition_keys="partition_col") - assert mp.schema == expected_meta + assert mp.table_meta["table"] == expected_meta + + +def 
test_dataframe_input_to_metapartition(): + with pytest.raises(ValueError): + parse_input_to_metapartition(tuple([1])) + with pytest.raises(ValueError): + parse_input_to_metapartition("abc") def test_input_to_metaframes_empty(): @@ -1235,28 +1605,74 @@ def test_input_to_metaframes_simple(): assert isinstance(mp, MetaPartition) assert len(mp.data) == 1 - assert mp.file is None + assert len(mp.files) == 0 - df = mp.data + df = list(mp.data.values())[0] pdt.assert_frame_equal(df, df_input) assert isinstance(mp.label, str) +def test_input_to_metaframes_dict(): + df_input = { + "label": "cluster_1", + "data": [ + ("some_file", pd.DataFrame({"A": [1]})), + ("some_other_file", pd.DataFrame({"B": [2]})), + ], + } + mp = parse_input_to_metapartition(obj=df_input) + assert isinstance(mp, MetaPartition) + assert len(mp.data) == 2 + assert len(mp.files) == 0 + + assert mp.label == "cluster_1" + + data = mp.data + + df = data["some_file"] + pdt.assert_frame_equal( + df, pd.DataFrame({"A": [1]}), check_dtype=False, check_like=True + ) + + df2 = data["some_other_file"] + pdt.assert_frame_equal( + df2, pd.DataFrame({"B": [2]}), check_dtype=False, check_like=True + ) + + def test_parse_nested_input_schema_compatible_but_different(): # Ensure that input can be parsed even though the schemas are not identical but compatible - df_input = [[pd.DataFrame({"A": [None]}), pd.DataFrame({"A": ["str"]})]] + df_input = [ + [ + {"data": {"table": pd.DataFrame({"A": [None]})}}, + {"data": {"table": pd.DataFrame({"A": ["str"]})}}, + ] + ] mp = parse_input_to_metapartition(df_input, metadata_version=4) expected_schema = make_meta(pd.DataFrame({"A": ["str"]}), origin="expected") - assert mp.schema == expected_schema + assert mp.table_meta["table"] == expected_schema + + +def test_parse_input_schema_formats(): + df = pd.DataFrame({"B": [pd.Timestamp("2019")]}) + formats_obj = [ + {"data": {"table1": df}}, + {"table1": df}, + {"data": [("table1", df)]}, + [("table1", df)], + ] + for obj in formats_obj: + mp = parse_input_to_metapartition(obj=obj, metadata_version=4) + assert mp.data == {"table1": df} def test_get_parquet_metadata(store): df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)}) - mp = MetaPartition(label="test_label", data=df) - meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid") + mp = MetaPartition(label="test_label", data={"core": df},) + meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",) - actual = meta_partition.get_parquet_metadata(store=store) + actual = meta_partition.get_parquet_metadata(store=store, table_name="core") actual.drop(labels="serialized_size", axis=1, inplace=True) actual.drop(labels="row_group_compressed_size", axis=1, inplace=True) actual.drop(labels="row_group_uncompressed_size", axis=1, inplace=True) @@ -1275,10 +1691,10 @@ def test_get_parquet_metadata(store): def test_get_parquet_metadata_empty_df(store): df = pd.DataFrame() - mp = MetaPartition(label="test_label", data=df) - meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid") + mp = MetaPartition(label="test_label", data={"core": df},) + meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",) - actual = meta_partition.get_parquet_metadata(store=store) + actual = meta_partition.get_parquet_metadata(store=store, table_name="core") actual.drop( columns=[ "serialized_size", @@ -1304,13 +1720,13 @@ def test_get_parquet_metadata_empty_df(store): def test_get_parquet_metadata_row_group_size(store): df = pd.DataFrame({"P": 
np.arange(0, 10), "L": np.arange(0, 10)}) - mp = MetaPartition(label="test_label", data=df) + mp = MetaPartition(label="test_label", data={"core": df},) ps = ParquetSerializer(chunk_size=5) meta_partition = mp.store_dataframes( store=store, dataset_uuid="dataset_uuid", df_serializer=ps ) - actual = meta_partition.get_parquet_metadata(store=store) + actual = meta_partition.get_parquet_metadata(store=store, table_name="core") actual.drop( columns=[ "serialized_size", @@ -1333,25 +1749,12 @@ def test_get_parquet_metadata_row_group_size(store): pd.testing.assert_frame_equal(actual, expected) -def test__reconstruct_index_columns(): - df = pd.DataFrame({"x": [0], "a": [-1], "b": [-2], "c": [-3]}) - mp = MetaPartition(label="test_label", data=df) - df_with_index_columns = mp._reconstruct_index_columns( - df=df[["x"]], - key_indices=[("a", 1), ("b", 2), ("c", 3)], - columns=["x", "c"], - categories=None, - date_as_object=False, - ) - # Index columns first - pdt.assert_frame_equal(df_with_index_columns, pd.DataFrame({"c": [3], "x": [0]})) - +def test_get_parquet_metadata_table_name_not_str(store): + df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)}) + mp = MetaPartition(label="test_label", data={"core": df, "another_table": df},) + meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",) -def test_partition_on_keeps_table_name(): - mp = MetaPartition( - label="label_1", - data=pd.DataFrame({"P": [1, 2, 1, 2], "L": [1, 1, 2, 2]}), - table_name="non-default-name", - ) - repartitioned_mp = mp.partition_on(["P"]) - assert repartitioned_mp.table_name == "non-default-name" + with pytest.raises(TypeError): + meta_partition.get_parquet_metadata( + store=store, table_name=["core", "another_table"] + ) diff --git a/tests/io_components/test_mutate.py b/tests/io_components/test_mutate.py new file mode 100644 index 00000000..e3e87024 --- /dev/null +++ b/tests/io_components/test_mutate.py @@ -0,0 +1,234 @@ +import types + +import pandas as pd +import pytest + +from kartothek.io_components.merge import align_datasets +from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.write import store_dataset_from_partitions + + +def test_align_datasets_prefix(dataset, evaluation_dataset, store_session): + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session, + match_how="prefix", + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + # Two separate cluster_groups (e.g. cluster_1*) + assert len(list_metapartitions) == 2 + + mp_list = list_metapartitions[0] + + assert len(mp_list) == 3, [mp.label for mp in mp_list] + + mp_list = list_metapartitions[1] + assert len(mp_list) == 3, [mp.label for mp in mp_list] + + # Test sorting of datasets by length, i.e. order of dataframes is different + generator = align_datasets( + left_dataset_uuid=evaluation_dataset.uuid, + right_dataset_uuid=dataset.uuid, + store=store_session, + match_how="prefix", + ) + list_metapartitions = list(generator) + mp_list = list_metapartitions[0] + + +def test_align_datasets_prefix__equal_number_of_partitions( + dataset, evaluation_dataset, store_session +): + """ + Test a scenario where the simple prefix match algorithm didn't find any + matches in case of equal number of partitions in both datasets. 
+ """ + + # Create a reference dataset which matches the problem (equal number of + # partitions and suitable for prefix matching) + mp = MetaPartition(label="cluster_1_1", metadata_version=dataset.metadata_version) + mp2 = MetaPartition(label="cluster_2_1", metadata_version=dataset.metadata_version) + metapartitions = [mp, mp2] + store_dataset_from_partitions( + partition_list=metapartitions, + dataset_uuid="reference_dataset_uuid", + store=store_session, + ) + + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid="reference_dataset_uuid", + store=store_session, + match_how="prefix", + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + # Two separate cluster_groups (e.g. cluster_1*) + assert len(list_metapartitions) == 2 + + mp_list = list_metapartitions[0] + + assert len(mp_list) == 2 + + mp_list = list_metapartitions[1] + assert len(mp_list) == 2 + + # Test sorting of datasets by length, i.e. order of dataframes is different + generator = align_datasets( + left_dataset_uuid=evaluation_dataset.uuid, + right_dataset_uuid=dataset.uuid, + store=store_session, + match_how="prefix", + ) + list_metapartitions = list(generator) + mp_list = list_metapartitions[0] + + +def test_align_datasets_exact(dataset, evaluation_dataset, store_session): + with pytest.raises(RuntimeError): + list( + align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session, + match_how="exact", + ) + ) + + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=dataset.uuid, + store=store_session, + match_how="exact", + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + # Two separate cluster_groups (e.g. 
cluster_1*) + assert len(list_metapartitions) == 2 + + mp_list = list_metapartitions[0] + assert len(mp_list) == 2, [mp.label for mp in mp_list] + assert [mp.label for mp in mp_list] == ["cluster_1", "cluster_1"] + + mp_list = list_metapartitions[1] + assert len(mp_list) == 2, [mp.label for mp in mp_list] + assert [mp.label for mp in mp_list] == ["cluster_2", "cluster_2"] + + +def test_align_datasets_left(dataset, evaluation_dataset, store_session): + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session, + match_how="left", + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + assert len(list_metapartitions) == len(dataset.partitions) + + mp_list = list_metapartitions[0] + assert len(mp_list) == 5, [mp.label for mp in mp_list] + expected = ["cluster_1", "cluster_1_1", "cluster_1_2", "cluster_2_1", "cluster_2_2"] + assert [mp.label for mp in mp_list] == expected + + mp_list = list_metapartitions[1] + assert len(mp_list) == 5, [mp.label for mp in mp_list] + expected = ["cluster_2", "cluster_1_1", "cluster_1_2", "cluster_2_1", "cluster_2_2"] + assert [mp.label for mp in mp_list] == expected + + +def test_align_datasets_right(dataset, evaluation_dataset, store_session): + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session, + match_how="right", + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + assert len(list_metapartitions) == len(evaluation_dataset.partitions) + + mp_list = list_metapartitions[0] + assert len(mp_list) == 3, [mp.label for mp in mp_list] + expected = ["cluster_1_1", "cluster_1", "cluster_2"] + assert [mp.label for mp in mp_list] == expected + + mp_list = list_metapartitions[1] + assert len(mp_list) == 3, [mp.label for mp in mp_list] + expected = ["cluster_1_2", "cluster_1", "cluster_2"] + assert [mp.label for mp in mp_list] == expected + + mp_list = list_metapartitions[2] + assert len(mp_list) == 3, [mp.label for mp in mp_list] + expected = ["cluster_2_1", "cluster_1", "cluster_2"] + assert [mp.label for mp in mp_list] == expected + + mp_list = list_metapartitions[3] + assert len(mp_list) == 3, [mp.label for mp in mp_list] + expected = ["cluster_2_2", "cluster_1", "cluster_2"] + assert [mp.label for mp in mp_list] == expected + + +def test_align_datasets_callable(dataset, evaluation_dataset, store_session): + def comp(left, right): + return left == right + + with pytest.raises(RuntimeError): + list( + align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=evaluation_dataset.uuid, + store=store_session, + match_how=comp, + ) + ) + + generator = align_datasets( + left_dataset_uuid=dataset.uuid, + right_dataset_uuid=dataset.uuid, + store=store_session, + match_how=comp, + ) + assert isinstance(generator, types.GeneratorType) + list_metapartitions = list(generator) + + # Two separate cluster_groups (e.g. 
cluster_1*) + assert len(list_metapartitions) == 2 + + mp_list = list_metapartitions[0] + assert len(mp_list) == 2, [mp.label for mp in mp_list] + assert [mp.label for mp in mp_list] == ["cluster_1", "cluster_1"] + + mp_list = list_metapartitions[1] + assert len(mp_list) == 2, [mp.label for mp in mp_list] + assert [mp.label for mp in mp_list] == ["cluster_2", "cluster_2"] + + +def test_merge_metapartitions(): + df = pd.DataFrame({"P": [1, 1], "L": [1, 2], "TARGET": [1, 2]}) + df_2 = pd.DataFrame({"P": [1], "info": "a"}) + mp = MetaPartition(label="cluster_1", data={"core": df, "helper": df_2}) + df_3 = pd.DataFrame({"P": [1, 1], "L": [1, 2], "PRED": [0.1, 0.2]}) + + mp2 = MetaPartition(label="cluster_1", data={"predictions": df_3}) + merged_mp = MetaPartition.merge_metapartitions(metapartitions=[mp, mp2]) + + df = pd.DataFrame( + { + "P": [1, 1], + "L": [1, 2], + "TARGET": [1, 2], + "info": ["a", "a"], + "PRED": [0.1, 0.2], + } + ) + + assert merged_mp.label == "cluster_1" + assert len(merged_mp.data) == 3 diff --git a/tests/io_components/test_read.py b/tests/io_components/test_read.py index a33a1a61..ed967052 100644 --- a/tests/io_components/test_read.py +++ b/tests/io_components/test_read.py @@ -1,5 +1,6 @@ import math import types +from collections import OrderedDict import numpy as np import pandas as pd @@ -13,14 +14,50 @@ def test_dispatch_metapartitions(dataset, store_session): part_generator = dispatch_metapartitions(dataset.uuid, store_session) + assert isinstance(part_generator, types.GeneratorType) + partitions = OrderedDict([(part.label, part) for part in part_generator]) + + assert len(partitions) == 2 + mp = partitions["cluster_1"] + assert isinstance(mp, MetaPartition) + + mp = partitions["cluster_2"] + assert isinstance(mp, MetaPartition) + + assert set(mp.table_meta.keys()) == {SINGLE_TABLE, "helper"} + + +def test_dispatch_metapartitions_label_filter(dataset, store_session): + def label_filter(part_label): + return "cluster_1" in part_label + + part_generator = dispatch_metapartitions( + dataset.uuid, store_session, label_filter=label_filter + ) + + assert isinstance(part_generator, types.GeneratorType) + partitions = OrderedDict([(part.label, part) for part in part_generator]) + + assert len(partitions) == 1 + mp = partitions["cluster_1"] + assert isinstance(mp, MetaPartition) + + +def test_dispatch_metapartitions_without_dataset_metadata(dataset, store_session): + part_generator = dispatch_metapartitions( + dataset.uuid, store_session, load_dataset_metadata=False + ) + assert isinstance(part_generator, types.GeneratorType) partitions = list(part_generator) assert len(partitions) == 2 - assert len({mp.label for mp in partitions}) == 2 - for mp in partitions: - assert isinstance(mp, MetaPartition) - assert mp.table_name == SINGLE_TABLE + + mp = partitions[0] + assert mp.dataset_metadata == {} + + mp = partitions[1] + assert mp.dataset_metadata == {} @pytest.mark.parametrize( @@ -86,9 +123,21 @@ def test_dispatch_metapartitions_concat_regression(store): partition_on=["p"], ) - mps = list(dispatch_metapartitions(dataset.uuid, store)) + mps = list( + dispatch_metapartitions( + dataset.uuid, store, concat_partitions_on_primary_index=False + ) + ) assert len(mps) == 2 + with pytest.deprecated_call(): + mps = list( + dispatch_metapartitions( + dataset.uuid, store, concat_partitions_on_primary_index=True + ) + ) + assert len(mps) == 1 + mps = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by=["p"])) assert len(mps) == 1 @@ -170,7 +219,7 @@ def 
test_dispatch_metapartitions_complex_or_predicates(store_factory): dataset_uuid, store_factory, predicates=predicates ) ] - actual = pd.concat([mp.data for mp in mps]) + actual = pd.concat([mp.data["table"] for mp in mps]) actual = actual.sort_values(by="A", ignore_index=True) expected = pd.DataFrame( data={ @@ -188,7 +237,7 @@ def test_dispatch_metapartitions_complex_or_predicates(store_factory): dataset_uuid, store_factory, predicates=predicates ) ] - actual = pd.concat([mp.data for mp in mps]) + actual = pd.concat([mp.data["table"] for mp in mps]) actual = actual.sort_values(by="A", ignore_index=True) expected = pd.DataFrame( data={"A": [0, 1, 2], "B": ["A", "B", "A"], "C": [-10, -9, -8]} @@ -202,7 +251,7 @@ def test_dispatch_metapartitions_complex_or_predicates(store_factory): dataset_uuid, store_factory, predicates=predicates ) ] - actual = pd.concat([mp.data for mp in mps]) + actual = pd.concat([mp.data["table"] for mp in mps]) actual = actual.sort_values(by="A", ignore_index=True) expected = pd.DataFrame( data={ @@ -220,7 +269,7 @@ def test_dispatch_metapartitions_complex_or_predicates(store_factory): dataset_uuid, store_factory, predicates=predicates ) ] - actual = pd.concat([mp.data for mp in mps]) + actual = pd.concat([mp.data["table"] for mp in mps]) actual = actual.sort_values(by="A", ignore_index=True) expected = pd.DataFrame( data={ diff --git a/tests/io_components/test_write.py b/tests/io_components/test_write.py index 86086405..59e3d7ab 100644 --- a/tests/io_components/test_write.py +++ b/tests/io_components/test_write.py @@ -35,7 +35,7 @@ def test_store_dataset_from_partitions(meta_partitions_files_only, store, frozen # Dataset metadata: 1 file expected_number_files = 1 # common metadata for v4 datasets - expected_number_files += 1 + expected_number_files += 2 assert len(store_files) == expected_number_files # Ensure the dataset can be loaded properly @@ -46,15 +46,15 @@ def test_store_dataset_from_partitions(meta_partitions_files_only, store, frozen def test_store_dataset_from_partitions_update(store, metadata_version, frozen_time): mp1 = MetaPartition( label="cluster_1", - data=pd.DataFrame({"p": [1]}), - file="1.parquet", + data={"df": pd.DataFrame({"p": [1]})}, + files={"df": "1.parquet"}, indices={"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}, metadata_version=metadata_version, ) mp2 = MetaPartition( label="cluster_2", - data=pd.DataFrame({"p": [2]}), - file="2.parquet", + data={"df": pd.DataFrame({"p": [2]})}, + files={"df": "2.parquet"}, indices={"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})}, metadata_version=metadata_version, ) @@ -68,8 +68,8 @@ def test_store_dataset_from_partitions_update(store, metadata_version, frozen_ti mp3 = MetaPartition( label="cluster_3", - data=pd.DataFrame({"p": [3]}), - file="3.parquet", + data={"df": pd.DataFrame({"p": [3]})}, + files={"df": "3.parquet"}, indices={"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})}, metadata_version=metadata_version, ) diff --git a/tests/utils/test_ktk_adapters.py b/tests/utils/test_ktk_adapters.py index 542aa21c..16cd6867 100644 --- a/tests/utils/test_ktk_adapters.py +++ b/tests/utils/test_ktk_adapters.py @@ -11,11 +11,12 @@ from kartothek.core.cube.cube import Cube from kartothek.core.dataset import DatasetMetadata from kartothek.io.dask.bag import store_bag_as_dataset -from kartothek.io_components.metapartition import MetaPartition +from kartothek.io_components.metapartition import SINGLE_TABLE, MetaPartition from kartothek.io_components.read import 
dispatch_metapartitions from kartothek.utils.ktk_adapters import ( get_dataset_columns, get_dataset_keys, + get_dataset_schema, get_partition_dataframe, get_physical_partition_stats, metadata_factory_from_dataset, @@ -41,7 +42,9 @@ def ds(function_store, cube_has_ts_col): mps = [ MetaPartition( - label="mp{}".format(i), data=df, metadata_version=KTK_CUBE_METADATA_VERSION, + label="mp{}".format(i), + data={SINGLE_TABLE: df}, + metadata_version=KTK_CUBE_METADATA_VERSION, ) .partition_on(["p"] + (["KLEE_TS"] if cube_has_ts_col else [])) .build_indices(["i"]) @@ -59,6 +62,10 @@ def ds(function_store, cube_has_ts_col): ).compute() +def test_get_dataset_schema(ds): + assert get_dataset_schema(ds) == ds.table_meta[SINGLE_TABLE] + + def test_get_dataset_columns(ds): cols = get_dataset_columns(ds) assert cols == {"_foo", "i", "p", "x"}
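For context, a minimal sketch of the multi-table MetaPartition API that the reverted tests above exercise (kartothek <= 3.20). This is not part of the patch itself; the table and column names are illustrative. In this API, ``data`` maps table names to DataFrames, and ``partition_on()`` splits every table by the chosen columns while dropping them from the per-table frames, as asserted in the tests.

    import pandas as pd

    from kartothek.io_components.metapartition import MetaPartition

    # Two tables in one (meta)partition; the names "core"/"helper" are illustrative.
    df_core = pd.DataFrame({"level1": [1, 2, 1], "payload": [0.1, 0.2, 0.3]})
    df_helper = pd.DataFrame({"level1": [1, 2, 1], "info": ["a", "b", "c"]})

    mp = MetaPartition(
        label="label_1",
        data={"core": df_core, "helper": df_helper},
        metadata_version=4,
    )

    # partition_on yields one sub-partition per distinct value of "level1";
    # every sub-partition keeps all tables, and the partition column is
    # removed from the stored DataFrames, mirroring the assertions above.
    for sub_mp in mp.partition_on(["level1"]):
        assert set(sub_mp.data) == {"core", "helper"}
        assert "level1" not in sub_mp.data["core"].columns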