From 61513d39d06d904ee76dfb7599cd2bf51bb4b90e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 19 Mar 2024 09:31:52 -0700 Subject: [PATCH 01/38] Unpin dask/distributed for development --- continuous_integration/docker/conda.txt | 2 +- continuous_integration/docker/main.dockerfile | 2 +- continuous_integration/environment-3.10.yaml | 2 +- continuous_integration/environment-3.11.yaml | 2 +- continuous_integration/environment-3.12.yaml | 2 +- continuous_integration/gpuci/environment-3.10.yaml | 2 +- continuous_integration/gpuci/environment-3.9.yaml | 2 +- continuous_integration/recipe/meta.yaml | 2 +- docs/environment.yml | 2 +- docs/requirements-docs.txt | 2 +- pyproject.toml | 4 ++-- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/continuous_integration/docker/conda.txt b/continuous_integration/docker/conda.txt index 64892c882..3849238c4 100644 --- a/continuous_integration/docker/conda.txt +++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ python>=3.9 -dask==2024.1.1 +dask>=2024.1.1 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 diff --git a/continuous_integration/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile index 2a8c2ed5d..c2da576a0 100644 --- a/continuous_integration/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -16,7 +16,7 @@ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask==2024.1.1" \ + "dask>=2024.1.1" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 56333e288..30fd86a8f 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask==2024.1.1 +- dask>=2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 046d0a3a9..fd80633a3 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask==2024.1.1 +- dask>=2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index 6b6e15223..98b5fd583 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask==2024.1.1 +- dask>=2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 8de52bf7b..b19d47840 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -9,7 +9,7 @@ channels: dependencies: - c-compiler - zlib -- dask==2024.1.1 +- dask>=2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 7ceee8cfb..572463de4 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -9,7 +9,7 @@ channels: dependencies: - c-compiler - zlib -- dask==2024.1.1 +- dask>=2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 
- httpx>=0.24.1 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 914f9da0b..cf78f71f8 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,7 @@ requirements: - xz # [linux64] run: - python - - dask ==2024.1.1 + - dask >=2024.1.1 - pandas >=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 diff --git a/docs/environment.yml b/docs/environment.yml index 98b2f0f08..13befb971 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -6,7 +6,7 @@ dependencies: - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask==2024.1.1 + - dask>=2024.1.1 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 689599446..0f121bf7a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask==2024.1.1 +dask>=2024.1.1 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/pyproject.toml b/pyproject.toml index bcbd0a06e..fd0358289 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,8 @@ classifiers = [ readme = "README.md" requires-python = ">=3.9" dependencies = [ - "dask[dataframe]==2024.1.1", - "distributed==2024.1.1", + "dask[dataframe]>=2024.1.1", + "distributed>=2024.1.1", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", From 0fb485d66d8a5c91ddd6bc213782b3653368263d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:26:39 -0700 Subject: [PATCH 02/38] First pass at unblocking dask-expr issues - replace _Frame usage --- dask_sql/physical/rel/custom/wrappers.py | 6 +++--- dask_sql/physical/utils/filter.py | 14 +++++++------- tests/unit/test_ml_utils.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dask_sql/physical/rel/custom/wrappers.py b/dask_sql/physical/rel/custom/wrappers.py index 49d4adb64..af7619306 100644 --- a/dask_sql/physical/rel/custom/wrappers.py +++ b/dask_sql/physical/rel/custom/wrappers.py @@ -207,7 +207,7 @@ def transform(self, X): estimator=self._postfit_estimator, meta=output_meta, ) - elif isinstance(X, dd._Frame): + elif isinstance(X, dd.DataFrame): if output_meta is None: output_meta = _transform(X._meta_nonempty, self._postfit_estimator) try: @@ -305,7 +305,7 @@ def predict(self, X): ) return result - elif isinstance(X, dd._Frame): + elif isinstance(X, dd.DataFrame): if output_meta is None: # dask-dataframe relies on dd.core.no_default # for infering meta @@ -364,7 +364,7 @@ def predict_proba(self, X): meta=output_meta, chunks=(X.chunks[0], len(self._postfit_estimator.classes_)), ) - elif isinstance(X, dd._Frame): + elif isinstance(X, dd.DataFrame): if output_meta is None: # dask-dataframe relies on dd.core.no_default # for infering meta diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index aff9ab5ef..6e820cd80 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -304,10 +304,10 @@ def combine(self, other: DNF | _And | _Or | list | tuple | None) -> DNF: # Specify functions that must be generated with # a different API at the dataframe-collection level _special_op_mappings = { - M.fillna: dd._Frame.fillna, - M.isin: dd._Frame.isin, - M.isna: dd._Frame.isna, - M.astype: dd._Frame.astype, + M.fillna: dd.DataFrame.fillna, + M.isin: dd.DataFrame.isin, + 
M.isna: dd.DataFrame.isna, + M.astype: dd.DataFrame.astype, } # Convert _pass_through_ops to respect "special" mappings @@ -316,7 +316,7 @@ def combine(self, other: DNF | _And | _Or | list | tuple | None) -> DNF: def _preprocess_layers(input_layers): # NOTE: This is a Layer-specific work-around to deal with - # the fact that `dd._Frame.isin(values)` will add a distinct + # the fact that `dd.DataFrame.isin(values)` will add a distinct # `MaterializedLayer` for the `values` argument. # See: https://github.com/dask-contrib/dask-sql/issues/607 skip = set() @@ -418,9 +418,9 @@ def _dnf_filter_expression(self, dsk): func = _blockwise_logical_dnf elif op == operator.getitem: func = _blockwise_getitem_dnf - elif op == dd._Frame.isin: + elif op == dd.DataFrame.isin: func = _blockwise_isin_dnf - elif op == dd._Frame.isna: + elif op == dd.DataFrame.isna: func = _blockwise_isna_dnf elif op == operator.inv: func = _blockwise_inv_dnf diff --git a/tests/unit/test_ml_utils.py b/tests/unit/test_ml_utils.py index 7130b2bed..2c7365f00 100644 --- a/tests/unit/test_ml_utils.py +++ b/tests/unit/test_ml_utils.py @@ -98,7 +98,7 @@ def make_classification( def _assert_eq(l, r, name=None, **kwargs): array_types = (np.ndarray, da.Array) - frame_types = (pd.core.generic.NDFrame, dd._Frame) + frame_types = (pd.core.generic.NDFrame, dd.DataFrame) if isinstance(l, array_types): assert_eq_ar(l, r, **kwargs) elif isinstance(l, frame_types): From 920cd536624dcc430bb5982caed0bdc681bdc56e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:04:01 -0700 Subject: [PATCH 03/38] First pass at unblocking pytest errors --- dask_sql/context.py | 16 ++++++++++++---- dask_sql/physical/rel/logical/table_scan.py | 11 +++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/dask_sql/context.py b/dask_sql/context.py index 83d7820b9..9e4938300 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -262,15 +262,23 @@ def create_table( self.schema[schema_name].filepaths[table_name.lower()] = input_table elif hasattr(input_table, "dask") and dd.utils.is_dataframe_like(input_table): try: - dask_filepath = hlg_layer( - input_table.dask, "read-parquet" - ).creation_info["args"][0] + if dd._dask_expr_enabled(): + from dask_expr.io.parquet import ReadParquet + + dask_filepath = None + operations = input_table.find_operations(ReadParquet) + for op in operations: + dask_filepath = op._args[0] + else: + dask_filepath = hlg_layer( + input_table.dask, "read-parquet" + ).creation_info["args"][0] dc.filepath = dask_filepath self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath except KeyError: logger.debug("Expected 'read-parquet' layer") - if parquet_statistics and not statistics: + if parquet_statistics and not dd._dask_expr_enabled() and not statistics: statistics = parquet_statistics(dc.df) if statistics: row_count = 0 diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index 4a9cecc25..f6cab48cc 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -3,6 +3,7 @@ from functools import reduce from typing import TYPE_CHECKING +from dask.dataframe import _dask_expr_enabled from dask.utils_test import hlg_layer from dask_sql.datacontainer import DataContainer @@ -108,9 +109,11 @@ def _apply_filters(self, table_scan, rel, dc, context): ], ) df = filter_or_scalar(df, df_condition) - try: - logger.debug(hlg_layer(df.dask, 
"read-parquet").creation_info) - except KeyError: - pass + + if not _dask_expr_enabled(): + try: + logger.debug(hlg_layer(df.dask, "read-parquet").creation_info) + except KeyError: + pass return DataContainer(df, cc) From 7f952d8c09073792656487158fa1d02ec753327e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Apr 2024 12:42:48 -0700 Subject: [PATCH 04/38] Disable predicate pushdown & its tests if dask-expr enabled --- dask_sql/physical/rel/logical/filter.py | 3 ++- tests/integration/test_filter.py | 4 +++- tests/integration/test_select.py | 3 ++- tests/unit/test_utils.py | 8 ++++++++ tests/utils.py | 7 +++++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index af3685a11..0f1437d19 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -38,7 +38,8 @@ def filter_or_scalar( # In SQL, a NULL in a boolean is False on filtering filter_condition = filter_condition.fillna(False) out = df[filter_condition] - if dask_config.get("sql.predicate_pushdown"): + # dask-expr should implicitly handle predicate pushdown + if dask_config.get("sql.predicate_pushdown") and not dd._dask_expr_enabled(): return attempt_predicate_pushdown(out, add_filters=add_filters) else: return out diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py index 41c51d5fb..22947d8b6 100644 --- a/tests/integration/test_filter.py +++ b/tests/integration/test_filter.py @@ -5,7 +5,7 @@ from dask.utils_test import hlg_layer from packaging.version import parse as parseVersion -from tests.utils import assert_eq +from tests.utils import SKIPIF_DASK_EXPR_ENABLED, assert_eq DASK_GT_2022_4_2 = parseVersion(dask.__version__) >= parseVersion("2022.4.2") @@ -208,6 +208,7 @@ def test_filter_year(c): ), ], ) +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown(c, parquet_ddf, query, df_func, filters): # Check for predicate pushdown. 
@@ -312,6 +313,7 @@ def test_filter_decimal(c, gpu): c.drop_table("df") +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_isna(tmpdir): from dask_sql.context import Context diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index 53ebdc224..bcd4b401f 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -6,7 +6,7 @@ from dask_sql._compat import PANDAS_GT_200 from dask_sql.utils import ParsingException -from tests.utils import assert_eq +from tests.utils import SKIPIF_DASK_EXPR_ENABLED, assert_eq def test_select(c, df): @@ -259,6 +259,7 @@ def test_singular_column_selection(c): ["a", "b", "d"], ], ) +@SKIPIF_DASK_EXPR_ENABLED def test_multiple_column_projection(c, parquet_ddf, input_cols): projection_list = ", ".join(input_cols) result_df = c.sql(f"SELECT {projection_list} from parquet_ddf") diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 6dac75837..1df49868f 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -5,6 +5,7 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown from dask_sql.utils import Pluggable, is_frame +from tests.utils import SKIPIF_DASK_EXPR_ENABLED def test_is_frame_for_frame(): @@ -56,6 +57,7 @@ def test_overwrite(): assert PluginTest1().get_plugin("some_key") == "value_2" +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_simple(parquet_ddf): filtered_df = parquet_ddf[parquet_ddf["a"] > 1] pushdown_df = attempt_predicate_pushdown(filtered_df) @@ -68,6 +70,7 @@ def test_predicate_pushdown_simple(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_logical(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) | (parquet_ddf["a"] == -1) @@ -83,6 +86,7 @@ def test_predicate_pushdown_logical(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_in(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) @@ -103,6 +107,7 @@ def test_predicate_pushdown_in(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_isna(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) @@ -123,6 +128,7 @@ def test_predicate_pushdown_isna(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_add_filters(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df = attempt_predicate_pushdown( @@ -141,6 +147,7 @@ def test_predicate_pushdown_add_filters(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_add_filters_no_extract(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df = attempt_predicate_pushdown( @@ -157,6 +164,7 @@ def test_predicate_pushdown_add_filters_no_extract(parquet_ddf): assert got_filters == expected_filters +@SKIPIF_DASK_EXPR_ENABLED def test_predicate_pushdown_add_filters_no_preserve(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df0 = attempt_predicate_pushdown(filtered_df) diff --git a/tests/utils.py b/tests/utils.py index 291c3bc53..45020191f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,7 +1,14 @@ import os +import pytest +from dask.dataframe import _dask_expr_enabled from dask.dataframe.utils import assert_eq as 
_assert_eq +SKIPIF_DASK_EXPR_ENABLED = pytest.mark.skipif( + _dask_expr_enabled(), + reason="Predicate pushdown & column projection should be handled implicitly by dask-expr", +) + # use distributed client for testing if it's available scheduler = ( "distributed" From fc4a248e967a524ed1f6e14cb2e9d46b3bccef06 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:05:14 -0700 Subject: [PATCH 05/38] Make sure partition_borders is computed in limit map_partitions func --- dask_sql/physical/rel/logical/limit.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 9bd2be562..805ad69ba 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -58,6 +58,7 @@ def _apply_limit(self, df: dd.DataFrame, limit: int, offset: int) -> dd.DataFram # check if the first partition contains our desired window if ( dask_config.get("sql.limit.check-first-partition") + and not dd._dask_expr_enabled() and all( [ isinstance( @@ -79,6 +80,10 @@ def _apply_limit(self, df: dd.DataFrame, limit: int, offset: int) -> dd.DataFram def limit_partition_func(df, partition_borders, partition_info=None): """Limit the partition to values contained within the specified window, returning an empty dataframe if there are none""" + # with dask-expr we may need to explicitly compute here + if hasattr(partition_borders, "compute"): + partition_borders = partition_borders.compute() + # TODO: remove the `cumsum` call here when dask#9067 is resolved partition_borders = partition_borders.cumsum().to_dict() partition_index = ( From c3d8efe27cd593c5d1fc588a26481b8e8d4a0c46 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:09:01 -0700 Subject: [PATCH 06/38] Skip intake tests for now --- tests/integration/test_filter.py | 6 +++--- tests/integration/test_intake.py | 7 ++++++- tests/integration/test_select.py | 4 ++-- tests/unit/test_utils.py | 16 ++++++++-------- tests/utils.py | 21 ++++++++++++++++----- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py index 22947d8b6..f25171b97 100644 --- a/tests/integration/test_filter.py +++ b/tests/integration/test_filter.py @@ -5,7 +5,7 @@ from dask.utils_test import hlg_layer from packaging.version import parse as parseVersion -from tests.utils import SKIPIF_DASK_EXPR_ENABLED, assert_eq +from tests.utils import assert_eq, skipif_dask_expr_enabled DASK_GT_2022_4_2 = parseVersion(dask.__version__) >= parseVersion("2022.4.2") @@ -208,7 +208,7 @@ def test_filter_year(c): ), ], ) -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown(c, parquet_ddf, query, df_func, filters): # Check for predicate pushdown. 
@@ -313,7 +313,7 @@ def test_filter_decimal(c, gpu): c.drop_table("df") -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_isna(tmpdir): from dask_sql.context import Context diff --git a/tests/integration/test_intake.py b/tests/integration/test_intake.py index 365b89f46..ebfd8dfed 100644 --- a/tests/integration/test_intake.py +++ b/tests/integration/test_intake.py @@ -6,7 +6,12 @@ import pytest from dask_sql.context import Context -from tests.utils import assert_eq +from tests.utils import assert_eq, skipif_dask_expr_enabled + +# intake doesn't yet have proper dask-expr support +pytestmark = skipif_dask_expr_enabled( + reason="Intake doesn't yet have proper dask-expr support" +) # skip the test if intake is not installed intake = pytest.importorskip("intake") diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index bcd4b401f..d79583d51 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -6,7 +6,7 @@ from dask_sql._compat import PANDAS_GT_200 from dask_sql.utils import ParsingException -from tests.utils import SKIPIF_DASK_EXPR_ENABLED, assert_eq +from tests.utils import assert_eq, skipif_dask_expr_enabled def test_select(c, df): @@ -259,7 +259,7 @@ def test_singular_column_selection(c): ["a", "b", "d"], ], ) -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_multiple_column_projection(c, parquet_ddf, input_cols): projection_list = ", ".join(input_cols) result_df = c.sql(f"SELECT {projection_list} from parquet_ddf") diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 1df49868f..de4702f85 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -5,7 +5,7 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown from dask_sql.utils import Pluggable, is_frame -from tests.utils import SKIPIF_DASK_EXPR_ENABLED +from tests.utils import skipif_dask_expr_enabled def test_is_frame_for_frame(): @@ -57,7 +57,7 @@ def test_overwrite(): assert PluginTest1().get_plugin("some_key") == "value_2" -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_simple(parquet_ddf): filtered_df = parquet_ddf[parquet_ddf["a"] > 1] pushdown_df = attempt_predicate_pushdown(filtered_df) @@ -70,7 +70,7 @@ def test_predicate_pushdown_simple(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_logical(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) | (parquet_ddf["a"] == -1) @@ -86,7 +86,7 @@ def test_predicate_pushdown_logical(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_in(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) @@ -107,7 +107,7 @@ def test_predicate_pushdown_in(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_isna(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) @@ -128,7 +128,7 @@ def test_predicate_pushdown_isna(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_add_filters(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df = attempt_predicate_pushdown( @@ -147,7 +147,7 @@ def 
test_predicate_pushdown_add_filters(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_add_filters_no_extract(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df = attempt_predicate_pushdown( @@ -164,7 +164,7 @@ def test_predicate_pushdown_add_filters_no_extract(parquet_ddf): assert got_filters == expected_filters -@SKIPIF_DASK_EXPR_ENABLED +@skipif_dask_expr_enabled() def test_predicate_pushdown_add_filters_no_preserve(parquet_ddf): filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] pushdown_df0 = attempt_predicate_pushdown(filtered_df) diff --git a/tests/utils.py b/tests/utils.py index 45020191f..6043accce 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,11 +4,6 @@ from dask.dataframe import _dask_expr_enabled from dask.dataframe.utils import assert_eq as _assert_eq -SKIPIF_DASK_EXPR_ENABLED = pytest.mark.skipif( - _dask_expr_enabled(), - reason="Predicate pushdown & column projection should be handled implicitly by dask-expr", -) - # use distributed client for testing if it's available scheduler = ( "distributed" @@ -40,3 +35,19 @@ def convert_nullable_columns(df): df[selected_cols] = df[selected_cols].astype(dtypes_mapping[dtype]) return df + + +def skipif_dask_expr_enabled(reason=None): + """ + Skip the test if dask-expr is enabled + """ + # most common reason for skipping + if reason is None: + reason = ( + "Predicate pushdown & column projection should be handled implicitly by dask-expr", + ) + + return pytest.mark.skipif( + _dask_expr_enabled(), + reason=reason, + ) From 4f1747c4983c8f14b76d79ed15f96e0dc3e1bb70 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:15:58 -0700 Subject: [PATCH 07/38] Simplify cross join logic to avoid internal graph manipulation --- dask_sql/physical/rel/logical/join.py | 42 ++++----------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index 06bb34ca3..c4e4fe759 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -6,8 +6,6 @@ import dask.dataframe as dd from dask import config as dask_config -from dask.base import tokenize -from dask.highlevelgraph import HighLevelGraph from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -132,41 +130,11 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # TODO: we should implement a shortcut # for filter conditions that are always false - def merge_single_partitions(lhs_partition, rhs_partition): - # Do a cross join with the two partitions - # TODO: it would be nice to apply the filter already here - # problem: this would mean we need to ship the rex to the - # workers (as this is executed on the workers), - # which is definitely not possible (java dependency, JVM start...) 
- lhs_partition = lhs_partition.assign(common=1) - rhs_partition = rhs_partition.assign(common=1) - - return lhs_partition.merge(rhs_partition, on="common").drop( - columns="common" - ) - - # Iterate nested over all partitions from lhs and rhs and merge them - name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed) - dsk = { - (name, i * df_rhs_renamed.npartitions + j): ( - merge_single_partitions, - (df_lhs_renamed._name, i), - (df_rhs_renamed._name, j), - ) - for i in range(df_lhs_renamed.npartitions) - for j in range(df_rhs_renamed.npartitions) - } - - graph = HighLevelGraph.from_collections( - name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed] - ) - - meta = dd.dispatch.concat( - [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1 - ) - # TODO: Do we know the divisions in any way here? - divisions = [None] * (len(dsk) + 1) - df = dd.DataFrame(graph, name, meta=meta, divisions=divisions) + df = dd.merge( + df_lhs_renamed.assign(common=1), + df_rhs_renamed.assign(common=1), + on="common", + ).drop(columns="common") warnings.warn( "Need to do a cross-join, which is typically very resource heavy", From e50e30bee71b12127345d83391f33bdf22d614e4 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:16:14 -0700 Subject: [PATCH 08/38] Round trip timeseries fixture to pandas to avoid dask-expr bug --- tests/integration/fixtures.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index 669260215..ae596cb80 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -120,7 +120,15 @@ def datetime_table(): @pytest.fixture() def timeseries(): - return dd_timeseries(freq="1d").reset_index(drop=True) + df = dd_timeseries(freq="1d").reset_index(drop=True) + + # dask-expr doesn't play well with the timeseries dataset + # roundtripping through pandas seems to help + # see: https://github.com/dask/dask-expr/issues/1013 + if dd._dask_expr_enabled(): + df = dd.from_pandas(df.compute()) + + return df @pytest.fixture() From 2d5c8c779edeaaf6735dffeead6bb8aca0a93dfe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:18:39 -0700 Subject: [PATCH 09/38] Fix skipif_dask_expr_enabled marker --- tests/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 6043accce..0cb27ba90 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -43,9 +43,7 @@ def skipif_dask_expr_enabled(reason=None): """ # most common reason for skipping if reason is None: - reason = ( - "Predicate pushdown & column projection should be handled implicitly by dask-expr", - ) + reason = "Predicate pushdown & column projection should be handled implicitly by dask-expr" return pytest.mark.skipif( _dask_expr_enabled(), From 2fe2e3c9791ac8ad4123d33b9f2e16682e0cceac Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:58:25 -0700 Subject: [PATCH 10/38] Ignore warnings around mismatched dtypes in joins --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index fd0358289..02f7064d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,5 +102,6 @@ filterwarnings = [ "ignore:Need to do a cross-join:ResourceWarning:dask_sql[.*]", "ignore:Dask doesn't support Dask 
frames:ResourceWarning:dask_sql[.*]", "ignore:Running on a single-machine scheduler:UserWarning:dask[.*]", + "ignore:Merging dataframes with merge column data type mismatches:UserWarning:dask[.*]", ] xfail_strict = true From a795526c633ef4801c42d8b939cbb81ed82245fe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:00:38 -0700 Subject: [PATCH 11/38] Add handling for dask-expr to test_broadcast_join --- tests/integration/test_join.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index e6257ca02..7c453303f 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -1,3 +1,5 @@ +from contextlib import nullcontext + import dask.dataframe as dd import numpy as np import pandas as pd @@ -523,6 +525,30 @@ def test_join_reorder(c): assert_eq(result_df, expected_df, check_index=False) +def check_broadcast_join(df, val, raises=False): + """ + Check that the broadcast join is correctly set in the Dask layer or expression graph + + Parameters + ---------- + df : DataFrame + The DataFrame to check + val : bool or float + The expected value of the broadcast join + raises : bool, optional + Whether the legacy Dask check should raise an error if the broadcast join is not set + """ + if dd._dask_expr_enabled(): + from dask_expr._merge import Merge + + merge_ops = [op for op in df.expr.find_operations(Merge)] + assert len(merge_ops) == 1 + assert merge_ops[0].broadcast == val + else: + with pytest.raises(KeyError) if raises else nullcontext(): + assert hlg_layer(df.dask, "bcast-join") + + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_broadcast_join(c, client, gpu): df1 = dd.from_pandas( @@ -545,7 +571,7 @@ def test_broadcast_join(c, client, gpu): expected_df = df1.merge(df2, on="user_id", how="inner") res_df = c.sql(query_string, config_options={"sql.join.broadcast": True}) - assert hlg_layer(res_df.dask, "bcast-join") + check_broadcast_join(res_df, True) assert_eq( res_df, expected_df, @@ -555,7 +581,7 @@ def test_broadcast_join(c, client, gpu): ) res_df = c.sql(query_string, config_options={"sql.join.broadcast": 1.0}) - assert hlg_layer(res_df.dask, "bcast-join") + check_broadcast_join(res_df, 1.0) assert_eq( res_df, expected_df, @@ -565,18 +591,15 @@ def test_broadcast_join(c, client, gpu): ) res_df = c.sql(query_string, config_options={"sql.join.broadcast": 0.5}) - with pytest.raises(KeyError): - hlg_layer(res_df.dask, "bcast-join") + check_broadcast_join(res_df, 0.5, raises=True) assert_eq(res_df, expected_df, check_index=False, scheduler="distributed") res_df = c.sql(query_string, config_options={"sql.join.broadcast": False}) - with pytest.raises(KeyError): - hlg_layer(res_df.dask, "bcast-join") + check_broadcast_join(res_df, False, raises=True) assert_eq(res_df, expected_df, check_index=False, scheduler="distributed") res_df = c.sql(query_string, config_options={"sql.join.broadcast": None}) - with pytest.raises(KeyError): - hlg_layer(res_df.dask, "bcast-join") + check_broadcast_join(res_df, None, raises=True) assert_eq(res_df, expected_df, check_index=False, scheduler="distributed") From 8938a4a58dca9d668d277aec14a1d524e51afd39 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:23:02 -0700 Subject: [PATCH 12/38] Skip parquet stats tests for now --- 
tests/unit/test_statistics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/test_statistics.py b/tests/unit/test_statistics.py index 815e561fb..7c9e705b2 100644 --- a/tests/unit/test_statistics.py +++ b/tests/unit/test_statistics.py @@ -5,6 +5,12 @@ from dask_sql import Context from dask_sql.datacontainer import Statistics from dask_sql.physical.utils.statistics import parquet_statistics +from tests.utils import skipif_dask_expr_enabled + +# TODO: add support for parquet statistics with dask-expr +pytestmark = skipif_dask_expr_enabled( + reason="Parquet statistics not yet supported with dask-expr" +) @pytest.mark.parametrize("parallel", [None, False, 2]) From 6305a17edbabd7605a8640e208ca18cccc5a63ed Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:29:01 -0700 Subject: [PATCH 13/38] Skip DPP tests on dask-expr for now --- tests/unit/test_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index aad045656..2cd594fbb 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -11,6 +11,7 @@ # Required to instantiate default sql config import dask_sql # noqa: F401 from dask_sql import Context +from tests.utils import skipif_dask_expr_enabled def test_custom_yaml(tmpdir): @@ -106,6 +107,7 @@ def test_dask_setconfig(): sys.version_info < (3, 10), reason="Writing and reading the Dask DataFrame causes a ProtocolError", ) +@skipif_dask_expr_enabled("dynamic partition pruning not yet supported with dask-expr") def test_dynamic_partition_pruning(tmpdir): c = Context() @@ -155,6 +157,7 @@ def test_dynamic_partition_pruning(tmpdir): assert inlist_expr in explain_string +@skipif_dask_expr_enabled("dynamic partition pruning not yet supported with dask-expr") def test_dpp_single_file_parquet(tmpdir): c = Context() From 76b55e32d847638e5fdb787ad08f46fe142b89c0 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:47:03 -0700 Subject: [PATCH 14/38] Pass ddf object as meta for test_describe_model --- tests/integration/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 4ef441f23..0c171ace3 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -501,12 +501,12 @@ def test_describe_model(c): .apply(lambda x: str(x)) .sort_index() ) - # test - result = c.sql("DESCRIBE MODEL ex_describe_model")["Params"].apply( - lambda x: str(x), meta=("Params", "object") + actual_series = c.sql("DESCRIBE MODEL ex_describe_model") + actual_series = actual_series["Params"].apply( + lambda x: str(x), meta=actual_series["Params"] ) - assert_eq(expected_series, result) + assert_eq(expected_series, actual_series) with pytest.raises(RuntimeError): c.sql("DESCRIBE MODEL undefined_model") From 876d28260a684381f0d4013e45bfd9a92d9f5aa8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Apr 2024 11:28:22 -0700 Subject: [PATCH 15/38] Add dask-expr handling to test_sort_topk --- tests/integration/test_sort.py | 48 +++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_sort.py b/tests/integration/test_sort.py index 1956a3bce..0b9428d4f 100644 --- a/tests/integration/test_sort.py +++ b/tests/integration/test_sort.py @@ -353,6 
+353,28 @@ def test_sort_by_old_alias(c, input_table_1, request): ] +def check_sort_topk(df, layer, contains=True): + if dd._dask_expr_enabled(): + from dask_expr._reductions import NLargest, NSmallest + + if layer == "nsmallest": + assert len(list(df.expr.find_operations(NSmallest))) == ( + 1 if contains else 0 + ) + elif layer == "nlargest": + assert len(list(df.expr.find_operations(NLargest))) == ( + 1 if contains else 0 + ) + else: + assert False + else: + assert ( + any([layer in key for key in df.dask.layers.keys()]) + if contains + else all([layer not in key for key in df.dask.layers.keys()]) + ) + + @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_sort_topk(gpu): c = Context() @@ -366,7 +388,7 @@ def test_sort_topk(gpu): c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu) df_result = c.sql("""SELECT * FROM df ORDER BY a LIMIT 10""") - assert any(["nsmallest" in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nsmallest", True) assert_eq( df_result, pd.DataFrame( @@ -380,7 +402,7 @@ def test_sort_topk(gpu): ) df_result = c.sql("""SELECT * FROM df ORDER BY a, b LIMIT 10""") - assert any(["nsmallest" in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nsmallest", True) assert_eq( df_result, pd.DataFrame({"a": [1.0] * 10, "b": [1] * 10, "c": ["a"] * 10}), @@ -390,7 +412,7 @@ def test_sort_topk(gpu): df_result = c.sql( """SELECT * FROM df ORDER BY a DESC NULLS LAST, b DESC NULLS LAST LIMIT 10""" ) - assert any(["nlargest" in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nlargest", True) assert_eq( df_result, pd.DataFrame({"a": [1.0] * 10, "b": [3] * 10, "c": ["c"] * 10}), @@ -400,8 +422,8 @@ def test_sort_topk(gpu): # String column nlargest/smallest not supported for pandas df_result = c.sql("""SELECT * FROM df ORDER BY c LIMIT 10""") if not gpu: - assert all(["nlargest" not in key for key in df_result.dask.layers.keys()]) - assert all(["nsmallest" not in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nsmallest", False) + check_sort_topk(df_result, "nlargest", False) else: assert_eq( df_result, @@ -413,24 +435,24 @@ def test_sort_topk(gpu): df_result = c.sql( """SELECT * FROM df ORDER BY a DESC, b DESC NULLS LAST LIMIT 10""" ) - assert all(["nlargest" not in key for key in df_result.dask.layers.keys()]) - assert all(["nsmallest" not in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nlargest", False) + check_sort_topk(df_result, "nsmallest", False) # Assert optimization isn't applied for mixed asc + desc sort df_result = c.sql("""SELECT * FROM df ORDER BY a, b DESC NULLS LAST LIMIT 10""") - assert all(["nlargest" not in key for key in df_result.dask.layers.keys()]) - assert all(["nsmallest" not in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nlargest", False) + check_sort_topk(df_result, "nsmallest", False) # Assert optimization isn't applied when the number of requested elements # exceed topk-nelem-limit config value # Default topk-nelem-limit is 1M and 334k*3columns takes it above this limit df_result = c.sql("""SELECT * FROM df ORDER BY a, b LIMIT 333334""") - assert all(["nlargest" not in key for key in df_result.dask.layers.keys()]) - assert all(["nsmallest" not in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nlargest", False) + check_sort_topk(df_result, "nsmallest", False) df_result = c.sql( """SELECT * FROM df ORDER BY a, 
b LIMIT 10""", config_options={"sql.sort.topk-nelem-limit": 29}, ) - assert all(["nlargest" not in key for key in df_result.dask.layers.keys()]) - assert all(["nsmallest" not in key for key in df_result.dask.layers.keys()]) + check_sort_topk(df_result, "nlargest", False) + check_sort_topk(df_result, "nsmallest", False) From 1e72f33a1066c24592fe894380d2846efac62b5c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:00:12 -0700 Subject: [PATCH 16/38] Avoid using Dask graph internals for random functions --- dask_sql/physical/rex/core/call.py | 38 ++++++++++-------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index ae17b2027..6fd3180f5 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -11,9 +11,6 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask.base import tokenize -from dask.dataframe.core import Series -from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data from dask_sql._datafusion_lib import SqlTypeName @@ -828,37 +825,28 @@ def random_function(self, partition, random_state, kwargs): def random_frame(self, seed: int, dc: DataContainer, **kwargs) -> dd.Series: """This function - in contrast to others in this module - will only ever be called on data frames""" - - random_state = np.random.RandomState(seed=seed) - - # Idea taken from dask.DataFrame.sample: - # initialize a random state for each of the partitions - # separately and then create a random series - # for each partition df = dc.df - name = "sample-" + tokenize(df, random_state) - - state_data = random_state_data(df.npartitions, random_state) - dsk = { - (name, i): ( - self.random_function, - (df._name, i), - np.random.RandomState(state), - kwargs, + state_data = random_state_data(df.npartitions, np.random.RandomState(seed=seed)) + + def random_partition_func(df, state_data, partition_info=None): + """Create a random number for each partition""" + partition_index = ( + partition_info["number"] if partition_info is not None else 0 ) - for i, state in enumerate(state_data) - } - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df]) - random_series = Series(graph, name, ("random", "float64"), df.divisions) + state = np.random.RandomState(state_data[partition_index]) + return self.random_function(df, state, kwargs) + + random_series = df.map_partitions( + random_partition_func, state_data, meta=("random", "float64") + ) # This part seems to be stupid, but helps us do a very simple # task without going into the (private) internals of Dask: # copy all meta information from the original input dataframe # This is important so that the returned series looks # exactly like coming from the input dataframe - return_df = df.assign(random=random_series)["random"] - return return_df + return df.assign(random=random_series)["random"] class RandOperation(BaseRandomOperation): From fddf52f7df1908f660f15c5130e48939399df059 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 07:33:01 -0700 Subject: [PATCH 17/38] Skip over window count tests for now --- tests/integration/test_compatibility.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index e9d372c88..957ddbdbd 100644 --- 
a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -19,7 +19,7 @@ from dask_sql import Context from dask_sql.utils import ParsingException -from tests.utils import assert_eq, convert_nullable_columns +from tests.utils import assert_eq, convert_nullable_columns, skipif_dask_expr_enabled def eq_sqlite(sql, **dfs): @@ -813,6 +813,7 @@ def test_window_min_max_partition_by(): ) +@skipif_dask_expr_enabled() def test_window_count(): for func in ["COUNT"]: a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) @@ -863,6 +864,7 @@ def test_window_count(): ) +@skipif_dask_expr_enabled() def test_window_count_partition_by(): for func in ["COUNT"]: a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) From 72dd6abbd069f2bd70e143baf5d67e869cebcdd5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 09:31:51 -0700 Subject: [PATCH 18/38] Skip test_over_calls and test_over_with_windows --- tests/integration/test_over.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_over.py b/tests/integration/test_over.py index be53817e9..f089497bc 100644 --- a/tests/integration/test_over.py +++ b/tests/integration/test_over.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from tests.utils import assert_eq +from tests.utils import assert_eq, skipif_dask_expr_enabled def test_over_with_sorting(c, user_table_1): @@ -76,6 +76,7 @@ def test_over_with_different(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) +@skipif_dask_expr_enabled() def test_over_calls(c, user_table_1): return_df = c.sql( """ @@ -139,6 +140,7 @@ def test_over_single_value(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) +@skipif_dask_expr_enabled() def test_over_with_windows(c): tmp_df = pd.DataFrame({"a": range(5)}) c.create_table("tmp", tmp_df) From 93bc5dcdeb80e49c9c6048d9c594b8fb33daf640 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 09:58:07 -0700 Subject: [PATCH 19/38] Update timeseries fixture comment to acknowledge fix --- tests/integration/fixtures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index ae596cb80..c1a5017d6 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -122,8 +122,7 @@ def datetime_table(): def timeseries(): df = dd_timeseries(freq="1d").reset_index(drop=True) - # dask-expr doesn't play well with the timeseries dataset - # roundtripping through pandas seems to help + # TODO: remove this roundtrip once fix for dask-expr#1013 is released # see: https://github.com/dask/dask-expr/issues/1013 if dd._dask_expr_enabled(): df = dd.from_pandas(df.compute()) From 5cc7cc572fd21f04487a26e10b47fd87d445fea2 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:00:12 -0700 Subject: [PATCH 20/38] More detailed messages for window test skips --- tests/integration/test_compatibility.py | 6 ++++-- tests/integration/test_over.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index 957ddbdbd..8b7fdfef4 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -813,7 +813,8 @@ def 
test_window_min_max_partition_by(): ) -@skipif_dask_expr_enabled() +# TODO: investigate source of window count deadlocks +@skipif_dask_expr_enabled("Deadlocks with query planning enabled") def test_window_count(): for func in ["COUNT"]: a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) @@ -864,7 +865,8 @@ def test_window_count(): ) -@skipif_dask_expr_enabled() +# TODO: investigate source of window count deadlocks +@skipif_dask_expr_enabled("Deadlocks with query planning enabled") def test_window_count_partition_by(): for func in ["COUNT"]: a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50)) diff --git a/tests/integration/test_over.py b/tests/integration/test_over.py index f089497bc..45b8c888d 100644 --- a/tests/integration/test_over.py +++ b/tests/integration/test_over.py @@ -76,7 +76,8 @@ def test_over_with_different(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) -@skipif_dask_expr_enabled() +# TODO: investigate source of window count deadlocks +@skipif_dask_expr_enabled("Deadlocks with query planning enabled") def test_over_calls(c, user_table_1): return_df = c.sql( """ @@ -140,7 +141,8 @@ def test_over_single_value(c, user_table_1): assert_eq(return_df, expected_df, check_dtype=False, check_index=False) -@skipif_dask_expr_enabled() +# TODO: investigate source of window count deadlocks +@skipif_dask_expr_enabled("Deadlocks with query planning enabled") def test_over_with_windows(c): tmp_df = pd.DataFrame({"a": range(5)}) c.create_table("tmp", tmp_df) From 9ba3f0aa03d03d7fb91c5cb31f9f113e2d067e51 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:02:37 -0700 Subject: [PATCH 21/38] Skip test_join_alias_w_projection for now --- tests/integration/test_join.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index 7c453303f..c99c30dc1 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -8,7 +8,7 @@ from dask_sql import Context from dask_sql.datacontainer import Statistics -from tests.utils import assert_eq +from tests.utils import assert_eq, skipif_dask_expr_enabled def test_join(c): @@ -427,6 +427,9 @@ def test_intersect_multi_col(c): assert_eq(return_df, expected_df, check_index=False) +# TODO: remove this marker once fix for dask-expr#1018 is released +# see: https://github.com/dask/dask-expr/issues/1018 +@skipif_dask_expr_enabled("Waiting for fix to dask-expr#1018") def test_join_alias_w_projection(c, parquet_ddf): result_df = c.sql( "SELECT t2.c as c_y from parquet_ddf t1, parquet_ddf t2 WHERE t1.a=t2.a and t1.c='A'" From ffd695c7cb7a4291f2705e9b9bdfde3dec98710c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:34:09 -0700 Subject: [PATCH 22/38] Un-xfail test_xgboost_training_prediction on win32 --- tests/integration/test_model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 0c171ace3..24ec7719e 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -93,10 +93,6 @@ def test_training_and_prediction(c, gpu_client): check_trained_model(c, df_name=timeseries) -@pytest.mark.xfail( - sys.platform == "win32", - reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only", -) @pytest.mark.xfail( sys.platform == "darwin", reason="Intermittent socket errors on 
macOS", strict=False ) From 73d32a2bcdcfddbb16cf82ebfe55f43985270bcf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:26:23 -0700 Subject: [PATCH 23/38] Windows failures are still intermittent --- tests/integration/test_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 24ec7719e..4aecc7034 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -94,7 +94,9 @@ def test_training_and_prediction(c, gpu_client): @pytest.mark.xfail( - sys.platform == "darwin", reason="Intermittent socket errors on macOS", strict=False + sys.platform in ("darwin", "win32"), + reason="Intermittent failures on macOS/Windows", + strict=False, ) @pytest.mark.parametrize( "gpu_client", [False, pytest.param(True, marks=pytest.mark.gpu)], indirect=True From 35aa2259529f30d2b8e80ecb3a9854f8b12c2b9d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:30:04 -0700 Subject: [PATCH 24/38] Bump rust to 1.73 to circumvent conda sha256 errors --- Cargo.toml | 2 +- continuous_integration/recipe/conda_build_config.yaml | 2 +- docs/environment.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f362039f6..7eca21971 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ description = "Bindings for DataFusion used by Dask-SQL" readme = "README.md" license = "Apache-2.0" edition = "2021" -rust-version = "1.72" +rust-version = "1.73" include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] [dependencies] diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index 2a523c954..16ee3f47a 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -5,7 +5,7 @@ c_compiler_version: rust_compiler: - rust rust_compiler_version: - - '1.72' + - '1.73' maturin: - '1.3' xz: # [linux64] diff --git a/docs/environment.yml b/docs/environment.yml index 13befb971..84dec6fc9 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -19,4 +19,4 @@ dependencies: - pygments>=2.7.1 - tabulate - ucx-proc=*=cpu - - rust>=1.72 + - rust>=1.73 From 8e6dc0523053f6d2b08a4c8ef7cb67f8b1ef77a2 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:47:56 -0700 Subject: [PATCH 25/38] Disable query planning in GPU CI for now --- continuous_integration/gpuci/build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 156a945f0..90225a751 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -23,6 +23,8 @@ cd "$WORKSPACE" # Determine CUDA release version export CUDA_REL=${CUDA_VERSION%.*} +export DASK_DATAFRAME__QUERY_PLANNING=false + ################################################################################ # SETUP - Check environment ################################################################################ @@ -61,4 +63,4 @@ conda config --show-sources conda list --show-channel-urls rapids-logger "Python py.test for dask-sql" -py.test $WORKSPACE -n 4 -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" 
--cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term +py.test $WORKSPACE -n $PARALLEL_LEVEL -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term From 9563b8917fcd0b614127f5462dd3bad5ced28141 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:01:13 -0700 Subject: [PATCH 26/38] Revert "Bump rust to 1.73 to circumvent conda sha256 errors" This reverts commit 35aa2259529f30d2b8e80ecb3a9854f8b12c2b9d. --- Cargo.toml | 2 +- continuous_integration/recipe/conda_build_config.yaml | 2 +- docs/environment.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7eca21971..f362039f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ description = "Bindings for DataFusion used by Dask-SQL" readme = "README.md" license = "Apache-2.0" edition = "2021" -rust-version = "1.73" +rust-version = "1.72" include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] [dependencies] diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index 16ee3f47a..2a523c954 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -5,7 +5,7 @@ c_compiler_version: rust_compiler: - rust rust_compiler_version: - - '1.73' + - '1.72' maturin: - '1.3' xz: # [linux64] diff --git a/docs/environment.yml b/docs/environment.yml index 84dec6fc9..13befb971 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -19,4 +19,4 @@ dependencies: - pygments>=2.7.1 - tabulate - ucx-proc=*=cpu - - rust>=1.73 + - rust>=1.72 From b7ebab9d4737c82e0928657196a3db2e7b64262d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:04:41 -0700 Subject: [PATCH 27/38] Use older conda-build version to try and resolve build issues --- .github/workflows/conda.yml | 2 +- continuous_integration/gpuci/build.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 549b3411a..e59b18dc0 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -76,7 +76,7 @@ jobs: channel-priority: strict - name: Install dependencies run: | - mamba install -c conda-forge boa conda-verify + mamba install -c conda-forge boa "conda-build<24.3" conda-verify which python pip list diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 90225a751..1683a866e 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -23,6 +23,7 @@ cd "$WORKSPACE" # Determine CUDA release version export CUDA_REL=${CUDA_VERSION%.*} +# TODO: remove once RAPIDS 24.06 has support for query planning export DASK_DATAFRAME__QUERY_PLANNING=false ################################################################################ From 9f3248263e44c599bb8d716f2bbd4df1140effdf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:11:12 -0700 Subject: [PATCH 28/38] Pin to an older version of conda-build and boa --- .github/workflows/conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index e59b18dc0..e6255b5d2 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -76,7 +76,7 @@ jobs: channel-priority: strict - name: Install dependencies run: | - mamba install -c conda-forge boa "conda-build<24.3" conda-verify + mamba install -c conda-forge "boa<0.17" "conda-build<24.1" conda-verify which python pip list From 207803b906ea7b77753f424e5e2e9bf8627517df Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 11 Apr 2024 08:05:01 -0700 Subject: [PATCH 29/38] Skip deadlocking xgboost test on GPU --- tests/integration/test_model.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 4aecc7034..fed960fa0 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -93,13 +93,21 @@ def test_training_and_prediction(c, gpu_client): check_trained_model(c, df_name=timeseries) +# TODO: investigate deadlocks on GPU @pytest.mark.xfail( sys.platform in ("darwin", "win32"), reason="Intermittent failures on macOS/Windows", strict=False, ) @pytest.mark.parametrize( - "gpu_client", [False, pytest.param(True, marks=pytest.mark.gpu)], indirect=True + "gpu_client", + [ + False, + pytest.param( + True, marks=(pytest.mark.gpu, pytest.mark.skip(reason="Deadlocks on GPU")) + ), + ], + indirect=True, ) def test_xgboost_training_prediction(c, gpu_client): gpu = "CUDA" in str(gpu_client.cluster) From 6cbe5a9c15fb8b65eb0cbf59c4742878f9420ed1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 12 Apr 2024 13:26:11 -0700 Subject: [PATCH 30/38] Add subset of testing with query planning disabled --- .github/workflows/test-upstream.yml | 19 +++++++++++++++++++ .github/workflows/test.yml | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index 91bd80604..4dc4b2c5f 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -16,20 +16,33 @@ jobs: env: CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }} + DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] python: ["3.9", "3.10", "3.11", "3.12"] distributed: [false] + query-planning: [true] include: # run tests on a distributed client - os: "ubuntu-latest" python: "3.9" distributed: true + query-planning: true - os: "ubuntu-latest" python: "3.11" distributed: true + query-planning: true + # run tests with query planning disabled + - os: "ubuntu-latest" + python: "3.9" + distributed: false + query-planning: false + - os: "ubuntu-latest" + python: "3.11" + distributed: false + query-planning: false steps: - uses: actions/checkout@v4 with: @@ -74,6 +87,10 @@ jobs: import-dev: name: "Test importing with bare requirements and upstream dev" runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + query-planning: [true, false] steps: - uses: actions/checkout@v4 - name: Set up Python @@ -93,8 +110,10 @@ jobs: - name: Install upstream dev Dask run: | python -m pip install git+https://github.com/dask/dask + python -m pip install git+https://github.com/dask/dask-expr python -m pip install git+https://github.com/dask/distributed - name: Try 
to import dask-sql + env: DASK_DATAFRAME_QUERY_PLANNING=${{ matrix.query-planning }} run: | python -c "import dask_sql; print('ok')" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ef1398881..9679ee2cf 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -39,20 +39,33 @@ jobs: env: CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }} + DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] python: ["3.9", "3.10", "3.11", "3.12"] distributed: [false] + query-planning: [true] include: # run tests on a distributed client - os: "ubuntu-latest" python: "3.9" distributed: true + query-planning: true - os: "ubuntu-latest" python: "3.11" distributed: true + query-planning: true + # run tests with query planning disabled + - os: "ubuntu-latest" + python: "3.9" + distributed: false + query-planning: false + - os: "ubuntu-latest" + python: "3.11" + distributed: false + query-planning: false steps: - uses: actions/checkout@v4 - name: Set up Python @@ -99,6 +112,10 @@ jobs: name: "Test importing with bare requirements" needs: [detect-ci-trigger] runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + query-planning: [true, false] steps: - uses: actions/checkout@v4 - name: Set up Python @@ -119,7 +136,9 @@ jobs: if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | python -m pip install git+https://github.com/dask/dask + python -m pip install git+https://github.com/dask/dask-expr python -m pip install git+https://github.com/dask/distributed - name: Try to import dask-sql + env: DASK_DATAFRAME_QUERY_PLANNING=${{ matrix.query-planning }} run: | python -c "import dask_sql; print('ok')" From 5a279b8cedd67fd1912ab1cb9ebaec9d3018cdf0 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 12 Apr 2024 13:39:15 -0700 Subject: [PATCH 31/38] Add query-planning to job names --- .github/workflows/test-upstream.yml | 4 ++-- .github/workflows/test.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index 4dc4b2c5f..d2165f276 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -11,7 +11,7 @@ defaults: jobs: test-dev: - name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})" + name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})" runs-on: ${{ matrix.os }} env: CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml @@ -85,7 +85,7 @@ jobs: path: test-${{ matrix.os }}-py${{ matrix.python }}-results.jsonl import-dev: - name: "Test importing with bare requirements and upstream dev" + name: "Test importing with bare requirements and upstream dev (query-planning: ${{ matrix.query-planning }})" runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9679ee2cf..9489cab96 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -33,7 +33,7 @@ jobs: keyword: "[test-upstream]" test: - name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})" + name: "Build & Test (${{ matrix.os }}, python: 
${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})" needs: [detect-ci-trigger] runs-on: ${{ matrix.os }} env: @@ -109,7 +109,7 @@ jobs: uses: codecov/codecov-action@v3 import: - name: "Test importing with bare requirements" + name: "Test importing with bare requirements (query-planning: ${{ matrix.query-planning }})" needs: [detect-ci-trigger] runs-on: ubuntu-latest strategy: From 7ce1cbf4a40384a948f96a5e25bf6169e776b8ea Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 12 Apr 2024 13:40:31 -0700 Subject: [PATCH 32/38] Fix syntax errors --- .github/workflows/test-upstream.yml | 3 ++- .github/workflows/test.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index d2165f276..2864f4749 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -113,7 +113,8 @@ jobs: python -m pip install git+https://github.com/dask/dask-expr python -m pip install git+https://github.com/dask/distributed - name: Try to import dask-sql - env: DASK_DATAFRAME_QUERY_PLANNING=${{ matrix.query-planning }} + env: + DASK_DATAFRAME_QUERY_PLANNING: ${{ matrix.query-planning }} run: | python -c "import dask_sql; print('ok')" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9489cab96..fd599bd50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -139,6 +139,7 @@ jobs: python -m pip install git+https://github.com/dask/dask-expr python -m pip install git+https://github.com/dask/distributed - name: Try to import dask-sql - env: DASK_DATAFRAME_QUERY_PLANNING=${{ matrix.query-planning }} + env: + DASK_DATAFRAME_QUERY_PLANNING: ${{ matrix.query-planning }} run: | python -c "import dask_sql; print('ok')" From 6d762a5472b04565524d3d5680b67336af93d297 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:17:29 -0700 Subject: [PATCH 33/38] Add dask-expr to CI environments, bump to pandas 2 --- continuous_integration/environment-3.10.yaml | 3 +- continuous_integration/environment-3.11.yaml | 3 +- continuous_integration/environment-3.12.yaml | 3 +- continuous_integration/environment-3.9.yaml | 3 +- .../gpuci/environment-3.10.yaml | 3 +- .../gpuci/environment-3.9.yaml | 3 +- dask_sql/_compat.py | 5 --- dask_sql/physical/rel/logical/window.py | 32 ++++++------------- tests/integration/test_select.py | 3 +- 9 files changed, 22 insertions(+), 36 deletions(-) diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 30fd86a8f..968dd5787 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -4,6 +4,7 @@ channels: dependencies: - c-compiler - dask>=2024.1.1 +- dask-expr>=0.4.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -14,7 +15,7 @@ dependencies: - mlflow>=2.9 - mock - numpy>=1.22.4 -- pandas>=1.4.0 +- pandas>=2 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index fd80633a3..54c437cc6 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -4,6 +4,7 @@ channels: dependencies: - c-compiler - dask>=2024.1.1 +- dask-expr>=0.4.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -14,7 +15,7 @@ 
dependencies: - mlflow>=2.9 - mock - numpy>=1.22.4 -- pandas>=1.4.0 +- pandas>=2 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index 98b5fd583..56ff78847 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -4,6 +4,7 @@ channels: dependencies: - c-compiler - dask>=2024.1.1 +- dask-expr>=0.4.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -15,7 +16,7 @@ dependencies: # - mlflow>=2.9 - mock - numpy>=1.22.4 -- pandas>=1.4.0 +- pandas>=2 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index 8a233ed07..79da1ee8a 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -4,6 +4,7 @@ channels: dependencies: - c-compiler - dask=2024.1.1 +- dask-expr=0.4.0 - fastapi=0.92.0 - fugue=0.7.3 - httpx=0.24.1 @@ -14,7 +15,7 @@ dependencies: - mlflow=2.9 - mock - numpy=1.22.4 -- pandas=1.4.0 +- pandas=2 - pre-commit - prompt_toolkit=3.0.8 - psycopg2 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index acf11276b..f1c933ff1 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - zlib - dask>=2024.1.1 +- dask-expr>=0.4.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -20,7 +21,7 @@ dependencies: - mlflow>=2.9 - mock - numpy>=1.22.4 -- pandas>=1.4.0 +- pandas>=2 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 7582f8721..a8115e58f 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - zlib - dask>=2024.1.1 +- dask-expr>=0.4.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -20,7 +21,7 @@ dependencies: - mlflow>=2.9 - mock - numpy>=1.22.4 -- pandas>=1.4.0 +- pandas>=2 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index c637ef385..429a92dc8 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -1,12 +1,7 @@ -import pandas as pd import prompt_toolkit from packaging.version import parse as parseVersion -_pandas_version = parseVersion(pd.__version__) _prompt_toolkit_version = parseVersion(prompt_toolkit.__version__) -INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0") -PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0") - # TODO: remove if prompt-toolkit min version gets bumped PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29") diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index adebed8c1..591f15181 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -8,7 +8,6 @@ import pandas as pd from pandas.api.indexers import BaseIndexer -from dask_sql._compat import INDEXER_WINDOW_STEP_IMPLEMENTED from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex.convert import RexConverter @@ -132,28 +131,15 @@ def _get_window_bounds( ) return start, end - if INDEXER_WINDOW_STEP_IMPLEMENTED: - - 
def get_window_bounds( - self, - num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - step: Optional[int] = None, - ) -> tuple[np.ndarray, np.ndarray]: - return self._get_window_bounds(num_values, min_periods, center, closed) - - else: - - def get_window_bounds( - self, - num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> tuple[np.ndarray, np.ndarray]: - return self._get_window_bounds(num_values, min_periods, center, closed) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + step: Optional[int] = None, + ) -> tuple[np.ndarray, np.ndarray]: + return self._get_window_bounds(num_values, min_periods, center, closed) def map_on_each_group( diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index d79583d51..acf45939e 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -4,7 +4,6 @@ from dask.dataframe.optimize import optimize_dataframe_getitem from dask.utils_test import hlg_layer -from dask_sql._compat import PANDAS_GT_200 from dask_sql.utils import ParsingException from tests.utils import assert_eq, skipif_dask_expr_enabled @@ -36,7 +35,7 @@ def test_select_different_types(c): { "date": pd.to_datetime( ["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT], - format="mixed" if PANDAS_GT_200 else None, + format="mixed", ), "string": ["this is a test", "another test", "äölüć", ""], "integer": [1, 2, -4, 5], From 536d156be53e9b599df42515098dd821147f8a44 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:28:40 -0700 Subject: [PATCH 34/38] Bump dask/dask-expr to 2024.2.1/0.5 to get around aggregation bug --- continuous_integration/docker/conda.txt | 2 +- continuous_integration/docker/main.dockerfile | 2 +- continuous_integration/environment-3.10.yaml | 4 ++-- continuous_integration/environment-3.11.yaml | 4 ++-- continuous_integration/environment-3.12.yaml | 4 ++-- continuous_integration/environment-3.9.yaml | 4 ++-- continuous_integration/gpuci/environment-3.10.yaml | 4 ++-- continuous_integration/gpuci/environment-3.9.yaml | 4 ++-- continuous_integration/recipe/meta.yaml | 2 +- docs/environment.yml | 2 +- docs/requirements-docs.txt | 2 +- pyproject.toml | 4 ++-- 12 files changed, 19 insertions(+), 19 deletions(-) diff --git a/continuous_integration/docker/conda.txt b/continuous_integration/docker/conda.txt index 3849238c4..47b2a3391 100644 --- a/continuous_integration/docker/conda.txt +++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ python>=3.9 -dask>=2024.1.1 +dask>=2024.2.1 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 diff --git a/continuous_integration/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile index c2da576a0..a74b4969f 100644 --- a/continuous_integration/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -16,7 +16,7 @@ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask>=2024.1.1" \ + "dask>=2024.2.1" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 968dd5787..3859118c7 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ 
-3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.1.1 -- dask-expr>=0.4.0 +- dask>=2024.2.1 +- dask-expr>=0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 54c437cc6..4443add23 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.1.1 -- dask-expr>=0.4.0 +- dask>=2024.2.1 +- dask-expr>=0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index 56ff78847..5c25c763b 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.1.1 -- dask-expr>=0.4.0 +- dask>=2024.2.1 +- dask-expr>=0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index 79da1ee8a..a652b9a07 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask=2024.1.1 -- dask-expr=0.4.0 +- dask=2024.2.1 +- dask-expr=0.5 - fastapi=0.92.0 - fugue=0.7.3 - httpx=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index f1c933ff1..1b9d6824c 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.1.1 -- dask-expr>=0.4.0 +- dask>=2024.2.1 +- dask-expr>=0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index a8115e58f..c5ac6d2f3 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.1.1 -- dask-expr>=0.4.0 +- dask>=2024.2.1 +- dask-expr>=0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index cf78f71f8..d6b65cccc 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,7 @@ requirements: - xz # [linux64] run: - python - - dask >=2024.1.1 + - dask >=2024.2.1 - pandas >=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 diff --git a/docs/environment.yml b/docs/environment.yml index 13befb971..35647ec53 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -6,7 +6,7 @@ dependencies: - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask>=2024.1.1 + - dask>=2024.2.1 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 0f121bf7a..c8cbc3890 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask>=2024.1.1 +dask>=2024.2.1 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/pyproject.toml b/pyproject.toml index 02f7064d7..a1bf18d4c 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -27,8 +27,8 @@ classifiers = [ readme = "README.md" requires-python = ">=3.9" dependencies = [ - "dask[dataframe]>=2024.1.1", - "distributed>=2024.1.1", + "dask[dataframe]>=2024.2.1", + "distributed>=2024.2.1", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", From 53c6af598296fe0b17320aa75f19952609baf5cf Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 08:55:10 -0700 Subject: [PATCH 35/38] Bump dask / dask-expr to 2024.3.1 / 1.0.5 to resolve drop bug --- continuous_integration/docker/conda.txt | 2 +- continuous_integration/docker/main.dockerfile | 2 +- continuous_integration/environment-3.10.yaml | 4 ++-- continuous_integration/environment-3.11.yaml | 4 ++-- continuous_integration/environment-3.12.yaml | 4 ++-- continuous_integration/environment-3.9.yaml | 4 ++-- continuous_integration/gpuci/environment-3.10.yaml | 4 ++-- continuous_integration/gpuci/environment-3.9.yaml | 4 ++-- continuous_integration/recipe/meta.yaml | 2 +- docs/environment.yml | 2 +- docs/requirements-docs.txt | 2 +- pyproject.toml | 4 ++-- 12 files changed, 19 insertions(+), 19 deletions(-) diff --git a/continuous_integration/docker/conda.txt b/continuous_integration/docker/conda.txt index 47b2a3391..a2370946b 100644 --- a/continuous_integration/docker/conda.txt +++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ python>=3.9 -dask>=2024.2.1 +dask>=2024.3.1 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 diff --git a/continuous_integration/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile index a74b4969f..e746d874f 100644 --- a/continuous_integration/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -16,7 +16,7 @@ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask>=2024.2.1" \ + "dask>=2024.3.1" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 3859118c7..c99886889 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.2.1 -- dask-expr>=0.5 +- dask>=2024.3.1 +- dask-expr>=1.0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 4443add23..5e7462d1c 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.2.1 -- dask-expr>=0.5 +- dask>=2024.3.1 +- dask-expr>=1.0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index 5c25c763b..b17d80cc6 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.2.1 -- dask-expr>=0.5 +- dask>=2024.3.1 +- dask-expr>=1.0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index a652b9a07..6a8dc14c6 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- 
dask=2024.2.1 -- dask-expr=0.5 +- dask=2024.3.1 +- dask-expr=1.0.5 - fastapi=0.92.0 - fugue=0.7.3 - httpx=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 1b9d6824c..880d044be 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.2.1 -- dask-expr>=0.5 +- dask>=2024.3.1 +- dask-expr>=1.0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index c5ac6d2f3..454c43d12 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.2.1 -- dask-expr>=0.5 +- dask>=2024.3.1 +- dask-expr>=1.0.5 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index d6b65cccc..191b2507b 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,7 @@ requirements: - xz # [linux64] run: - python - - dask >=2024.2.1 + - dask >=2024.3.1 - pandas >=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 diff --git a/docs/environment.yml b/docs/environment.yml index 35647ec53..859488bc7 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -6,7 +6,7 @@ dependencies: - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask>=2024.2.1 + - dask>=2024.3.1 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index c8cbc3890..060a57de4 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask>=2024.2.1 +dask>=2024.3.1 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/pyproject.toml b/pyproject.toml index a1bf18d4c..0e9f2459e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,8 @@ classifiers = [ readme = "README.md" requires-python = ">=3.9" dependencies = [ - "dask[dataframe]>=2024.2.1", - "distributed>=2024.2.1", + "dask[dataframe]>=2024.3.1", + "distributed>=2024.3.1", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", From 8d1d1070e99b99f1c787cc7e0ebe30b2f0a6701c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 10:25:17 -0700 Subject: [PATCH 36/38] Bump dask / dask-expr to 2024.4.1 / 1.0.11 to resolve head bug --- continuous_integration/docker/conda.txt | 2 +- continuous_integration/docker/main.dockerfile | 2 +- continuous_integration/environment-3.10.yaml | 4 ++-- continuous_integration/environment-3.11.yaml | 4 ++-- continuous_integration/environment-3.12.yaml | 4 ++-- continuous_integration/environment-3.9.yaml | 4 ++-- continuous_integration/gpuci/environment-3.10.yaml | 4 ++-- continuous_integration/gpuci/environment-3.9.yaml | 4 ++-- continuous_integration/recipe/meta.yaml | 2 +- docs/environment.yml | 2 +- docs/requirements-docs.txt | 2 +- pyproject.toml | 4 ++-- 12 files changed, 19 insertions(+), 19 deletions(-) diff --git a/continuous_integration/docker/conda.txt b/continuous_integration/docker/conda.txt index a2370946b..58cfa1b75 100644 --- a/continuous_integration/docker/conda.txt 
+++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ python>=3.9 -dask>=2024.3.1 +dask>=2024.4.1 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 diff --git a/continuous_integration/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile index e746d874f..d32604e61 100644 --- a/continuous_integration/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -16,7 +16,7 @@ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask>=2024.3.1" \ + "dask>=2024.4.1" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index c99886889..6ee044f42 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.3.1 -- dask-expr>=1.0.5 +- dask>=2024.4.1 +- dask-expr>=1.0.11 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 5e7462d1c..9edaddbbd 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.3.1 -- dask-expr>=1.0.5 +- dask>=2024.4.1 +- dask-expr>=1.0.11 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index b17d80cc6..657e18507 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2024.3.1 -- dask-expr>=1.0.5 +- dask>=2024.4.1 +- dask-expr>=1.0.11 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index 6a8dc14c6..6b2a506b2 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -3,8 +3,8 @@ channels: - conda-forge dependencies: - c-compiler -- dask=2024.3.1 -- dask-expr=1.0.5 +- dask=2024.4.1 +- dask-expr=1.0.11 - fastapi=0.92.0 - fugue=0.7.3 - httpx=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 880d044be..2371144e7 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.3.1 -- dask-expr>=1.0.5 +- dask>=2024.4.1 +- dask-expr>=1.0.11 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 454c43d12..cb54b2ac7 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -9,8 +9,8 @@ channels: dependencies: - c-compiler - zlib -- dask>=2024.3.1 -- dask-expr>=1.0.5 +- dask>=2024.4.1 +- dask-expr>=1.0.11 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 191b2507b..2e9ab41a4 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,7 @@ requirements: - xz # [linux64] run: - python - - dask >=2024.3.1 + - dask >=2024.4.1 - pandas 
>=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 diff --git a/docs/environment.yml b/docs/environment.yml index 859488bc7..601337b4c 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -6,7 +6,7 @@ dependencies: - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask>=2024.3.1 + - dask>=2024.4.1 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 060a57de4..f338c3fa5 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask>=2024.3.1 +dask>=2024.4.1 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/pyproject.toml b/pyproject.toml index 0e9f2459e..be62f704d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,8 @@ classifiers = [ readme = "README.md" requires-python = ">=3.9" dependencies = [ - "dask[dataframe]>=2024.3.1", - "distributed>=2024.3.1", + "dask[dataframe]>=2024.4.1", + "distributed>=2024.4.1", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", From 4a64319749ccb84430323448c99534cde381081d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 10:29:13 -0700 Subject: [PATCH 37/38] Remove dask-expr workaround from timeseries fixture --- tests/integration/fixtures.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index c1a5017d6..669260215 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -120,14 +120,7 @@ def datetime_table(): @pytest.fixture() def timeseries(): - df = dd_timeseries(freq="1d").reset_index(drop=True) - - # TODO: remove this roundtrip once fix for dask-expr#1013 is released - # see: https://github.com/dask/dask-expr/issues/1013 - if dd._dask_expr_enabled(): - df = dd.from_pandas(df.compute()) - - return df + return dd_timeseries(freq="1d").reset_index(drop=True) @pytest.fixture() From 99a5745d7c7cded09eb9581effd5fe1d5aa64221 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 15 Apr 2024 11:13:01 -0700 Subject: [PATCH 38/38] Unpin sqlalchemy in python 3.9 CI environment --- continuous_integration/environment-3.9.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index 6b2a506b2..88eee32b5 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -30,8 +30,7 @@ dependencies: - py-xgboost=2.0.3 - scikit-learn=1.0.0 - sphinx -# TODO: remove this constraint when we require pandas>2 -- sqlalchemy<2 +- sqlalchemy - tpot>=0.12.0 # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2