Unpin dask/distributed for development #1319

Merged: 39 commits, Apr 15, 2024

Commits
61513d3  Unpin dask/distributed for development (charlesbluca, Mar 19, 2024)
0fb485d  First pass at unblocking dask-expr issues - replace _Frame usage (charlesbluca, Mar 22, 2024)
a83be6d  Merge remote-tracking branch 'upstream/main' into unpin-dask-distributed (charlesbluca, Mar 26, 2024)
920cd53  First pass at unblocking pytest errors (charlesbluca, Mar 26, 2024)
7f952d8  Disable predicate pushdown & its tests if dask-expr enabled (charlesbluca, Apr 3, 2024)
fc4a248  Make sure partition_borders is computed in limit map_partitions func (charlesbluca, Apr 3, 2024)
c3d8efe  Skip intake tests for now (charlesbluca, Apr 3, 2024)
4f1747c  Simplify cross join logic to avoid internal graph manipulation (charlesbluca, Apr 5, 2024)
e50e30b  Round trip timeseries fixture to pandas to avoid dask-expr bug (charlesbluca, Apr 5, 2024)
2d5c8c7  Fix skipif_dask_expr_enabled marker (charlesbluca, Apr 5, 2024)
2fe2e3c  Ignore warnings around mismatched dtypes in joins (charlesbluca, Apr 5, 2024)
a795526  Add handling for dask-expr to test_broadcast_join (charlesbluca, Apr 5, 2024)
8938a4a  Skip parquet stats tests for now (charlesbluca, Apr 9, 2024)
6305a17  Skip DPP tests on dask-expr for now (charlesbluca, Apr 9, 2024)
76b55e3  Pass ddf object as meta for test_describe_model (charlesbluca, Apr 9, 2024)
876d282  Add dask-expr handling to test_sort_topk (charlesbluca, Apr 9, 2024)
1e72f33  Avoid using Dask graph internals for random functions (charlesbluca, Apr 9, 2024)
fddf52f  Skip over window count tests for now (charlesbluca, Apr 10, 2024)
72dd6ab  Skip test_over_calls and test_over_with_windows (charlesbluca, Apr 10, 2024)
93bc5dc  Update timeseries fixture comment to acknowledge fix (charlesbluca, Apr 10, 2024)
5cc7cc5  More detailed messages for window test skips (charlesbluca, Apr 10, 2024)
9ba3f0a  Skip test_join_alias_w_projection for now (charlesbluca, Apr 10, 2024)
ffd695c  Un-xfail test_xgboost_training_prediction on win32 (charlesbluca, Apr 10, 2024)
73d32a2  Windows failures are still intermittent (charlesbluca, Apr 10, 2024)
35aa225  Bump rust to 1.73 to circumvent conda sha256 errors (charlesbluca, Apr 10, 2024)
8e6dc05  Disable query planning in GPU CI for now (charlesbluca, Apr 10, 2024)
9563b89  Revert "Bump rust to 1.73 to circumvent conda sha256 errors" (charlesbluca, Apr 10, 2024)
b7ebab9  Use older conda-build version to try and resolve build issues (charlesbluca, Apr 10, 2024)
9f32482  Pin to an older version of conda-build and boa (charlesbluca, Apr 10, 2024)
207803b  Skip deadlocking xgboost test on GPU (charlesbluca, Apr 11, 2024)
6cbe5a9  Add subset of testing with query planning disabled (charlesbluca, Apr 12, 2024)
5a279b8  Add query-planning to job names (charlesbluca, Apr 12, 2024)
7ce1cbf  Fix syntax errors (charlesbluca, Apr 12, 2024)
6d762a5  Add dask-expr to CI environments, bump to pandas 2 (charlesbluca, Apr 15, 2024)
536d156  Bump dask/dask-expr to 2024.2.1/0.5 to get around aggregation bug (charlesbluca, Apr 15, 2024)
53c6af5  Bump dask / dask-expr to 2024.3.1 / 1.0.5 to resolve drop bug (charlesbluca, Apr 15, 2024)
8d1d107  Bump dask / dask-expr to 2024.4.1 / 1.0.11 to resolve head bug (charlesbluca, Apr 15, 2024)
4a64319  Remove dask-expr workaround from timeseries fixture (charlesbluca, Apr 15, 2024)
99a5745  Unpin sqlalchemy in python 3.9 CI environment (charlesbluca, Apr 15, 2024)
2 changes: 1 addition & 1 deletion .github/workflows/conda.yml
@@ -76,7 +76,7 @@ jobs:
          channel-priority: strict
      - name: Install dependencies
        run: |
-         mamba install -c conda-forge boa conda-verify
+         mamba install -c conda-forge "boa<0.17" "conda-build<24.1" conda-verify
Contributor:

Would be interested to know what issue this constraint was addressing.

Collaborator (Author):

We were getting some errors on the aarch64 builds in particular around hash mismatches:

conda.CondaMultiError: Conda detected a mismatch between the expected content and downloaded content

https://github.com/dask-contrib/dask-sql/actions/runs/8637144317/job/23678689751

I've been wanting to put some time down to get a simple reproducer together but haven't yet 😕. If you're interested, though, I could open a PR pinning to the boa/conda-build versions used in the run above to reproduce the failures in CI.


          which python
          pip list
24 changes: 22 additions & 2 deletions .github/workflows/test-upstream.yml
@@ -11,25 +11,38 @@ defaults:

jobs:
  test-dev:
-   name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})"
+   name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
    runs-on: ${{ matrix.os }}
    env:
      CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
      DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
+     DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python: ["3.9", "3.10", "3.11", "3.12"]
        distributed: [false]
+       query-planning: [true]
        include:
          # run tests on a distributed client
          - os: "ubuntu-latest"
            python: "3.9"
            distributed: true
+           query-planning: true
          - os: "ubuntu-latest"
            python: "3.11"
            distributed: true
+           query-planning: true
+         # run tests with query planning disabled
+         - os: "ubuntu-latest"
+           python: "3.9"
+           distributed: false
+           query-planning: false
+         - os: "ubuntu-latest"
+           python: "3.11"
+           distributed: false
+           query-planning: false
    steps:
      - uses: actions/checkout@v4
        with:
@@ -72,8 +85,12 @@ jobs:
          path: test-${{ matrix.os }}-py${{ matrix.python }}-results.jsonl

  import-dev:
-   name: "Test importing with bare requirements and upstream dev"
+   name: "Test importing with bare requirements and upstream dev (query-planning: ${{ matrix.query-planning }})"
    runs-on: ubuntu-latest
+   strategy:
+     fail-fast: false
+     matrix:
+       query-planning: [true, false]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
@@ -93,8 +110,11 @@ jobs:
      - name: Install upstream dev Dask
        run: |
          python -m pip install git+https://github.com/dask/dask
+         python -m pip install git+https://github.com/dask/dask-expr
          python -m pip install git+https://github.com/dask/distributed
      - name: Try to import dask-sql
+       env:
+         DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
        run: |
          python -c "import dask_sql; print('ok')"

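For reference, a minimal sketch (not part of this PR) of how the matrix value reaches Dask: the config layer translates `DASK_<SECTION>__<OPTION>` environment variables into nested config keys, and `dd._dask_expr_enabled()` is the private helper this PR branches on internally.

```python
# Sketch: how the CI env var toggles query planning. Assumes dask>=2024.4.1;
# the variable must be set before dask.dataframe is first imported.
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import dask
import dask.dataframe as dd

# The double underscore denotes nesting: DASK_DATAFRAME__QUERY_PLANNING
# becomes the "dataframe.query-planning" config key.
print(dask.config.get("dataframe.query-planning"))  # -> False

# Private helper used throughout this PR to pick code paths:
print(dd._dask_expr_enabled())  # -> False here
```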
24 changes: 22 additions & 2 deletions .github/workflows/test.yml
@@ -33,26 +33,39 @@ jobs:
      keyword: "[test-upstream]"

  test:
-   name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})"
+   name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
    needs: [detect-ci-trigger]
    runs-on: ${{ matrix.os }}
    env:
      CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
      DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
+     DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python: ["3.9", "3.10", "3.11", "3.12"]
        distributed: [false]
+       query-planning: [true]
        include:
          # run tests on a distributed client
          - os: "ubuntu-latest"
            python: "3.9"
            distributed: true
+           query-planning: true
          - os: "ubuntu-latest"
            python: "3.11"
            distributed: true
+           query-planning: true
+         # run tests with query planning disabled
+         - os: "ubuntu-latest"
+           python: "3.9"
+           distributed: false
+           query-planning: false
+         - os: "ubuntu-latest"
+           python: "3.11"
+           distributed: false
+           query-planning: false
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
@@ -96,9 +109,13 @@ jobs:
        uses: codecov/codecov-action@v3

  import:
-   name: "Test importing with bare requirements"
+   name: "Test importing with bare requirements (query-planning: ${{ matrix.query-planning }})"
    needs: [detect-ci-trigger]
    runs-on: ubuntu-latest
+   strategy:
+     fail-fast: false
+     matrix:
+       query-planning: [true, false]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
@@ -119,7 +136,10 @@ jobs:
        if: needs.detect-ci-trigger.outputs.triggered == 'true'
        run: |
          python -m pip install git+https://github.com/dask/dask
+         python -m pip install git+https://github.com/dask/dask-expr
          python -m pip install git+https://github.com/dask/distributed
      - name: Try to import dask-sql
+       env:
+         DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
        run: |
          python -c "import dask_sql; print('ok')"
2 changes: 1 addition & 1 deletion continuous_integration/docker/conda.txt
@@ -1,5 +1,5 @@
python>=3.9
-dask==2024.1.1
+dask>=2024.4.1
pandas>=1.4.0
jpype1>=1.0.2
openjdk>=8
2 changes: 1 addition & 1 deletion continuous_integration/docker/main.dockerfile
@@ -16,7 +16,7 @@ RUN mamba install -y \
    # build requirements
    "maturin>=1.3,<1.4" \
    # core dependencies
-   "dask==2024.1.1" \
+   "dask>=2024.4.1" \
    "pandas>=1.4.0" \
    "fastapi>=0.92.0" \
    "httpx>=0.24.1" \
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.10.yaml
@@ -3,7 +3,8 @@ channels:
  - conda-forge
dependencies:
  - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.4.1
+  - dask-expr>=1.0.11
  - fastapi>=0.92.0
  - fugue>=0.7.3
  - httpx>=0.24.1
@@ -14,7 +15,7 @@ dependencies:
  - mlflow>=2.9
  - mock
  - numpy>=1.22.4
-  - pandas>=1.4.0
+  - pandas>=2
  - pre-commit
  - prompt_toolkit>=3.0.8
  - psycopg2
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.11.yaml
@@ -3,7 +3,8 @@ channels:
  - conda-forge
dependencies:
  - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.4.1
+  - dask-expr>=1.0.11
  - fastapi>=0.92.0
  - fugue>=0.7.3
  - httpx>=0.24.1
@@ -14,7 +15,7 @@ dependencies:
  - mlflow>=2.9
  - mock
  - numpy>=1.22.4
-  - pandas>=1.4.0
+  - pandas>=2
  - pre-commit
  - prompt_toolkit>=3.0.8
  - psycopg2
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.12.yaml
@@ -3,7 +3,8 @@ channels:
  - conda-forge
dependencies:
  - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.4.1
+  - dask-expr>=1.0.11
  - fastapi>=0.92.0
  - fugue>=0.7.3
  - httpx>=0.24.1
@@ -15,7 +16,7 @@ dependencies:
  # - mlflow>=2.9
  - mock
  - numpy>=1.22.4
-  - pandas>=1.4.0
+  - pandas>=2
  - pre-commit
  - prompt_toolkit>=3.0.8
  - psycopg2
8 changes: 4 additions & 4 deletions continuous_integration/environment-3.9.yaml
@@ -3,7 +3,8 @@ channels:
  - conda-forge
dependencies:
  - c-compiler
-  - dask=2024.1.1
+  - dask=2024.4.1
+  - dask-expr=1.0.11
  - fastapi=0.92.0
  - fugue=0.7.3
  - httpx=0.24.1
@@ -14,7 +15,7 @@ dependencies:
  - mlflow=2.9
  - mock
  - numpy=1.22.4
-  - pandas=1.4.0
+  - pandas=2
  - pre-commit
  - prompt_toolkit=3.0.8
  - psycopg2
@@ -29,8 +30,7 @@ dependencies:
  - py-xgboost=2.0.3
  - scikit-learn=1.0.0
  - sphinx
-  # TODO: remove this constraint when we require pandas>2
-  - sqlalchemy<2
+  - sqlalchemy
  - tpot>=0.12.0
  # FIXME: https://github.com/fugue-project/fugue/issues/526
  - triad<0.9.2
5 changes: 4 additions & 1 deletion continuous_integration/gpuci/build.sh
@@ -23,6 +23,9 @@ cd "$WORKSPACE"
# Determine CUDA release version
export CUDA_REL=${CUDA_VERSION%.*}

+# TODO: remove once RAPIDS 24.06 has support for query planning
+export DASK_DATAFRAME__QUERY_PLANNING=false
+
################################################################################
# SETUP - Check environment
################################################################################
@@ -61,4 +64,4 @@ conda config --show-sources
conda list --show-channel-urls

rapids-logger "Python py.test for dask-sql"
-py.test $WORKSPACE -n 4 -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term
+py.test $WORKSPACE -n $PARALLEL_LEVEL -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term
Collaborator:

Nice, good change.

5 changes: 3 additions & 2 deletions continuous_integration/gpuci/environment-3.10.yaml
@@ -9,7 +9,8 @@ channels:
dependencies:
  - c-compiler
  - zlib
-  - dask==2024.1.1
+  - dask>=2024.4.1
+  - dask-expr>=1.0.11
  - fastapi>=0.92.0
  - fugue>=0.7.3
  - httpx>=0.24.1
@@ -20,7 +21,7 @@ dependencies:
  - mlflow>=2.9
  - mock
  - numpy>=1.22.4
-  - pandas>=1.4.0
+  - pandas>=2
  - pre-commit
  - prompt_toolkit>=3.0.8
  - psycopg2
5 changes: 3 additions & 2 deletions continuous_integration/gpuci/environment-3.9.yaml
@@ -9,7 +9,8 @@ channels:
dependencies:
  - c-compiler
  - zlib
-  - dask==2024.1.1
+  - dask>=2024.4.1
+  - dask-expr>=1.0.11
  - fastapi>=0.92.0
  - fugue>=0.7.3
  - httpx>=0.24.1
@@ -20,7 +21,7 @@ dependencies:
  - mlflow>=2.9
  - mock
  - numpy>=1.22.4
-  - pandas>=1.4.0
+  - pandas>=2
  - pre-commit
  - prompt_toolkit>=3.0.8
  - psycopg2
2 changes: 1 addition & 1 deletion continuous_integration/recipe/meta.yaml
@@ -32,7 +32,7 @@ requirements:
    - xz  # [linux64]
  run:
    - python
-   - dask ==2024.1.1
+   - dask >=2024.4.1
    - pandas >=1.4.0
    - fastapi >=0.92.0
    - httpx >=0.24.1
5 changes: 0 additions & 5 deletions dask_sql/_compat.py
@@ -1,12 +1,7 @@
-import pandas as pd
import prompt_toolkit
from packaging.version import parse as parseVersion

-_pandas_version = parseVersion(pd.__version__)
_prompt_toolkit_version = parseVersion(prompt_toolkit.__version__)

-INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
-PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")
-
# TODO: remove if prompt-toolkit min version gets bumped
PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
16 changes: 12 additions & 4 deletions dask_sql/context.py
@@ -262,15 +262,23 @@ def create_table(
            self.schema[schema_name].filepaths[table_name.lower()] = input_table
        elif hasattr(input_table, "dask") and dd.utils.is_dataframe_like(input_table):
            try:
-               dask_filepath = hlg_layer(
-                   input_table.dask, "read-parquet"
-               ).creation_info["args"][0]
+               if dd._dask_expr_enabled():
+                   from dask_expr.io.parquet import ReadParquet
+
+                   dask_filepath = None
+                   operations = input_table.find_operations(ReadParquet)
+                   for op in operations:
+                       dask_filepath = op._args[0]
+               else:
+                   dask_filepath = hlg_layer(
+                       input_table.dask, "read-parquet"
+                   ).creation_info["args"][0]
                dc.filepath = dask_filepath
                self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
            except KeyError:
                logger.debug("Expected 'read-parquet' layer")

-       if parquet_statistics and not statistics:
+       if parquet_statistics and not dd._dask_expr_enabled() and not statistics:
            statistics = parquet_statistics(dc.df)
            if statistics:
                row_count = 0
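To illustrate the new dask-expr branch, a minimal sketch (assumptions: query planning enabled and a local file `data.parquet`; the `find_operations` and `_args` usage mirrors the diff above):

```python
# Sketch: recovering the source path from a dask-expr collection by
# walking its expression tree for ReadParquet nodes, as create_table
# now does. Assumes ./data.parquet exists and query planning is on.
import dask.dataframe as dd
from dask_expr.io.parquet import ReadParquet

ddf = dd.read_parquet("data.parquet")

for op in ddf.find_operations(ReadParquet):
    print(op._args[0])  # first argument of the read: "data.parquet"
```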
6 changes: 3 additions & 3 deletions dask_sql/physical/rel/custom/wrappers.py
@@ -207,7 +207,7 @@ def transform(self, X):
                estimator=self._postfit_estimator,
                meta=output_meta,
            )
-       elif isinstance(X, dd._Frame):
+       elif isinstance(X, dd.DataFrame):
            if output_meta is None:
                output_meta = _transform(X._meta_nonempty, self._postfit_estimator)
            try:
try:
Expand Down Expand Up @@ -305,7 +305,7 @@ def predict(self, X):
)
return result

elif isinstance(X, dd._Frame):
elif isinstance(X, dd.DataFrame):
if output_meta is None:
# dask-dataframe relies on dd.core.no_default
# for infering meta
@@ -364,7 +364,7 @@ def predict_proba(self, X):
                meta=output_meta,
                chunks=(X.chunks[0], len(self._postfit_estimator.classes_)),
            )
-       elif isinstance(X, dd._Frame):
+       elif isinstance(X, dd.DataFrame):
            if output_meta is None:
                # dask-dataframe relies on dd.core.no_default
                # for infering meta
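For background on these three hunks: legacy dask.dataframe exposed a private `dd._Frame` base class covering both DataFrame and Series, which is no longer available once dask-expr backs the namespace. A small sketch of the replacement check (assuming dask>=2024.4.1; behavior under the legacy backend is inferred, not verified here):

```python
# Sketch: public collection types work under both the legacy backend and
# dask-expr, while the private dd._Frame base class is gone under dask-expr.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)

print(isinstance(ddf, dd.DataFrame))    # True with either backend
print(isinstance(ddf["x"], dd.Series))  # True with either backend
print(hasattr(dd, "_Frame"))            # False when query planning is enabled
```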
3 changes: 2 additions & 1 deletion dask_sql/physical/rel/logical/filter.py
@@ -38,7 +38,8 @@ def filter_or_scalar(
    # In SQL, a NULL in a boolean is False on filtering
    filter_condition = filter_condition.fillna(False)
    out = df[filter_condition]
-   if dask_config.get("sql.predicate_pushdown"):
+   # dask-expr should implicitly handle predicate pushdown
+   if dask_config.get("sql.predicate_pushdown") and not dd._dask_expr_enabled():
        return attempt_predicate_pushdown(out, add_filters=add_filters)
    else:
        return out
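A sketch of the implicit pushdown the new guard relies on (assumptions: query planning enabled and a hypothetical local `data.parquet`; `optimize()` and `pprint()` are dask-expr's optimization and debugging helpers):

```python
# Sketch: under dask-expr, a filter written after read_parquet is pushed
# into the read during expression optimization, which is why dask-sql's
# manual attempt_predicate_pushdown pass is skipped in that mode.
import dask.dataframe as dd

ddf = dd.read_parquet("data.parquet")  # hypothetical dataset
filtered = ddf[ddf["x"] > 0]

# Print the optimized expression tree; the filter should appear folded
# into the ReadParquet node rather than as a separate Filter on top.
filtered.optimize().pprint()
```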