Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into dask-dataframe-over…
Browse files Browse the repository at this point in the history
…rides
  • Loading branch information
charlesbluca committed Apr 15, 2024
2 parents 43faa7a + 0540813 commit 6c8052c
Show file tree
Hide file tree
Showing 39 changed files with 281 additions and 176 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
channel-priority: strict
- name: Install dependencies
run: |
mamba install -c conda-forge boa conda-verify
mamba install -c conda-forge "boa<0.17" "conda-build<24.1" conda-verify
which python
pip list
Expand Down
24 changes: 22 additions & 2 deletions .github/workflows/test-upstream.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,38 @@ defaults:

jobs:
test-dev:
name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})"
name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
runs-on: ${{ matrix.os }}
env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python: ["3.9", "3.10", "3.11", "3.12"]
distributed: [false]
query-planning: [true]
include:
# run tests on a distributed client
- os: "ubuntu-latest"
python: "3.9"
distributed: true
query-planning: true
- os: "ubuntu-latest"
python: "3.11"
distributed: true
query-planning: true
# run tests with query planning disabled
- os: "ubuntu-latest"
python: "3.9"
distributed: false
query-planning: false
- os: "ubuntu-latest"
python: "3.11"
distributed: false
query-planning: false
steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -72,8 +85,12 @@ jobs:
path: test-${{ matrix.os }}-py${{ matrix.python }}-results.jsonl

import-dev:
name: "Test importing with bare requirements and upstream dev"
name: "Test importing with bare requirements and upstream dev (query-planning: ${{ matrix.query-planning }})"
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
query-planning: [true, false]
steps:
- uses: actions/checkout@v4
- name: Set up Python
Expand All @@ -93,8 +110,11 @@ jobs:
- name: Install upstream dev Dask
run: |
python -m pip install git+https://github.com/dask/dask
python -m pip install git+https://github.com/dask/dask-expr
python -m pip install git+https://github.com/dask/distributed
- name: Try to import dask-sql
env:
DASK_DATAFRAME_QUERY_PLANNING: ${{ matrix.query-planning }}
run: |
python -c "import dask_sql; print('ok')"
Expand Down
24 changes: 22 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,39 @@ jobs:
keyword: "[test-upstream]"

test:
name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})"
name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
needs: [detect-ci-trigger]
runs-on: ${{ matrix.os }}
env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python: ["3.9", "3.10", "3.11", "3.12"]
distributed: [false]
query-planning: [true]
include:
# run tests on a distributed client
- os: "ubuntu-latest"
python: "3.9"
distributed: true
query-planning: true
- os: "ubuntu-latest"
python: "3.11"
distributed: true
query-planning: true
# run tests with query planning disabled
- os: "ubuntu-latest"
python: "3.9"
distributed: false
query-planning: false
- os: "ubuntu-latest"
python: "3.11"
distributed: false
query-planning: false
steps:
- uses: actions/checkout@v4
- name: Set up Python
Expand Down Expand Up @@ -96,9 +109,13 @@ jobs:
uses: codecov/codecov-action@v3

import:
name: "Test importing with bare requirements"
name: "Test importing with bare requirements (query-planning: ${{ matrix.query-planning }})"
needs: [detect-ci-trigger]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
query-planning: [true, false]
steps:
- uses: actions/checkout@v4
- name: Set up Python
Expand All @@ -119,7 +136,10 @@ jobs:
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
python -m pip install git+https://github.com/dask/dask
python -m pip install git+https://github.com/dask/dask-expr
python -m pip install git+https://github.com/dask/distributed
- name: Try to import dask-sql
env:
DASK_DATAFRAME_QUERY_PLANNING: ${{ matrix.query-planning }}
run: |
python -c "import dask_sql; print('ok')"
2 changes: 1 addition & 1 deletion continuous_integration/docker/conda.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
python>=3.9
dask==2024.1.1
dask>=2024.4.1
pandas>=1.4.0
jpype1>=1.0.2
openjdk>=8
Expand Down
2 changes: 1 addition & 1 deletion continuous_integration/docker/main.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ RUN mamba install -y \
# build requirements
"maturin>=1.3,<1.4" \
# core dependencies
"dask==2024.1.1" \
"dask>=2024.4.1" \
"pandas>=1.4.0" \
"fastapi>=0.92.0" \
"httpx>=0.24.1" \
Expand Down
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ channels:
- conda-forge
dependencies:
- c-compiler
- dask==2024.1.1
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
Expand All @@ -14,7 +15,7 @@ dependencies:
- mlflow>=2.9
- mock
- numpy>=1.22.4
- pandas>=1.4.0
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
Expand Down
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.11.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ channels:
- conda-forge
dependencies:
- c-compiler
- dask==2024.1.1
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
Expand All @@ -14,7 +15,7 @@ dependencies:
- mlflow>=2.9
- mock
- numpy>=1.22.4
- pandas>=1.4.0
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
Expand Down
5 changes: 3 additions & 2 deletions continuous_integration/environment-3.12.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ channels:
- conda-forge
dependencies:
- c-compiler
- dask==2024.1.1
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
Expand All @@ -15,7 +16,7 @@ dependencies:
# - mlflow>=2.9
- mock
- numpy>=1.22.4
- pandas>=1.4.0
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
Expand Down
8 changes: 4 additions & 4 deletions continuous_integration/environment-3.9.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ channels:
- conda-forge
dependencies:
- c-compiler
- dask=2024.1.1
- dask=2024.4.1
- dask-expr=1.0.11
- fastapi=0.92.0
- fugue=0.7.3
- httpx=0.24.1
Expand All @@ -14,7 +15,7 @@ dependencies:
- mlflow=2.9
- mock
- numpy=1.22.4
- pandas=1.4.0
- pandas=2
- pre-commit
- prompt_toolkit=3.0.8
- psycopg2
Expand All @@ -29,8 +30,7 @@ dependencies:
- py-xgboost=2.0.3
- scikit-learn=1.0.0
- sphinx
# TODO: remove this constraint when we require pandas>2
- sqlalchemy<2
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
Expand Down
5 changes: 4 additions & 1 deletion continuous_integration/gpuci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ cd "$WORKSPACE"
# Determine CUDA release version
export CUDA_REL=${CUDA_VERSION%.*}

# TODO: remove once RAPIDS 24.06 has support for query planning
export DASK_DATAFRAME__QUERY_PLANNING=false

################################################################################
# SETUP - Check environment
################################################################################
Expand Down Expand Up @@ -61,4 +64,4 @@ conda config --show-sources
conda list --show-channel-urls

rapids-logger "Python py.test for dask-sql"
py.test $WORKSPACE -n 4 -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term
py.test $WORKSPACE -n $PARALLEL_LEVEL -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term
5 changes: 3 additions & 2 deletions continuous_integration/gpuci/environment-3.10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ channels:
dependencies:
- c-compiler
- zlib
- dask==2024.1.1
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
Expand All @@ -20,7 +21,7 @@ dependencies:
- mlflow>=2.9
- mock
- numpy>=1.22.4
- pandas>=1.4.0
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
Expand Down
5 changes: 3 additions & 2 deletions continuous_integration/gpuci/environment-3.9.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ channels:
dependencies:
- c-compiler
- zlib
- dask==2024.1.1
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
Expand All @@ -20,7 +21,7 @@ dependencies:
- mlflow>=2.9
- mock
- numpy>=1.22.4
- pandas>=1.4.0
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
Expand Down
2 changes: 1 addition & 1 deletion continuous_integration/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ requirements:
- xz # [linux64]
run:
- python
- dask ==2024.1.1
- dask >=2024.4.1
- pandas >=1.4.0
- fastapi >=0.92.0
- httpx >=0.24.1
Expand Down
5 changes: 0 additions & 5 deletions dask_sql/_compat.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import pandas as pd
import prompt_toolkit
from packaging.version import parse as parseVersion

_pandas_version = parseVersion(pd.__version__)
_prompt_toolkit_version = parseVersion(prompt_toolkit.__version__)

INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")

# TODO: remove if prompt-toolkit min version gets bumped
PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
16 changes: 12 additions & 4 deletions dask_sql/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,15 +262,23 @@ def create_table(
self.schema[schema_name].filepaths[table_name.lower()] = input_table
elif hasattr(input_table, "dask") and dd.utils.is_dataframe_like(input_table):
try:
dask_filepath = hlg_layer(
input_table.dask, "read-parquet"
).creation_info["args"][0]
if dd._dask_expr_enabled():
from dask_expr.io.parquet import ReadParquet

dask_filepath = None
operations = input_table.find_operations(ReadParquet)
for op in operations:
dask_filepath = op._args[0]
else:
dask_filepath = hlg_layer(
input_table.dask, "read-parquet"
).creation_info["args"][0]
dc.filepath = dask_filepath
self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
except KeyError:
logger.debug("Expected 'read-parquet' layer")

if parquet_statistics and not statistics:
if parquet_statistics and not dd._dask_expr_enabled() and not statistics:
statistics = parquet_statistics(dc.df)
if statistics:
row_count = 0
Expand Down
6 changes: 3 additions & 3 deletions dask_sql/physical/rel/custom/wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def transform(self, X):
estimator=self._postfit_estimator,
meta=output_meta,
)
elif isinstance(X, dd._Frame):
elif isinstance(X, dd.DataFrame):
if output_meta is None:
output_meta = _transform(X._meta_nonempty, self._postfit_estimator)
try:
Expand Down Expand Up @@ -305,7 +305,7 @@ def predict(self, X):
)
return result

elif isinstance(X, dd._Frame):
elif isinstance(X, dd.DataFrame):
if output_meta is None:
# dask-dataframe relies on dd.core.no_default
# for infering meta
Expand Down Expand Up @@ -364,7 +364,7 @@ def predict_proba(self, X):
meta=output_meta,
chunks=(X.chunks[0], len(self._postfit_estimator.classes_)),
)
elif isinstance(X, dd._Frame):
elif isinstance(X, dd.DataFrame):
if output_meta is None:
# dask-dataframe relies on dd.core.no_default
# for infering meta
Expand Down
3 changes: 2 additions & 1 deletion dask_sql/physical/rel/logical/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def filter_or_scalar(
# In SQL, a NULL in a boolean is False on filtering
filter_condition = filter_condition.fillna(False)
out = df[filter_condition]
if dask_config.get("sql.predicate_pushdown"):
# dask-expr should implicitly handle predicate pushdown
if dask_config.get("sql.predicate_pushdown") and not dd._dask_expr_enabled():
return attempt_predicate_pushdown(out, add_filters=add_filters)
else:
return out
Expand Down
Loading

0 comments on commit 6c8052c

Please sign in to comment.