diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 2160025ea0..f60b6828a5 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -92,6 +92,19 @@ jobs: name: Run smoke tests with minimum deps Windows shell: cmd + - name: Install pyarrow + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk + + - run: | + poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow + if: runner.os != 'Windows' + name: Run pipeline tests with pyarrow but no pandas installed + - run: | + poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow + if: runner.os == 'Windows' + name: Run pipeline tests with pyarrow but no pandas installed Windows + shell: cmd + - name: Install pipeline dependencies run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index 93e6b764bc..7a94dcf6e2 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -1,7 +1,14 @@ +from typing import Any from dlt.common.exceptions import MissingDependencyException try: import pandas - from pandas.io.sql import _wrap_result except ModuleNotFoundError: raise MissingDependencyException("DLT Pandas Helpers", ["pandas"]) + + +def pandas_to_arrow(df: pandas.DataFrame) -> Any: + """Converts pandas to arrow or raises an exception if pyarrow is not installed""" + from dlt.common.libs.pyarrow import pyarrow as pa + + return pa.Table.from_pandas(df) diff --git a/dlt/common/libs/pandas_sql.py b/dlt/common/libs/pandas_sql.py new file mode 100644 index 0000000000..e9e2a7da11 --- /dev/null +++ b/dlt/common/libs/pandas_sql.py @@ -0,0 +1,7 @@ +from dlt.common.exceptions import MissingDependencyException + + +try: + from pandas.io.sql import _wrap_result +except ModuleNotFoundError: + raise MissingDependencyException("dlt pandas helper for sql", ["pandas"]) diff --git a/dlt/common/libs/pyarrow.py 
b/dlt/common/libs/pyarrow.py index 183c27954b..c1fbfbff85 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -18,7 +18,9 @@ import pyarrow.compute except ModuleNotFoundError: raise MissingDependencyException( - "dlt parquet Helpers", [f"{version.DLT_PKG_NAME}[parquet]"], "dlt Helpers for for parquet." + "dlt pyarrow helpers", + [f"{version.DLT_PKG_NAME}[parquet]"], + "Install pyarrow to be able to load arrow tables, pandas frames and to use parquet files.", ) diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 695f1a0972..9d872a238e 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -221,7 +221,7 @@ def _get_columns(self) -> List[str]: return [c[0] for c in self.native_cursor.description] def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: - from dlt.common.libs.pandas import _wrap_result + from dlt.common.libs.pandas_sql import _wrap_result columns = self._get_columns() if chunk_size is None: diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 84abb4f3a8..ed41c50e12 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -29,9 +29,10 @@ from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pyarrow = None + pa = None try: - from dlt.common.libs.pandas import pandas + from dlt.common.libs.pandas import pandas, pandas_to_arrow except MissingDependencyException: pandas = None @@ -224,7 +225,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No for tbl in ( ( # 1.
Convert pandas frame(s) to arrow Table - pa.Table.from_pandas(item) + pandas_to_arrow(item) if (pandas and isinstance(item, pandas.DataFrame)) else item ) diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 2ad827b755..29b20de7b8 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -18,7 +18,6 @@ try: from dlt.common.libs import pyarrow - from dlt.common.libs.pandas import pandas from dlt.common.libs.numpy import numpy from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem from dlt.common.libs.pyarrow import from_arrow_scalar, to_arrow_scalar @@ -26,6 +25,11 @@ pa = None pyarrow = None numpy = None + +# NOTE: always import pandas independently from pyarrow +try: + from dlt.common.libs.pandas import pandas, pandas_to_arrow +except MissingDependencyException: pandas = None @@ -220,7 +224,7 @@ def __call__( ) -> Tuple[TDataItem, bool, bool]: is_pandas = pandas is not None and isinstance(tbl, pandas.DataFrame) if is_pandas: - tbl = pa.Table.from_pandas(tbl) + tbl = pandas_to_arrow(tbl) primary_key = self.primary_key(tbl) if callable(self.primary_key) else self.primary_key if primary_key: diff --git a/dlt/extract/wrappers.py b/dlt/extract/wrappers.py index 7ffb6b4fc6..e761fcdeab 100644 --- a/dlt/extract/wrappers.py +++ b/dlt/extract/wrappers.py @@ -6,11 +6,17 @@ try: from dlt.common.libs.pandas import pandas + + PandaFrame = pandas.DataFrame +except MissingDependencyException: + PandaFrame = NoneType + +try: from dlt.common.libs.pyarrow import pyarrow - PandaFrame, ArrowTable, ArrowRecords = pandas.DataFrame, pyarrow.Table, pyarrow.RecordBatch + ArrowTable, ArrowRecords = pyarrow.Table, pyarrow.RecordBatch except MissingDependencyException: - PandaFrame, ArrowTable, ArrowRecords = NoneType, NoneType, NoneType + ArrowTable, ArrowRecords = NoneType, NoneType def wrap_additional_type(data: Any) -> Any: diff --git a/tests/pipeline/test_pipeline_extra.py 
b/tests/pipeline/test_pipeline_extra.py index 81c883c273..7f1b52154d 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -1,13 +1,25 @@ import os +import importlib.util from typing import Any, ClassVar, Dict, Iterator, List, Optional import pytest -from pydantic import BaseModel + +try: + from pydantic import BaseModel + from dlt.common.libs.pydantic import DltConfig +except ImportError: + # mock pydantic with dataclasses. allow to run tests + # not requiring pydantic + from dataclasses import dataclass + + @dataclass + class BaseModel: # type: ignore[no-redef] + pass + import dlt from dlt.common import json, pendulum from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.capabilities import TLoaderFileFormat -from dlt.common.libs.pydantic import DltConfig from dlt.common.runtime.collector import ( AliveCollector, EnlightenCollector, @@ -386,3 +398,31 @@ class Parent(BaseModel): } assert loaded_values == {"data_dictionary__child_attribute": "any string"} + + +@pytest.mark.skipif( + importlib.util.find_spec("pandas") is not None, + reason="Test skipped because pandas IS installed", +) +def test_arrow_no_pandas() -> None: + import pyarrow as pa + + data = { + "Numbers": [1, 2, 3, 4, 5], + "Strings": ["apple", "banana", "cherry", "date", "elderberry"], + } + + df = pa.table(data) + + @dlt.resource + def pandas_incremental(numbers=dlt.sources.incremental("Numbers")): + yield df + + info = dlt.run( + pandas_incremental(), write_disposition="append", table_name="data", destination="duckdb" + ) + + with info.pipeline.sql_client() as client: # type: ignore + with client.execute_query("SELECT * FROM data") as c: + with pytest.raises(ImportError): + df = c.df()