diff --git a/docs/source/modin/index.rst b/docs/source/modin/index.rst index 99ea3881bd8..68e8ef7c8e9 100644 --- a/docs/source/modin/index.rst +++ b/docs/source/modin/index.rst @@ -19,5 +19,6 @@ For your convenience, here is all the :doc:`Supported APIs ` window groupby resampling + interoperability numpy performance \ No newline at end of file diff --git a/docs/source/modin/interoperability.rst b/docs/source/modin/interoperability.rst new file mode 100644 index 00000000000..1c0e846fadf --- /dev/null +++ b/docs/source/modin/interoperability.rst @@ -0,0 +1,58 @@ +Interoperability with third party libraries +============================================= + +Many third party libraries are interoperable with pandas, for example by accepting pandas dataframe objects as function +inputs. Here we have a non-exhaustive list of third party library use cases with pandas and note whether each method +works in Snowpark pandas as well. + +Snowpark pandas supports the `dataframe interchange protocol <https://data-apis.org/dataframe-protocol/latest/>`_, which +some libraries use to interoperate with Snowpark pandas to the same level of support as pandas. + +The following table is structured as follows: The first column contains a method name. +The second column is a flag for whether or not interoperability is guaranteed with Snowpark pandas. For each of these +methods, we validate that passing in a Snowpark pandas dataframe as the dataframe input parameter behaves equivalently +to passing in a pandas dataframe. + +.. note:: + ``Y`` stands for yes, i.e., interoperability is guaranteed with this method, and ``N`` stands for no. + +Plotly.express module methods + +.. note:: + Currently only plotly versions <6.0.0 are supported through the dataframe interchange protocol. + ++-------------------------+---------------------------------------------+--------------------------------------------+ +| Method name | Interoperable with Snowpark pandas? 
(Y/N) | Notes for current implementation | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``scatter`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``line`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``area`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``timeline`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``violin`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``bar`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``histogram`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``pie`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``treemap`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``sunburst`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``icicle`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``scatter_matrix`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``funnel`` | Y | | 
++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``density_heatmap`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``box`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``imshow`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ diff --git a/setup.py b/setup.py index a1be8a8eda7..12e67ab3393 100644 --- a/setup.py +++ b/setup.py @@ -200,6 +200,8 @@ def run(self): "scipy", # Snowpark pandas 3rd party library testing "statsmodels", # Snowpark pandas 3rd party library testing "scikit-learn==1.5.2", # Snowpark pandas scikit-learn tests + # plotly version restricted due to foreseen change in query counts in version 6.0.0+ + "plotly<6.0.0", # Snowpark pandas 3rd party library testing ], "localtest": [ "pandas", diff --git a/tests/integ/modin/interoperability/plotly/test_plotly.py b/tests/integ/modin/interoperability/plotly/test_plotly.py new file mode 100644 index 00000000000..2dc9ae59d55 --- /dev/null +++ b/tests/integ/modin/interoperability/plotly/test_plotly.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import plotly.express as px +import pytest +import pandas as native_pd + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result + +# Integration tests for plotly.express module (https://plotly.com/python-api-reference/plotly.express.html). +# To add tests for additional APIs, +# - Call the method with Snowpark pandas and native pandas df input and get the JSON representation with +# `to_plotly_json()`. 
+# - Assert correctness of the plot produced using `assert_plotly_equal` function defined below.
+
+
+def assert_plotly_equal(expect, got):
+    """Recursively assert that two ``to_plotly_json()`` results are equal.
+
+    Dicts are compared key by key and lists element by element. Numpy arrays
+    with a floating-point dtype are compared with
+    ``np.testing.assert_allclose``; every other array, and every other value,
+    must be exactly equal.
+
+    Args:
+        expect: First value to compare (dict, list, ``np.ndarray`` or scalar).
+        got: Second value to compare; must have exactly the same type as
+            ``expect`` at every level of nesting.
+
+    Raises:
+        AssertionError: If the two values differ in type, structure or content.
+    """
+    # referenced from cudf plotly integration test
+    # https://github.com/rapidsai/cudf/blob/main/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py#L10
+
+    # Exact type identity is intended here (e.g. a list must not match an ndarray).
+    assert type(expect) is type(got)
+    if isinstance(expect, dict):
+        assert expect.keys() == got.keys()
+        for key, expect_value in expect.items():
+            assert_plotly_equal(expect_value, got[key])
+    elif isinstance(expect, list):
+        assert len(expect) == len(got)
+        for expect_item, got_item in zip(expect, got):
+            assert_plotly_equal(expect_item, got_item)
+    elif isinstance(expect, np.ndarray):
+        # Decide on the dtype rather than on expect[0]: indexing raises
+        # IndexError for an empty array and yields a sub-array (never a float)
+        # for multi-dimensional input, which would skip the tolerance check.
+        if np.issubdtype(expect.dtype, np.floating):
+            np.testing.assert_allclose(expect, got)
+        else:
+            # Unlike `(expect == got).all()`, this rejects arrays whose shapes
+            # differ instead of silently broadcasting them.
+            np.testing.assert_array_equal(expect, got)
+    else:
+        assert expect == got
+
+
+@pytest.fixture()
+def test_dfs():
+    """Return a ``(Snowpark pandas, native pandas)`` DataFrame pair built from the same seeded random data."""
+    num_samples = 50
+    rng = np.random.default_rng(seed=42)
+    # The draws happen in dict order, so keep the key order stable to keep the data reproducible.
+    data = {
+        "x": rng.random(num_samples),
+        "y": rng.random(num_samples),
+        "category": rng.integers(0, 5, num_samples),
+        "category2": rng.integers(0, 5, num_samples),
+    }
+    return pd.DataFrame(data), native_pd.DataFrame(data)
+
+
+@sql_count_checker(query_count=1)
+def test_scatter(test_dfs):
+    """Compare ``px.scatter`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.scatter(df, x="x", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_line(test_dfs):
+    """Compare ``px.line`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.line(df, x="category", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_area(test_dfs):
+    """Compare ``px.area`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.area(df, x="category", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_timeline():
+    """Compare ``px.timeline`` output for Snowpark pandas and native pandas input.
+
+    Builds its own frame instead of using ``test_dfs`` because ``px.timeline``
+    is called with start/end date columns.
+    """
+    native_df = native_pd.DataFrame(
+        [
+            dict(Task="Job A", Start="2009-01-01", Finish="2009-02-28"),
+            dict(Task="Job B", Start="2009-03-05", Finish="2009-04-15"),
+            dict(Task="Job C", Start="2009-02-20", Finish="2009-05-30"),
+        ]
+    )
+    snow_df = pd.DataFrame(native_df)
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        lambda df: px.timeline(
+            df, x_start="Start", x_end="Finish", y="Task"
+        ).to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_violin(test_dfs):
+    """Compare ``px.violin`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.violin(df, y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_bar(test_dfs):
+    """Compare ``px.bar`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.bar(df, x="category", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_histogram(test_dfs):
+    """Compare ``px.histogram`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.histogram(df, x="category").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_pie(test_dfs):
+    """Compare ``px.pie`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.pie(df, values="category", names="category2").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_treemap(test_dfs):
+    """Compare ``px.treemap`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.treemap(df, names="category", values="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_sunburst(test_dfs):
+    """Compare ``px.sunburst`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.sunburst(df, names="category", values="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_icicle(test_dfs):
+    """Compare ``px.icicle`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.icicle(df, names="category", values="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_scatter_matrix(test_dfs):
+    """Compare ``px.scatter_matrix`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.scatter_matrix(df, dimensions=["category"]).to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_funnel(test_dfs):
+    """Compare ``px.funnel`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.funnel(df, x="x", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_density_heatmap(test_dfs):
+    """Compare ``px.density_heatmap`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.density_heatmap(df, x="x", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_box(test_dfs):
+    """Compare ``px.box`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.box(df, x="category", y="y").to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )
+
+
+# NOTE(review): imshow is the only test expecting 4 queries instead of 1; presumably the extra
+# queries come from reading df.columns / df.index and materializing the image data - confirm.
+@sql_count_checker(query_count=4)
+def test_imshow(test_dfs):
+    """Compare ``px.imshow`` output for Snowpark pandas and native pandas input."""
+    eval_snowpark_pandas_result(
+        *test_dfs,
+        lambda df: px.imshow(df, x=df.columns, y=df.index).to_plotly_json(),
+        comparator=assert_plotly_equal,
+    )