From 71843f38e39281f0354f15657f1f39580af6683e Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 13 Dec 2024 10:48:00 -0800 Subject: [PATCH] SNOW-1852900: Support Cortex function Sentiment in apply (#2742) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1852900 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [x] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development) 3. Please describe how your code solves the related issue. Support Cortex function Sentiment in apply. --------- Signed-off-by: Labanya Mukhopadhyay --- CHANGELOG.md | 7 +++ docs/source/snowpark/functions.rst | 1 + src/snowflake/snowpark/functions.py | 22 ++++++++ .../modin/plugin/_internal/apply_utils.py | 2 + .../test_apply_snowpark_python_functions.py | 53 +++++++++++++------ tests/integ/test_function.py | 26 ++++++++- 6 files changed, 93 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98feea9cdaa..7f519a4169a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ## 1.27.0 (TBD) +### Snowpark Python API Updates + +#### New Features + +- Added support for function `snowflake_cortex_sentiment` in `functions.py`. + ### Snowpark pandas API Updates #### New Features @@ -9,6 +15,7 @@ - Added support for `Series.str.ljust` and `Series.str.rjust`. - Added support for `Series.str.center`. 
- Added support for `Series.str.pad`. +- Added support for applying Snowpark Python function `snowflake_cortex_sentiment`. ## 1.26.0 (2024-12-05) diff --git a/docs/source/snowpark/functions.rst b/docs/source/snowpark/functions.rst index 71e83093565..b049c800838 100644 --- a/docs/source/snowpark/functions.rst +++ b/docs/source/snowpark/functions.rst @@ -260,6 +260,7 @@ Functions sinh size skew + snowflake_cortex_sentiment snowflake_cortex_summarize sort_array soundex diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 29f2b40a7ad..76001e342b1 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -10159,3 +10159,25 @@ def snowflake_cortex_summarize(text: ColumnOrLiteralStr): sql_func_name = "snowflake.cortex.summarize" text_col = _to_col_if_lit(text, sql_func_name) return builtin(sql_func_name)(text_col) + + +def snowflake_cortex_sentiment(text: ColumnOrLiteralStr): + """ + Calculates a sentiment score for the given text. + + Args: + text: A string containing the English text for which a sentiment score should be calculated. + Returns: + A floating-point number from -1 to 1 (inclusive) indicating the level of negative or positive sentiment in the + text. Values around 0 indicate neutral sentiment. + + Example:: + + >>> content = "A very very bad review!" 
+ >>> df = session.create_dataframe([[content]], schema=["content"]) + >>> result = df.select(snowflake_cortex_sentiment(content)).collect()[0][0] + >>> assert -1 <= result <= 0 + """ + sql_func_name = "snowflake.cortex.sentiment" + text_col = _to_col_if_lit(text, sql_func_name) + return builtin(sql_func_name)(text_col) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py index 187f9d26c59..c78f1c1a734 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py @@ -35,6 +35,7 @@ _log2, _log10, sin, + snowflake_cortex_sentiment, snowflake_cortex_summarize, udf, to_variant, @@ -110,6 +111,7 @@ floor, trunc, sqrt, + snowflake_cortex_sentiment, snowflake_cortex_summarize, } diff --git a/tests/integ/modin/test_apply_snowpark_python_functions.py b/tests/integ/modin/test_apply_snowpark_python_functions.py index 5e5911a92c5..8a2c214cea7 100644 --- a/tests/integ/modin/test_apply_snowpark_python_functions.py +++ b/tests/integ/modin/test_apply_snowpark_python_functions.py @@ -10,7 +10,7 @@ import pytest from tests.integ.modin.utils import assert_frame_equal, assert_series_equal -from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.utils.sql_counter import sql_count_checker, SqlCounter @sql_count_checker(query_count=4) @@ -71,21 +71,40 @@ def test_apply_snowpark_python_function_not_implemented(): pd.DataFrame({"a": [1, 2, 3]}).apply(asc, args=(1, 2)) -@sql_count_checker(query_count=1) -@pytest.mark.skip("SNOW-1758914 snowflake.cortex.summarize error on GCP") -def test_apply_snowflake_cortex_summarize(): +def test_apply_snowflake_cortex_summarize(session): from snowflake.snowpark.functions import snowflake_cortex_summarize - content = """pandas on Snowflake lets you run your pandas code in a distributed manner directly on your data in - Snowflake. 
Just by changing the import statement and a few lines of code, you can get the familiar pandas experience - you know and love with the scalability and security benefits of Snowflake. With pandas on Snowflake, you can work - with much larger datasets and avoid the time and expense of porting your pandas pipelines to other big data - frameworks or provisioning large and expensive machines. It runs workloads natively in Snowflake through - transpilation to SQL, enabling it to take advantage of parallelization and the data governance and security - benefits of Snowflake. pandas on Snowflake is delivered through the Snowpark pandas API as part of the Snowpark - Python library, which enables scalable data processing of Python code within the Snowflake platform. -""" - s = pd.Series([content]) - summary = s.apply(snowflake_cortex_summarize).iloc[0] - # this length check is to get around the fact that this function may not be deterministic - assert 0 < len(summary) < len(content) + # TODO: SNOW-1758914 snowflake.cortex.summarize error on GCP + with SqlCounter(query_count=0): + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + + with SqlCounter(query_count=1): + content = """pandas on Snowflake lets you run your pandas code in a distributed manner directly on your data in + Snowflake. Just by changing the import statement and a few lines of code, you can get the familiar pandas experience + you know and love with the scalability and security benefits of Snowflake. With pandas on Snowflake, you can work + with much larger datasets and avoid the time and expense of porting your pandas pipelines to other big data + frameworks or provisioning large and expensive machines. It runs workloads natively in Snowflake through + transpilation to SQL, enabling it to take advantage of parallelization and the data governance and security + benefits of Snowflake. 
pandas on Snowflake is delivered through the Snowpark pandas API as part of the Snowpark + Python library, which enables scalable data processing of Python code within the Snowflake platform. + """ + s = pd.Series([content]) + summary = s.apply(snowflake_cortex_summarize).iloc[0] + # this length check is to get around the fact that this function may not be deterministic + assert 0 < len(summary) < len(content) + + +def test_apply_snowflake_cortex_sentiment(session): + from snowflake.snowpark.functions import snowflake_cortex_sentiment + + # TODO: SNOW-1758914 snowflake.cortex.sentiment error on GCP + with SqlCounter(query_count=0): + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + + with SqlCounter(query_count=1): + content = "A very very bad review!" + s = pd.Series([content]) + sentiment = s.apply(snowflake_cortex_sentiment).iloc[0] + assert -1 <= sentiment <= 0 diff --git a/tests/integ/test_function.py b/tests/integ/test_function.py index 2b22fe692df..a67d3207199 100644 --- a/tests/integ/test_function.py +++ b/tests/integ/test_function.py @@ -127,6 +127,7 @@ reverse, sequence, size, + snowflake_cortex_sentiment, snowflake_cortex_summarize, split, sqrt, @@ -2272,8 +2273,11 @@ def test_ln(session): "config.getoption('local_testing_mode', default=False)", reason="FEAT: snowflake_cortex functions not supported", ) -@pytest.mark.skip("SNOW-1758914 snowflake.cortex.summarize error on GCP") def test_snowflake_cortex_summarize(session): + # TODO: SNOW-1758914 snowflake.cortex.summarize error on GCP + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + content = """In Snowpark, the main way in which you query and process data is through a DataFrame. This topic explains how to work with DataFrames. To retrieve and manipulate data, you use the DataFrame class. A DataFrame represents a relational dataset that is evaluated lazily: it only executes when a specific action is triggered. 
In a sense, a DataFrame is like a query that needs to be evaluated in order to retrieve data. @@ -2302,3 +2306,23 @@ def test_snowflake_cortex_summarize(session): # this length check is to get around the fact that this function may not be deterministic assert 0 < len(summary_from_col) < len(content) assert 0 < len(summary_from_str) < len(content) + + +@pytest.mark.skipif( + "config.getoption('local_testing_mode', default=False)", + reason="FEAT: snowflake_cortex functions not supported", +) +def test_apply_snowflake_cortex_sentiment(session): + # TODO: SNOW-1758914 snowflake.cortex.sentiment error on GCP + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + content = "A very very bad review!" + df = session.create_dataframe([[content]], schema=["content"]) + + sentiment_from_col = df.select( + snowflake_cortex_sentiment(col("content")) + ).collect()[0][0] + sentiment_from_str = df.select(snowflake_cortex_sentiment(content)).collect()[0][0] + + assert -1 <= sentiment_from_col <= 0 + assert -1 <= sentiment_from_str <= 0