From 98eeec71af09db51777be938450bee5927e8e63c Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Mon, 24 Jun 2024 14:14:28 -0700 Subject: [PATCH] move tests to new file so they run on ci --- tests/integ/modin/strings/test_strings.py | 117 ------------------ tests/integ/modin/strings/test_translate.py | 125 ++++++++++++++++++++ 2 files changed, 125 insertions(+), 117 deletions(-) create mode 100644 tests/integ/modin/strings/test_translate.py diff --git a/tests/integ/modin/strings/test_strings.py b/tests/integ/modin/strings/test_strings.py index 4545fca796d..cd2edb4f85d 100644 --- a/tests/integ/modin/strings/test_strings.py +++ b/tests/integ/modin/strings/test_strings.py @@ -13,7 +13,6 @@ from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, - create_test_series, eval_snowpark_pandas_result, ) from tests.utils import running_on_public_ci @@ -579,119 +578,3 @@ def test_get_with_dict_label(key, expected_result): result = s.str.get(key) expected = native_pd.Series(expected_result) assert_snowpark_pandas_equal_to_pandas(result, expected, check_dtype=False) - - -@pytest.mark.parametrize( - "data, table", - [ - ( - # Simple 1-element mapping - ["aaaaa", "bbbaaa", "cafdsaf;lh"], - str.maketrans("a", "b"), - ), - ( - # Mapping with mixed str, unicode code points, and Nones - ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"], - str.maketrans( - {ord("a"): "A", ord("f"): None, "y": "z", "k": None, ord("j"): ""} - ), - ), - ( - # Mapping with special characters - [ - "Peña", - "Ordoñez", - "Raúl", - "Ibañez", - "François", - "øen", - "2πr = τ", - "München", - ], - str.maketrans( - { - "ñ": "n", - "ú": "u", - "ç": "c", - "ø": "o", - "τ": "t", - "π": "p", - "ü": "u", - } - ), - ), - ( - # Mapping with compound emojis. Each item in the series renders as a single emoji, - # but is actually 4 characters. Calling `len` on each element correctly returns 4. - # https://unicode.org/emoji/charts/emoji-zwj-sequences.html - # Inputs: - # - "head shaking horizontally" = 1F642 + 200D + 2194 + FE0F - # - "heart on fire" = 2764 + FE0F + 200D + 1F525 - # - "judge" = 1F9D1 + 200D + 2696 + FE0F - # Outputs: - # - "head shaking vertically" = 1F642 + 200D + 2195 + FE0F - # - "mending heart" = 2764 + FE0F + 200D + 1FA79 - # - "health worker" = 1F91D1 + 200D + 2695 + FE0F - ["🙂‍↔️", "❤️‍🔥", "🧑‍⚖️"], - { - 0x2194: 0x2195, - 0x1F525: 0x1FA79, - 0x2696: 0x2695, - }, - ), - ], -) -@sql_count_checker(query_count=1) -def test_translate(data, table): - eval_snowpark_pandas_result( - *create_test_series(data), lambda ser: ser.str.translate(table) - ) - - -@sql_count_checker(query_count=1) -def test_translate_without_maketrans(): - # pandas requires all table keys to be unicode ordinal values, and does not know how to handle - # string keys that were not converted to ordinals via `ord` or `str.maketrans`. Since Snowflake - # SQL uses strings in its mappings, we accept string keys as well as ordinals. - data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"] - table = {ord("a"): "A", ord("f"): None, "y": "z", "k": None} - snow_ser = pd.Series(data) - assert_snowpark_pandas_equal_to_pandas( - snow_ser.str.translate(table), - native_pd.Series(data).str.translate(str.maketrans(table)), - ) - # Mappings for "y" and "k" are ignored if not passed through str.maketrans because they are - # not unicode ordinals - assert ( - not native_pd.Series(data) - .str.translate(table) - .equals(native_pd.Series(data).str.translate(str.maketrans(table))) - ) - - -@pytest.mark.parametrize( - "table", - [ - {"😶‍🌫️": "a"}, # This emoji key is secretly 4 code points - {"aa": "a"}, # Key is 2 chars - # Mapping 1 char to multiple is valid in vanilla pandas, but we don't support this - {ord("a"): "😶‍🌫️"}, # This emoji value is secretly 4 code points - {ord("a"): "aa"}, # Value is 2 chars - ], -) -@sql_count_checker(query_count=0) -def test_translate_invalid_mappings(table): - data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"] - # native pandas silently treats all of these cases as no-ops. However, since Snowflake SQL uses - # strings as mappings instead of a dict construct, passing these arguments to the equivalent - # SQL argument would either cause an inscrutable error or unexpected changes to the output series. - snow_ser, native_ser = create_test_series(data) - native_ser.str.translate(table) - with pytest.raises(ValueError): - snow_ser.str.translate(table) - - -@sql_count_checker(query_count=0) -def test_translate_coverage_canary(): - # For some reason code coverage isn't picking up tests, this is a canary to make sure these are being run - raise AssertionError() diff --git a/tests/integ/modin/strings/test_translate.py b/tests/integ/modin/strings/test_translate.py new file mode 100644 index 00000000000..4c655d320da --- /dev/null +++ b/tests/integ/modin/strings/test_translate.py @@ -0,0 +1,125 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equal_to_pandas, + create_test_series, + eval_snowpark_pandas_result, +) + + +@pytest.mark.parametrize( + "data, table", + [ + ( + # Simple 1-element mapping + ["aaaaa", "bbbaaa", "cafdsaf;lh"], + str.maketrans("a", "b"), + ), + ( + # Mapping with mixed str, unicode code points, and Nones + ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"], + str.maketrans( + {ord("a"): "A", ord("f"): None, "y": "z", "k": None, ord("j"): ""} + ), + ), + ( + # Mapping with special characters + [ + "Peña", + "Ordoñez", + "Raúl", + "Ibañez", + "François", + "øen", + "2πr = τ", + "München", + ], + str.maketrans( + { + "ñ": "n", + "ú": "u", + "ç": "c", + "ø": "o", + "τ": "t", + "π": "p", + "ü": "u", + } + ), + ), + ( + # Mapping with compound emojis. Each item in the series renders as a single emoji, + # but is actually 4 characters. Calling `len` on each element correctly returns 4. + # https://unicode.org/emoji/charts/emoji-zwj-sequences.html + # Inputs: + # - "head shaking horizontally" = 1F642 + 200D + 2194 + FE0F + # - "heart on fire" = 2764 + FE0F + 200D + 1F525 + # - "judge" = 1F9D1 + 200D + 2696 + FE0F + # Outputs: + # - "head shaking vertically" = 1F642 + 200D + 2195 + FE0F + # - "mending heart" = 2764 + FE0F + 200D + 1FA79 + # - "health worker" = 1F91D1 + 200D + 2695 + FE0F + ["🙂‍↔️", "❤️‍🔥", "🧑‍⚖️"], + { + 0x2194: 0x2195, + 0x1F525: 0x1FA79, + 0x2696: 0x2695, + }, + ), + ], +) +@sql_count_checker(query_count=1) +def test_translate(data, table): + eval_snowpark_pandas_result( + *create_test_series(data), lambda ser: ser.str.translate(table) + ) + + +@sql_count_checker(query_count=1) +def test_translate_without_maketrans(): + # pandas requires all table keys to be unicode ordinal values, and does not know how to handle + # string keys that were not converted to ordinals via `ord` or `str.maketrans`. Since Snowflake + # SQL uses strings in its mappings, we accept string keys as well as ordinals. + data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"] + table = {ord("a"): "A", ord("f"): None, "y": "z", "k": None} + snow_ser = pd.Series(data) + assert_snowpark_pandas_equal_to_pandas( + snow_ser.str.translate(table), + native_pd.Series(data).str.translate(str.maketrans(table)), + ) + # Mappings for "y" and "k" are ignored if not passed through str.maketrans because they are + # not unicode ordinals + assert ( + not native_pd.Series(data) + .str.translate(table) + .equals(native_pd.Series(data).str.translate(str.maketrans(table))) + ) + + +@pytest.mark.parametrize( + "table", + [ + {"😶‍🌫️": "a"}, # This emoji key is secretly 4 code points + {"aa": "a"}, # Key is 2 chars + # Mapping 1 char to multiple is valid in vanilla pandas, but we don't support this + {ord("a"): "😶‍🌫️"}, # This emoji value is secretly 4 code points + {ord("a"): "aa"}, # Value is 2 chars + ], +) +@sql_count_checker(query_count=0) +def test_translate_invalid_mappings(table): + data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"] + # native pandas silently treats all of these cases as no-ops. However, since Snowflake SQL uses + # strings as mappings instead of a dict construct, passing these arguments to the equivalent + # SQL argument would either cause an inscrutable error or unexpected changes to the output series. + snow_ser, native_ser = create_test_series(data) + native_ser.str.translate(table) + with pytest.raises(ValueError): + snow_ser.str.translate(table)