Skip to content

Commit

Permalink
SNOW-1445360: Add Support for Series.str.get (#1714)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-helmeleegy authored Jun 1, 2024
1 parent ef71050 commit c26be8a
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#### New Features

- Added partial support for `DataFrame.pct_change` and `Series.pct_change` without the `freq` and `limit` parameters.
- Added support for `Series.str.get`.

#### Bug Fixes

Expand Down
4 changes: 3 additions & 1 deletion docs/source/modin/supported/series_str_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ the method in the left column.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``fullmatch`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``get`` | N | |
| ``get`` | P | ``N`` if the `i` parameter is set to a non-int |
| | | value. Also non-string data values such as list |
| | | and dict are not yet supported. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``get_dummies`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12284,8 +12284,57 @@ def output_col(col_name: ColumnOrName) -> SnowparkColumn:
)
return SnowflakeQueryCompiler(new_internal_frame)

def str_get(self, i: int) -> None:
ErrorMessage.method_not_implemented_error("get", "Series.str")
def str_get(self, i: int) -> "SnowflakeQueryCompiler":
    """
    Build a query compiler that extracts the character at position ``i`` from each value.

    Positions follow pandas conventions (0-based; negative values count back
    from the end of the string). A position outside the string produces a
    null for that row.

    Parameters
    ----------
    i : int
        Position of the character to extract. ``None`` is also accepted, in
        which case the extracted value is a null literal. Any other
        non-``int`` value is rejected via ``ErrorMessage.not_implemented``.

    Returns
    -------
    SnowflakeQueryCompiler
        Compiler wrapping the frame whose data columns hold the extracted values.
    """
    if not (i is None or isinstance(i, int)):
        ErrorMessage.not_implemented(
            "Snowpark pandas method 'Series.str.get' doesn't yet support non-numeric 'i' argument"
        )

    def output_col(col_name: ColumnOrName) -> SnowparkColumn:
        source = col(col_name)
        if i is None:
            return self._replace_non_str(source, pandas_lit(None))

        str_len = length(source)
        # pandas positions are 0-based while Snowflake's substring is 1-based,
        # hence the ``i + 1`` when computing the start position.
        if i < 0:
            # Counting from the end: a position before the first character is out of range.
            out_of_range = pandas_lit(i) + str_len < pandas_lit(0)
            start = pandas_lit(i + 1) + str_len
        else:
            # Counting from the start: a position at or past the length is out of range.
            out_of_range = pandas_lit(i) >= str_len
            start = pandas_lit(i + 1)

        extracted = iff(
            out_of_range,
            pandas_lit(None),
            substring(source, start, pandas_lit(1)),
        )
        return self._replace_non_str(source, extracted)

    result_frame = self._modin_frame.apply_snowpark_function_to_data_columns(
        output_col
    )
    return SnowflakeQueryCompiler(result_frame)

def str_get_dummies(self, sep: str) -> None:
    """
    Stub for ``Series.str.get_dummies``; reports the method as not implemented.

    Parameters
    ----------
    sep : str
        Not used by this stub.
    """
    # NOTE(review): presumably this helper raises NotImplementedError — confirm in ErrorMessage.
    ErrorMessage.method_not_implemented_error("get_dummies", "Series.str")
Expand Down
41 changes: 40 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,46 @@ def rsplit():
pass

def get():
    """
    Extract element from each component at specified position or with specified key.

    Extract element from lists, tuples, dict, or strings in each element in the Series/Index.

    Parameters
    ----------
    i : int
        Position or key of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String",
    ...                (1, 2, 3),
    ...                ["a", "b", "c"],
    ...                123,
    ...                -456,
    ...                {1: "Hello", "2": "World"}])
    >>> s.str.get(1)
    0       t
    1    None
    2    None
    3    None
    4    None
    5    None
    dtype: object

    >>> s.str.get(-1)
    0       g
    1    None
    2    None
    3    None
    4    None
    5    None
    dtype: object
    """

def join():
    # NOTE(review): empty stub. Given the module path (docstrings/series_utils.py) this
    # presumably exists to carry the docstring for Series.str.join — none is written yet.
    pass
Expand Down
23 changes: 23 additions & 0 deletions tests/integ/modin/series/test_str_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,29 @@ def test_str_count(pat, flags):
)


@pytest.mark.parametrize("i", [None, -100, -2, -1, 0, 1, 2, 100])
@sql_count_checker(query_count=1)
def test_str_get(i):
    """Compare ``Series.str.get(i)`` between Snowpark pandas and native pandas.

    The parametrized positions cover ``None``, in-range values and values far
    outside the string bounds in both directions.
    """

    def get_at_position(ser):
        return ser.str.get(i=i)

    expected_input = native_pd.Series(TEST_DATA)
    actual_input = pd.Series(expected_input)
    eval_snowpark_pandas_result(actual_input, expected_input, get_at_position)


@sql_count_checker(query_count=0)
def test_str_get_neg():
    """A string ``i`` must raise NotImplementedError, with zero SQL queries expected."""
    expected_message = "Snowpark pandas method 'Series.str.get' doesn't yet support non-numeric 'i' argument"
    native_ser = native_pd.Series(TEST_DATA)
    snow_ser = pd.Series(native_ser)
    with pytest.raises(NotImplementedError, match=expected_message):
        snow_ser.str.get(i="a")


@pytest.mark.parametrize("start", [None, -100, -2, -1, 0, 1, 2, 100])
@pytest.mark.parametrize("stop", [None, -100, -2, -1, 0, 1, 2, 100])
@pytest.mark.parametrize("step", [None, -100, -2, -1, 1, 2, 100])
Expand Down
1 change: 0 additions & 1 deletion tests/unit/modin/test_series_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def test_str_cat_no_others(mock_str_register, mock_series):
(lambda s: s.str.decode("utf-8"), "decode"),
(lambda s: s.str.encode("utf-8"), "encode"),
(lambda s: s.str.rsplit("_", n=1), "rsplit"),
(lambda s: s.str.get(3), "get"),
(lambda s: s.str.join("_"), "join"),
(lambda s: s.str.pad(10), "pad"),
(lambda s: s.str.center(10), "center"),
Expand Down

0 comments on commit c26be8a

Please sign in to comment.