Skip to content

Commit

Permalink
SNOW-1331876: Support interval in Window.range_between (#2294)
Browse files Browse the repository at this point in the history
<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.
   
Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

   Fixes SNOW-1331876

2. Fill out the following pre-review checklist:

- [x] I am adding a new automated test(s) to verify correctness of my
new code
- [ ] If this test skips Local Testing mode, I'm requesting review from
@snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
- [ ] If this is a new feature/behavior, I'm adding the Local Testing
parity changes.

3. Please describe how your code solves the related issue.

   Interval support in RANGE BTEWEEN is just GA last month.
  • Loading branch information
sfc-gh-jdu authored Sep 18, 2024
1 parent f7b9c67 commit 67e2f87
Show file tree
Hide file tree
Showing 8 changed files with 440 additions and 61 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

### Snowpark pandas API Updates

#### New Features

- Added the following new functions in `snowflake.snowpark.functions`:
- `make_interval`
- Added support for using Snowflake Interval constants with `Window.range_between()` when the order by column is TIMESTAMP or DATE type

#### Improvements

- Improved `to_pandas` to persist the original timezone offset for TIMESTAMP_TZ type.
Expand Down
1 change: 1 addition & 0 deletions docs/source/snowpark/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ Functions
lower
lpad
ltrim
make_interval
max
md5
mean
Expand Down
37 changes: 25 additions & 12 deletions src/snowflake/snowpark/_internal/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
Alias,
Cast,
UnaryExpression,
UnaryMinus,
UnresolvedAlias,
)
from snowflake.snowpark._internal.analyzer.unary_plan_node import (
Expand Down Expand Up @@ -344,14 +345,10 @@ def analyze(
return specified_window_frame_expression(
expr.frame_type.sql,
self.window_frame_boundary(
self.to_sql_try_avoid_cast(
expr.lower, df_aliased_col_name_to_real_col_name
)
expr.lower, df_aliased_col_name_to_real_col_name
),
self.window_frame_boundary(
self.to_sql_try_avoid_cast(
expr.upper, df_aliased_col_name_to_real_col_name
)
expr.upper, df_aliased_col_name_to_real_col_name
),
)
if isinstance(expr, UnspecifiedFrame):
Expand Down Expand Up @@ -722,12 +719,28 @@ def grouping_extractor(
df_aliased_col_name_to_real_col_name,
)

def window_frame_boundary(self, offset: str) -> str:
try:
num = int(offset)
return window_frame_boundary_expression(str(abs(num)), num >= 0)
except Exception:
return offset
def window_frame_boundary(
self,
boundary: Expression,
df_aliased_col_name_to_real_col_name: DefaultDict[str, Dict[str, str]],
) -> str:
# it means interval preceding
if isinstance(boundary, UnaryMinus) and isinstance(boundary.child, Interval):
return window_frame_boundary_expression(
boundary.child.sql, is_following=False
)
elif isinstance(boundary, Interval):
return window_frame_boundary_expression(boundary.sql, is_following=True)
else:
# boundary should be an integer
offset = self.to_sql_try_avoid_cast(
boundary, df_aliased_col_name_to_real_col_name
)
try:
num = int(offset)
return window_frame_boundary_expression(str(abs(num)), num >= 0)
except Exception:
return offset

def to_sql_try_avoid_cast(
self,
Expand Down
64 changes: 64 additions & 0 deletions src/snowflake/snowpark/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@
from snowflake.snowpark._internal.analyzer.expression import (
CaseWhen,
FunctionExpression,
Interval,
ListAgg,
Literal,
MultipleExpression,
Expand Down Expand Up @@ -8636,3 +8637,66 @@ def locate(expr1: str, expr2: ColumnOrName, start_pos: int = 1) -> Column:
_substr = lit(expr1)
_str = _to_col_if_str(expr2, "locate")
return builtin("charindex")(_substr, _str, lit(start_pos))


def make_interval(
years: Optional[int] = None,
quarters: Optional[int] = None,
months: Optional[int] = None,
weeks: Optional[int] = None,
days: Optional[int] = None,
hours: Optional[int] = None,
minutes: Optional[int] = None,
seconds: Optional[int] = None,
milliseconds: Optional[int] = None,
microseconds: Optional[int] = None,
nanoseconds: Optional[int] = None,
mins: Optional[int] = None,
secs: Optional[int] = None,
) -> Column:
"""
Creates an interval column with the specified years, quarters, months, weeks, days, hours,
minutes, seconds, milliseconds, microseconds, and nanoseconds. You can find more details in
`Interval constants <https://docs.snowflake.com/en/sql-reference/data-types-datetime#interval-constants>`_.
INTERVAL is not a data type (that is, you can’t define a table column to be of data type INTERVAL).
Intervals can only be used in date, time, and timestamp arithmetic. For example,
``df.select(make_interval(days=0))`` is not valid.
Example::
>>> import datetime
>>> from snowflake.snowpark.functions import to_date
>>>
>>> df = session.create_dataframe([datetime.datetime(2023, 8, 8, 1, 2, 3)], schema=["ts"])
>>> df.select(to_date(col("ts") + make_interval(days=10)).alias("next_day")).show()
--------------
|"NEXT_DAY" |
--------------
|2023-08-18 |
--------------
<BLANKLINE>
You can also find some examples to use interval constants with :meth:`~snowflake.snowpark.Window.range_between`
method.
"""
# for migration purpose
minutes = minutes or mins
seconds = seconds or secs

# create column
return Column(
Interval(
years,
quarters,
months,
weeks,
days,
hours,
minutes,
seconds,
milliseconds,
microseconds,
nanoseconds,
)
)
113 changes: 95 additions & 18 deletions src/snowflake/snowpark/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,33 @@
from collections.abc import Iterable


def _convert_boundary_to_expr(start: int, end: int) -> Tuple[Expression, Expression]:
if start == 0:
boundary_start = CurrentRow()
elif start <= Window.UNBOUNDED_PRECEDING:
boundary_start = UnboundedPreceding()
def _convert_boundary_to_expr(
start: Union[int, "snowflake.snowpark.Column"],
end: Union[int, "snowflake.snowpark.Column"],
) -> Tuple[Expression, Expression]:
if isinstance(start, int):
if start == 0:
boundary_start = CurrentRow()
elif start <= Window.UNBOUNDED_PRECEDING:
boundary_start = UnboundedPreceding()
else:
boundary_start = Literal(start)
elif isinstance(start, snowflake.snowpark.Column):
boundary_start = start._expression
else:
boundary_start = Literal(start)

if end == 0:
boundary_end = CurrentRow()
elif end >= Window.UNBOUNDED_FOLLOWING:
boundary_end = UnboundedFollowing()
raise ValueError("start must be an integer or a Column")

if isinstance(end, int):
if end == 0:
boundary_end = CurrentRow()
elif end >= Window.UNBOUNDED_FOLLOWING:
boundary_end = UnboundedFollowing()
else:
boundary_end = Literal(end)
elif isinstance(end, snowflake.snowpark.Column):
boundary_end = end._expression
else:
boundary_end = Literal(end)
raise ValueError("end must be an integer or a Column")

return boundary_start, boundary_end

Expand Down Expand Up @@ -132,9 +145,27 @@ def rows_between(start: int, end: int) -> "WindowSpec":
return Window._spec().rows_between(start, end)

@staticmethod
def range_between(start: int, end: int) -> "WindowSpec":
def range_between(
start: Union[int, "snowflake.snowpark.Column"],
end: Union[int, "snowflake.snowpark.Column"],
) -> "WindowSpec":
"""
Returns a :class:`WindowSpec` object with the range frame clause.
``start`` and ``end`` can be
- an integer representing the relative position from the current row, or
- :attr:`Window.UNBOUNDED_PRECEDING`, :attr:`Window.UNBOUNDED_FOLLOWING`
and :attr:`Window.CURRENT_ROW`, which represent unbounded preceding,
unbounded following and current row respectively, or
- a :class:`~snowflake.snowpark.column.Column` object created by
:func:`~snowflake.snowpark.functions.make_interval` to use
`Interval constants <https://docs.snowflake.com/en/sql-reference/data-types-datetime#interval-constants>`_.
Interval constants can only be used with this function when the order by column is TIMESTAMP or DATE type
See more details how to use interval constants in
`RANGE BETWEEN <https://docs.snowflake.com/sql-reference/functions-analytic#label-range-between-syntax-desc>`_
clause. However, you cannot mix the numeric values and interval constants in the same range frame clause.
Args:
start: The relative position from the current row as a boundary start (inclusive).
Expand All @@ -144,10 +175,52 @@ def range_between(start: int, end: int) -> "WindowSpec":
The frame is unbounded if this is :attr:`Window.UNBOUNDED_FOLLOWING`, or any
value greater than or equal to 9223372036854775807 (``sys.maxsize``).
Note:
You can use :attr:`Window.UNBOUNDED_PRECEDING`, :attr:`Window.UNBOUNDED_FOLLOWING`,
and :attr:`Window.CURRENT_ROW` to specify ``start`` and ``end``, instead of using
integral values directly.
Example 1
Use numeric values to specify the range frame:
>>> from snowflake.snowpark.functions import col, count, make_interval
>>>
>>> df = session.range(5)
>>> window = Window.order_by("id").range_between(-1, Window.CURRENT_ROW)
>>> df.select(col("id"), count("id").over(window).as_("count")).show()
------------------
|"ID" |"COUNT" |
------------------
|0 |1 |
|1 |2 |
|2 |2 |
|3 |2 |
|4 |2 |
------------------
<BLANKLINE>
Example 2
Use interval constants to specify the range frame:
>>> import datetime
>>> from snowflake.snowpark.types import StructType, StructField, TimestampType, TimestampTimeZone
>>>
>>> df = session.create_dataframe(
... [
... datetime.datetime(2021, 12, 21, 9, 12, 56),
... datetime.datetime(2021, 12, 21, 8, 12, 56),
... datetime.datetime(2021, 12, 21, 7, 12, 56),
... datetime.datetime(2021, 12, 21, 6, 12, 56),
... ],
... schema=StructType([StructField("a", TimestampType(TimestampTimeZone.NTZ))]),
... )
>>> window = Window.order_by(col("a").desc()).range_between(-make_interval(hours=1), make_interval(hours=1))
>>> df.select(col("a"), count("a").over(window).as_("count")).show()
---------------------------------
|"A" |"COUNT" |
---------------------------------
|2021-12-21 09:12:56 |2 |
|2021-12-21 08:12:56 |3 |
|2021-12-21 07:12:56 |3 |
|2021-12-21 06:12:56 |2 |
---------------------------------
<BLANKLINE>
"""
return Window._spec().range_between(start, end)

Expand Down Expand Up @@ -241,7 +314,11 @@ def rows_between(self, start: int, end: int) -> "WindowSpec":
SpecifiedWindowFrame(RowFrame(), boundary_start, boundary_end),
)

def range_between(self, start: int, end: int) -> "WindowSpec":
def range_between(
self,
start: Union[int, "snowflake.snowpark.Column"],
end: Union[int, "snowflake.snowpark.Column"],
) -> "WindowSpec":
"""
Returns a new :class:`WindowSpec` object with the new range frame clause.
Expand Down
Loading

0 comments on commit 67e2f87

Please sign in to comment.