Skip to content

Commit

Permalink
SNOW-1486910: Return DatetimeIndex from to_datetime and date_range (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-nkumar authored Aug 12, 2024
1 parent 131e820 commit 619feae
Show file tree
Hide file tree
Showing 22 changed files with 245 additions and 278 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
### Behavior change
- `DataFrame.columns` now returns native pandas Index object instead of Snowpark Index object.
- Refactor and introduce `query_compiler` argument in `Index` constructor to create `Index` from query compiler.
- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object.
- `pd.date_range` now returns a DatetimeIndex object instead of a Series object.

## 1.20.0 (2024-07-17)

Expand Down
172 changes: 55 additions & 117 deletions src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ def to_datetime(
infer_datetime_format: lib.NoDefault | bool = lib.no_default,
origin: Any = "unix",
cache: bool = True,
) -> Series | DatetimeScalar | NaTType | None:
) -> pd.DatetimeIndex | Series | DatetimeScalar | NaTType | None:
"""
Convert argument to datetime.
Expand Down Expand Up @@ -1459,8 +1459,7 @@ def to_datetime(
parsing):
- scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
- array-like: :class:`~snowflake.snowpark.modin.pandas.Series` with :class:`datetime64` dtype containing
:class:`datetime.datetime` (or
- array-like: :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` (or
:class: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`object` dtype containing
:class:`datetime.datetime`)
- Series: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`datetime64` dtype (or
Expand Down Expand Up @@ -1572,10 +1571,7 @@ def to_datetime(
>>> pd.to_datetime([1, 2, 3], unit='D',
... origin=pd.Timestamp('1960-01-01'))
0 1960-01-02
1 1960-01-03
2 1960-01-04
dtype: datetime64[ns]
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
**Non-convertible date/times**
Expand All @@ -1589,9 +1585,7 @@ def to_datetime(
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
>>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce')
0 NaT
1 NaT
dtype: datetime64[ns]
DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
.. _to_datetime_tz_examples:
Expand All @@ -1603,55 +1597,41 @@ def to_datetime(
- Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:15
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
- Timezone-aware inputs *with constant time offset* are still converted to
timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
- Use right format to convert to timezone-aware type (Note that when call Snowpark
pandas API to_pandas() the timezone-aware output will always be converted to session timezone):
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z")
0 2018-10-26 10:00:00-07:00
1 2018-10-26 11:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs *with mixed time offsets* (for example
issued from a timezone with daylight savings, such as Europe/Paris):
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
0 2020-10-25 02:00:00
1 2020-10-25 04:00:00
dtype: datetime64[ns]
DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
0 2020-10-24 17:00:00-07:00
1 2020-10-24 20:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
Setting ``utc=True`` makes sure always convert to timezone-aware outputs:
- Timezone-naive inputs are *localized* based on the session timezone
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
0 2018-10-26 12:00:00-07:00
1 2018-10-26 13:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs are *converted* to session timezone
>>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'],
... utc=True)
0 2018-10-26 10:30:00-07:00
1 2018-10-26 10:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
raise_if_native_pandas_objects(arg)
Expand All @@ -1668,22 +1648,14 @@ def to_datetime(
message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
)
arg_is_scalar = is_scalar(arg)
# handle empty array, list, dict
if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0:
return arg if isinstance(arg, Series) else Series(arg) # always return a Series
if not isinstance(arg, (DataFrame, Series)):
# turn dictionary like arg into DataFrame and list like or scalar to Series
if isinstance(arg, dict):
arg = DataFrame(arg) # pragma: no cover
else:
name = None
# keep index name
if isinstance(arg, pd.Index):
name = arg.name
arg = Series(arg)
arg.name = name

series = arg._to_datetime(

if not isinstance(arg, (DataFrame, Series, pd.Index)):
# Turn dictionary like arg into pd.DataFrame and list-like or scalar to
# pd.Index.
arg = [arg] if arg_is_scalar else arg
arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)

series_or_index = arg._to_datetime(
errors=errors,
dayfirst=dayfirst,
yearfirst=yearfirst,
Expand All @@ -1697,9 +1669,10 @@ def to_datetime(
if arg_is_scalar:
# Calling squeeze directly on Snowpark pandas Series makes an unnecessary
# count sql call. To avoid that we convert Snowpark pandas Series to Native
# pandas series first.
return series.to_pandas().squeeze()
return series
# pandas series first.
# Note: When arg_is_scalar is True 'series_or_index' is always an Index.
return series_or_index.to_series().to_pandas().squeeze()
return series_or_index


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down Expand Up @@ -2004,9 +1977,9 @@ def date_range(
name: Hashable | None = None,
inclusive: IntervalClosedType = "both",
**kwargs,
) -> Series:
) -> pd.DatetimeIndex:
"""
Return a fixed frequency series.
Return a fixed frequency DatetimeIndex.
Returns the range of equally spaced time points (where the difference between any
two adjacent points is specified by the given frequency) such that they all
Expand Down Expand Up @@ -2078,109 +2051,72 @@ def date_range(
Specify `start` and `end`, with the default daily frequency.
>>> pd.date_range(start='1/1/2018', end='1/08/2018')
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `start` and `periods`, the number of periods (days).
>>> pd.date_range(start='1/1/2018', periods=8)
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `end` and `periods`, the number of periods (days).
>>> pd.date_range(end='1/1/2018', periods=8)
0 2017-12-25
1 2017-12-26
2 2017-12-27
3 2017-12-28
4 2017-12-29
5 2017-12-30
6 2017-12-31
7 2018-01-01
dtype: datetime64[ns]
DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
'2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
dtype='datetime64[ns]', freq=None)
Specify `start`, `end`, and `periods`; the frequency is generated
automatically (linearly spaced).
>>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
0 2018-04-24 00:00:00
1 2018-04-25 12:00:00
2 2018-04-27 00:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
'2018-04-27 00:00:00'],
dtype='datetime64[ns]', freq=None)
**Other Parameters**
Changed the `freq` (frequency) to ``'ME'`` (month end frequency).
>>> pd.date_range(start='1/1/2018', periods=5, freq='ME')
0 2018-01-31
1 2018-02-28
2 2018-03-31
3 2018-04-30
4 2018-05-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
'2018-05-31'],
dtype='datetime64[ns]', freq=None)
Multiples are allowed
>>> pd.date_range(start='1/1/2018', periods=5, freq='3ME')
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`freq` can also be specified as an Offset object.
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`inclusive` controls whether to include `start` and `end` that are on the
boundary. The default, "both", includes boundary points on either end.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
0 2017-01-01
1 2017-01-02
2 2017-01-03
3 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
0 2017-01-01
1 2017-01-02
2 2017-01-03
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
similarly ``inclusive='neither'`` will exclude both `start` and `end`.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
0 2017-01-02
1 2017-01-03
2 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py

Expand Down Expand Up @@ -2229,9 +2165,11 @@ def date_range(
left_inclusive=left_inclusive,
right_inclusive=right_inclusive,
)
s = Series(query_compiler=qc)
s.name = name
return s
# Set date range as index column.
qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False)
# Set index column name.
qc = qc.set_index_names([name])
return pd.DatetimeIndex(query_compiler=qc)


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down
6 changes: 5 additions & 1 deletion src/snowflake/snowpark/modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ def __init__(
# modified:
# Engine.subscribe(_update_engine)

if isinstance(data, type(self)):
# Convert lazy index to Series without pulling the data to client.
if isinstance(data, pd.Index):
query_compiler = data.to_series(index=index, name=name)._query_compiler
query_compiler = query_compiler.reset_index(drop=True)
elif isinstance(data, type(self)):
query_compiler = data._query_compiler.copy()
if index is not None:
if any(i not in data.index for i in index):
Expand Down
Loading

0 comments on commit 619feae

Please sign in to comment.