Skip to content

Commit

Permalink
SNOW-1486910: Return DatetimeIndex from to_datetime and date_range
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-nkumar committed Aug 9, 2024
1 parent e427ab9 commit a88a2ab
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 147 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@

### Behavior change
- `DataFrame.columns` now returns a native pandas Index object instead of a Snowpark Index object.
- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object.
- `pd.date_range` now returns a DatetimeIndex object instead of a Series object.

## 1.20.0 (2024-07-17)

Expand Down
167 changes: 53 additions & 114 deletions src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -1572,10 +1572,7 @@ def to_datetime(
>>> pd.to_datetime([1, 2, 3], unit='D',
... origin=pd.Timestamp('1960-01-01'))
0 1960-01-02
1 1960-01-03
2 1960-01-04
dtype: datetime64[ns]
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
**Non-convertible date/times**
Expand All @@ -1589,9 +1586,7 @@ def to_datetime(
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
>>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce')
0 NaT
1 NaT
dtype: datetime64[ns]
DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
.. _to_datetime_tz_examples:
Expand All @@ -1603,55 +1598,41 @@ def to_datetime(
- Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`:
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:15
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
- Timezone-aware inputs *with constant time offset* are still converted to
timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` by default.
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
- Use the right format to convert to a timezone-aware type (note that when calling the Snowpark
pandas API to_pandas(), the timezone-aware output will always be converted to the session timezone):
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z")
0 2018-10-26 10:00:00-07:00
1 2018-10-26 11:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs *with mixed time offsets* (for example
issued from a timezone with daylight savings, such as Europe/Paris):
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
0 2020-10-25 02:00:00
1 2020-10-25 04:00:00
dtype: datetime64[ns]
DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
0 2020-10-24 17:00:00-07:00
1 2020-10-24 20:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
Setting ``utc=True`` ensures that the output is always timezone-aware:
- Timezone-naive inputs are *localized* based on the session timezone
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
0 2018-10-26 12:00:00-07:00
1 2018-10-26 13:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs are *converted* to session timezone
>>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'],
... utc=True)
0 2018-10-26 10:30:00-07:00
1 2018-10-26 10:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
raise_if_native_pandas_objects(arg)
Expand All @@ -1668,22 +1649,14 @@ def to_datetime(
message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
)
arg_is_scalar = is_scalar(arg)
# handle empty array, list, dict
if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0:
return arg if isinstance(arg, Series) else Series(arg) # always return a Series
if not isinstance(arg, (DataFrame, Series)):
# turn dictionary like arg into DataFrame and list like or scalar to Series
if isinstance(arg, dict):
arg = DataFrame(arg) # pragma: no cover
else:
name = None
# keep index name
if isinstance(arg, pd.Index):
name = arg.name
arg = Series(arg)
arg.name = name

series = arg._to_datetime(

if not isinstance(arg, (DataFrame, Series, pd.Index)):
# Turn dictionary like arg into pd.DataFrame and list-like or scalar to
# pd.Index.
arg = [arg] if arg_is_scalar else arg
arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)

series_or_index = arg._to_datetime(
errors=errors,
dayfirst=dayfirst,
yearfirst=yearfirst,
Expand All @@ -1697,9 +1670,10 @@ def to_datetime(
if arg_is_scalar:
# Calling squeeze directly on Snowpark pandas Series makes an unnecessary
# count sql call. To avoid that we convert Snowpark pandas Series to Native
# pandas seris first.
return series.to_pandas().squeeze()
return series
# pandas series first.
# Note: When arg_is_scalar is True 'series_or_index' is always an Index.
return series_or_index.to_series().to_pandas().squeeze()
return series_or_index


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down Expand Up @@ -2004,9 +1978,9 @@ def date_range(
name: Hashable | None = None,
inclusive: IntervalClosedType = "both",
**kwargs,
) -> Series:
) -> pd.DatetimeIndex:
"""
Return a fixed frequency series.
Return a fixed frequency DatetimeIndex.
Returns the range of equally spaced time points (where the difference between any
two adjacent points is specified by the given frequency) such that they all
Expand Down Expand Up @@ -2078,109 +2052,72 @@ def date_range(
Specify `start` and `end`, with the default daily frequency.
>>> pd.date_range(start='1/1/2018', end='1/08/2018')
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `start` and `periods`, the number of periods (days).
>>> pd.date_range(start='1/1/2018', periods=8)
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `end` and `periods`, the number of periods (days).
>>> pd.date_range(end='1/1/2018', periods=8)
0 2017-12-25
1 2017-12-26
2 2017-12-27
3 2017-12-28
4 2017-12-29
5 2017-12-30
6 2017-12-31
7 2018-01-01
dtype: datetime64[ns]
DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
'2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
dtype='datetime64[ns]', freq=None)
Specify `start`, `end`, and `periods`; the frequency is generated
automatically (linearly spaced).
>>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
0 2018-04-24 00:00:00
1 2018-04-25 12:00:00
2 2018-04-27 00:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
'2018-04-27 00:00:00'],
dtype='datetime64[ns]', freq=None)
**Other Parameters**
Changed the `freq` (frequency) to ``'ME'`` (month end frequency).
>>> pd.date_range(start='1/1/2018', periods=5, freq='ME')
0 2018-01-31
1 2018-02-28
2 2018-03-31
3 2018-04-30
4 2018-05-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
'2018-05-31'],
dtype='datetime64[ns]', freq=None)
Multiples are allowed
>>> pd.date_range(start='1/1/2018', periods=5, freq='3ME')
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`freq` can also be specified as an Offset object.
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`inclusive` controls whether to include `start` and `end` that are on the
boundary. The default, "both", includes boundary points on either end.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
0 2017-01-01
1 2017-01-02
2 2017-01-03
3 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
0 2017-01-01
1 2017-01-02
2 2017-01-03
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
similarly ``inclusive='neither'`` will exclude both `start` and `end`.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
0 2017-01-02
1 2017-01-03
2 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py

Expand Down Expand Up @@ -2229,9 +2166,11 @@ def date_range(
left_inclusive=left_inclusive,
right_inclusive=right_inclusive,
)
s = Series(query_compiler=qc)
s.name = name
return s
# Set date range as index column.
qc = qc.set_index_from_columns(qc.columns.tolist())
# Set index column name.
qc = qc.set_index_names([name])
return pd.DatetimeIndex(data=qc)


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5551,7 +5551,7 @@ def set_index_from_columns(
for (
ids
) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
keys
keys, include_index=False
):
# Error checking for missing labels is already done in frontend layer.
index_column_snowflake_quoted_identifiers.append(ids[0])
Expand Down Expand Up @@ -5870,6 +5870,7 @@ def series_to_datetime(
unit: Optional[str] = None,
infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default,
origin: DateTimeOrigin = "unix",
include_index: bool = False,
) -> "SnowflakeQueryCompiler":
"""
Convert series to the datetime dtype.
Expand All @@ -5884,6 +5885,7 @@ def series_to_datetime(
unit: to_datetime unit
infer_datetime_format: to_datetime infer_datetime_format
origin: to_datetime origin
include_index: If True, also convert index columns to datetime.
Returns:
SnowflakeQueryCompiler:
QueryCompiler with a single data column converted to datetime dtype.
Expand All @@ -5896,12 +5898,16 @@ def series_to_datetime(
to_snowflake_timestamp_format(format) if format is not None else None
)
id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type()
col_id = self._modin_frame.data_column_snowflake_quoted_identifiers[0]
sf_type = id_to_sf_type_map[col_id]
col_ids = []
if include_index:
col_ids = self._modin_frame.index_column_snowflake_quoted_identifiers
col_ids.extend(self._modin_frame.data_column_snowflake_quoted_identifiers)

if isinstance(sf_type, BooleanType):
# bool is not allowed in to_datetime (but note that bool is allowed by astype)
raise TypeError("dtype bool cannot be converted to datetime64[ns]")
for col_id in col_ids:
sf_type = id_to_sf_type_map[col_id]
if isinstance(sf_type, BooleanType):
# bool is not allowed in to_datetime (but note that bool is allowed by astype)
raise TypeError("dtype bool cannot be converted to datetime64[ns]")

to_datetime_cols = {
col_id: generate_timestamp_col(
Expand All @@ -5913,6 +5919,7 @@ def series_to_datetime(
unit="ns" if unit is None else unit,
origin=origin,
)
for col_id in col_ids
}
return SnowflakeQueryCompiler(
self._modin_frame.update_snowflake_quoted_identifiers_with_expressions(
Expand Down
Loading

0 comments on commit a88a2ab

Please sign in to comment.