Skip to content

Commit

Permalink
SNOW-1486910: Return DatetimeIndex from to_datetime and date_range (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-nkumar authored Aug 12, 2024
1 parent 131e820 commit 619feae
Show file tree
Hide file tree
Showing 22 changed files with 245 additions and 278 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
### Behavior change
- `DataFrame.columns` now returns native pandas Index object instead of Snowpark Index object.
- Refactor and introduce `query_compiler` argument in `Index` constructor to create `Index` from query compiler.
- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object.
- `pd.date_range` now returns a DatetimeIndex object instead of a Series object.

## 1.20.0 (2024-07-17)

Expand Down
172 changes: 55 additions & 117 deletions src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ def to_datetime(
infer_datetime_format: lib.NoDefault | bool = lib.no_default,
origin: Any = "unix",
cache: bool = True,
) -> Series | DatetimeScalar | NaTType | None:
) -> pd.DatetimeIndex | Series | DatetimeScalar | NaTType | None:
"""
Convert argument to datetime.
Expand Down Expand Up @@ -1459,8 +1459,7 @@ def to_datetime(
parsing):
- scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
- array-like: :class:`~snowflake.snowpark.modin.pandas.Series` with :class:`datetime64` dtype containing
:class:`datetime.datetime` (or
- array-like: :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` (or
:class: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`object` dtype containing
:class:`datetime.datetime`)
- Series: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`datetime64` dtype (or
Expand Down Expand Up @@ -1572,10 +1571,7 @@ def to_datetime(
>>> pd.to_datetime([1, 2, 3], unit='D',
... origin=pd.Timestamp('1960-01-01'))
0 1960-01-02
1 1960-01-03
2 1960-01-04
dtype: datetime64[ns]
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
**Non-convertible date/times**
Expand All @@ -1589,9 +1585,7 @@ def to_datetime(
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
>>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce')
0 NaT
1 NaT
dtype: datetime64[ns]
DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
.. _to_datetime_tz_examples:
Expand All @@ -1603,55 +1597,41 @@ def to_datetime(
- Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:15
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
- Timezone-aware inputs *with constant time offset* are still converted to
timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
0 2018-10-26 12:00:00
1 2018-10-26 13:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
- Use right format to convert to timezone-aware type (Note that when call Snowpark
pandas API to_pandas() the timezone-aware output will always be converted to session timezone):
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z")
0 2018-10-26 10:00:00-07:00
1 2018-10-26 11:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs *with mixed time offsets* (for example
issued from a timezone with daylight savings, such as Europe/Paris):
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
0 2020-10-25 02:00:00
1 2020-10-25 04:00:00
dtype: datetime64[ns]
DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
0 2020-10-24 17:00:00-07:00
1 2020-10-24 20:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
Setting ``utc=True`` makes sure always convert to timezone-aware outputs:
- Timezone-naive inputs are *localized* based on the session timezone
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
0 2018-10-26 12:00:00-07:00
1 2018-10-26 13:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs are *converted* to session timezone
>>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'],
... utc=True)
0 2018-10-26 10:30:00-07:00
1 2018-10-26 10:00:00-07:00
dtype: datetime64[ns, America/Los_Angeles]
DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
raise_if_native_pandas_objects(arg)
Expand All @@ -1668,22 +1648,14 @@ def to_datetime(
message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
)
arg_is_scalar = is_scalar(arg)
# handle empty array, list, dict
if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0:
return arg if isinstance(arg, Series) else Series(arg) # always return a Series
if not isinstance(arg, (DataFrame, Series)):
# turn dictionary like arg into DataFrame and list like or scalar to Series
if isinstance(arg, dict):
arg = DataFrame(arg) # pragma: no cover
else:
name = None
# keep index name
if isinstance(arg, pd.Index):
name = arg.name
arg = Series(arg)
arg.name = name

series = arg._to_datetime(

if not isinstance(arg, (DataFrame, Series, pd.Index)):
# Turn dictionary like arg into pd.DataFrame and list-like or scalar to
# pd.Index.
arg = [arg] if arg_is_scalar else arg
arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)

series_or_index = arg._to_datetime(
errors=errors,
dayfirst=dayfirst,
yearfirst=yearfirst,
Expand All @@ -1697,9 +1669,10 @@ def to_datetime(
if arg_is_scalar:
# Calling squeeze directly on Snowpark pandas Series makes an unnecessary
# count sql call. To avoid that we convert Snowpark pandas Series to Native
# pandas series first.
return series.to_pandas().squeeze()
return series
# pandas series first.
# Note: When arg_is_scalar is True 'series_or_index' is always an Index.
return series_or_index.to_series().to_pandas().squeeze()
return series_or_index


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down Expand Up @@ -2004,9 +1977,9 @@ def date_range(
name: Hashable | None = None,
inclusive: IntervalClosedType = "both",
**kwargs,
) -> Series:
) -> pd.DatetimeIndex:
"""
Return a fixed frequency series.
Return a fixed frequency DatetimeIndex.
Returns the range of equally spaced time points (where the difference between any
two adjacent points is specified by the given frequency) such that they all
Expand Down Expand Up @@ -2078,109 +2051,72 @@ def date_range(
Specify `start` and `end`, with the default daily frequency.
>>> pd.date_range(start='1/1/2018', end='1/08/2018')
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `start` and `periods`, the number of periods (days).
>>> pd.date_range(start='1/1/2018', periods=8)
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05
5 2018-01-06
6 2018-01-07
7 2018-01-08
dtype: datetime64[ns]
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
dtype='datetime64[ns]', freq=None)
Specify `end` and `periods`, the number of periods (days).
>>> pd.date_range(end='1/1/2018', periods=8)
0 2017-12-25
1 2017-12-26
2 2017-12-27
3 2017-12-28
4 2017-12-29
5 2017-12-30
6 2017-12-31
7 2018-01-01
dtype: datetime64[ns]
DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
'2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
dtype='datetime64[ns]', freq=None)
Specify `start`, `end`, and `periods`; the frequency is generated
automatically (linearly spaced).
>>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
0 2018-04-24 00:00:00
1 2018-04-25 12:00:00
2 2018-04-27 00:00:00
dtype: datetime64[ns]
DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
'2018-04-27 00:00:00'],
dtype='datetime64[ns]', freq=None)
**Other Parameters**
Changed the `freq` (frequency) to ``'ME'`` (month end frequency).
>>> pd.date_range(start='1/1/2018', periods=5, freq='ME')
0 2018-01-31
1 2018-02-28
2 2018-03-31
3 2018-04-30
4 2018-05-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
'2018-05-31'],
dtype='datetime64[ns]', freq=None)
Multiples are allowed
>>> pd.date_range(start='1/1/2018', periods=5, freq='3ME')
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`freq` can also be specified as an Offset object.
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
0 2018-01-31
1 2018-04-30
2 2018-07-31
3 2018-10-31
4 2019-01-31
dtype: datetime64[ns]
DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
'2019-01-31'],
dtype='datetime64[ns]', freq=None)
`inclusive` controls whether to include `start` and `end` that are on the
boundary. The default, "both", includes boundary points on either end.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
0 2017-01-01
1 2017-01-02
2 2017-01-03
3 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
0 2017-01-01
1 2017-01-02
2 2017-01-03
dtype: datetime64[ns]
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)
Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
similarly ``inclusive='neither'`` will exclude both `start` and `end`.
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
0 2017-01-02
1 2017-01-03
2 2017-01-04
dtype: datetime64[ns]
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py

Expand Down Expand Up @@ -2229,9 +2165,11 @@ def date_range(
left_inclusive=left_inclusive,
right_inclusive=right_inclusive,
)
s = Series(query_compiler=qc)
s.name = name
return s
# Set date range as index column.
qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False)
# Set index column name.
qc = qc.set_index_names([name])
return pd.DatetimeIndex(query_compiler=qc)


@snowpark_pandas_telemetry_standalone_function_decorator
Expand Down
6 changes: 5 additions & 1 deletion src/snowflake/snowpark/modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ def __init__(
# modified:
# Engine.subscribe(_update_engine)

if isinstance(data, type(self)):
# Convert lazy index to Series without pulling the data to client.
if isinstance(data, pd.Index):
query_compiler = data.to_series(index=index, name=name)._query_compiler
query_compiler = query_compiler.reset_index(drop=True)
elif isinstance(data, type(self)):
query_compiler = data._query_compiler.copy()
if index is not None:
if any(i not in data.index for i in index):
Expand Down
Loading

0 comments on commit 619feae

Please sign in to comment.