SNOW-1486910: Return DatetimeIndex from to_datetime and date_range

snowflakedb · Aug 9, 2024 · a88a2ab · a88a2ab
1 parent e427ab9
commit a88a2ab
Show file tree

Hide file tree

Showing 7 changed files with 166 additions and 147 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -78,6 +78,8 @@
 
 ### Behavior change
 - `Dataframe.columns` now returns native pandas Index object instead of Snowpark Index object.
+- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object for list-like and Index inputs.
+- `pd.date_range` now returns a DatetimeIndex object instead of a Series object.
 
 ## 1.20.0 (2024-07-17)
 

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1572,10 +1572,7 @@ def to_datetime(
 
     >>> pd.to_datetime([1, 2, 3], unit='D',
     ...                origin=pd.Timestamp('1960-01-01'))
-    0   1960-01-02
-    1   1960-01-03
-    2   1960-01-04
-    dtype: datetime64[ns]
+    DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
 
 
     **Non-convertible date/times**
@@ -1589,9 +1586,7 @@ def to_datetime(
     in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
 
     >>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce')
-    0   NaT
-    1   NaT
-    dtype: datetime64[ns]
+    DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
 
 
     .. _to_datetime_tz_examples:
@@ -1603,55 +1598,41 @@ def to_datetime(
     - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
 
     >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
-    0   2018-10-26 12:00:00
-    1   2018-10-26 13:00:15
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
 
     - Timezone-aware inputs *with constant time offset* are still converted to
       timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.
 
     >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
-    0   2018-10-26 12:00:00
-    1   2018-10-26 13:00:00
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
 
     - Use right format to convert to timezone-aware type (Note that when call Snowpark
       pandas API to_pandas() the timezone-aware output will always be converted to session timezone):
 
     >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z")
-    0   2018-10-26 10:00:00-07:00
-    1   2018-10-26 11:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
 
     - Timezone-aware inputs *with mixed time offsets* (for example
       issued from a timezone with daylight savings, such as Europe/Paris):
 
     >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
-    0   2020-10-25 02:00:00
-    1   2020-10-25 04:00:00
-    dtype: datetime64[ns]
+    DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
 
     >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
-    0   2020-10-24 17:00:00-07:00
-    1   2020-10-24 20:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
 
     Setting ``utc=True`` makes sure always convert to timezone-aware outputs:
 
     - Timezone-naive inputs are *localized* based on the session timezone
 
     >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
-    0   2018-10-26 12:00:00-07:00
-    1   2018-10-26 13:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
 
     - Timezone-aware inputs are *converted* to session timezone
 
     >>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'],
     ...                utc=True)
-    0   2018-10-26 10:30:00-07:00
-    1   2018-10-26 10:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
     """
     # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
     raise_if_native_pandas_objects(arg)
@@ -1668,22 +1649,14 @@ def to_datetime(
             message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
         )
     arg_is_scalar = is_scalar(arg)
-    # handle empty array, list, dict
-    if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0:
-        return arg if isinstance(arg, Series) else Series(arg)  # always return a Series
-    if not isinstance(arg, (DataFrame, Series)):
-        # turn dictionary like arg into DataFrame and list like or scalar to Series
-        if isinstance(arg, dict):
-            arg = DataFrame(arg)  # pragma: no cover
-        else:
-            name = None
-            # keep index name
-            if isinstance(arg, pd.Index):
-                name = arg.name
-            arg = Series(arg)
-            arg.name = name
-
-    series = arg._to_datetime(
+
+    if not isinstance(arg, (DataFrame, Series, pd.Index)):
+        # Turn dictionary like arg into pd.DataFrame and list-like or scalar to
+        # pd.Index.
+        arg = [arg] if arg_is_scalar else arg
+        arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)
+
+    series_or_index = arg._to_datetime(
         errors=errors,
         dayfirst=dayfirst,
         yearfirst=yearfirst,
@@ -1697,9 +1670,10 @@ def to_datetime(
     if arg_is_scalar:
         # Calling squeeze directly on Snowpark pandas Series makes an unnecessary
         # count sql call. To avoid that we convert Snowpark pandas Series to Native
-        # pandas seris first.
-        return series.to_pandas().squeeze()
-    return series
+        # pandas series first.
+        # Note: When arg_is_scalar is True 'series_or_index' is always an Index.
+        return series_or_index.to_series().to_pandas().squeeze()
+    return series_or_index
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator
@@ -2004,9 +1978,9 @@ def date_range(
     name: Hashable | None = None,
     inclusive: IntervalClosedType = "both",
     **kwargs,
-) -> Series:
+) -> pd.DatetimeIndex:
     """
-    Return a fixed frequency series.
+    Return a fixed frequency DatetimeIndex.
 
     Returns the range of equally spaced time points (where the difference between any
     two adjacent points is specified by the given frequency) such that they all
@@ -2078,109 +2052,72 @@ def date_range(
     Specify `start` and `end`, with the default daily frequency.
 
     >>> pd.date_range(start='1/1/2018', end='1/08/2018')
-    0   2018-01-01
-    1   2018-01-02
-    2   2018-01-03
-    3   2018-01-04
-    4   2018-01-05
-    5   2018-01-06
-    6   2018-01-07
-    7   2018-01-08
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
+                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
+                  dtype='datetime64[ns]', freq=None)
 
     Specify `start` and `periods`, the number of periods (days).
 
     >>> pd.date_range(start='1/1/2018', periods=8)
-    0   2018-01-01
-    1   2018-01-02
-    2   2018-01-03
-    3   2018-01-04
-    4   2018-01-05
-    5   2018-01-06
-    6   2018-01-07
-    7   2018-01-08
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
+                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
+                  dtype='datetime64[ns]', freq=None)
 
     Specify `end` and `periods`, the number of periods (days).
 
     >>> pd.date_range(end='1/1/2018', periods=8)
-    0   2017-12-25
-    1   2017-12-26
-    2   2017-12-27
-    3   2017-12-28
-    4   2017-12-29
-    5   2017-12-30
-    6   2017-12-31
-    7   2018-01-01
-    dtype: datetime64[ns]
+    DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
+                   '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
+                  dtype='datetime64[ns]', freq=None)
 
     Specify `start`, `end`, and `periods`; the frequency is generated
     automatically (linearly spaced).
 
     >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
-    0   2018-04-24 00:00:00
-    1   2018-04-25 12:00:00
-    2   2018-04-27 00:00:00
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
+                   '2018-04-27 00:00:00'],
+                  dtype='datetime64[ns]', freq=None)
+
 
     **Other Parameters**
 
     Changed the `freq` (frequency) to ``'ME'`` (month end frequency).
 
     >>> pd.date_range(start='1/1/2018', periods=5, freq='ME')
-    0   2018-01-31
-    1   2018-02-28
-    2   2018-03-31
-    3   2018-04-30
-    4   2018-05-31
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
+                   '2018-05-31'],
+                  dtype='datetime64[ns]', freq=None)
 
     Multiples are allowed
 
     >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME')
-    0   2018-01-31
-    1   2018-04-30
-    2   2018-07-31
-    3   2018-10-31
-    4   2019-01-31
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
+                   '2019-01-31'],
+                  dtype='datetime64[ns]', freq=None)
 
     `freq` can also be specified as an Offset object.
 
     >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
-    0   2018-01-31
-    1   2018-04-30
-    2   2018-07-31
-    3   2018-10-31
-    4   2019-01-31
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
+                   '2019-01-31'],
+                  dtype='datetime64[ns]', freq=None)
 
     `inclusive` controls whether to include `start` and `end` that are on the
     boundary. The default, "both", includes boundary points on either end.
 
     >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
-    0   2017-01-01
-    1   2017-01-02
-    2   2017-01-03
-    3   2017-01-04
-    dtype: datetime64[ns]
+    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
 
     Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.
 
     >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
-    0   2017-01-01
-    1   2017-01-02
-    2   2017-01-03
-    dtype: datetime64[ns]
+    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None)
 
     Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
     similarly ``inclusive='neither'`` will exclude both `start` and `end`.
 
     >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
-    0   2017-01-02
-    1   2017-01-03
-    2   2017-01-04
-    dtype: datetime64[ns]
+    DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None)
     """
     # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
 
@@ -2229,9 +2166,11 @@ def date_range(
         left_inclusive=left_inclusive,
         right_inclusive=right_inclusive,
     )
-    s = Series(query_compiler=qc)
-    s.name = name
-    return s
+    # Set date range as index column.
+    qc = qc.set_index_from_columns(qc.columns.tolist())
+    # Set index column name.
+    qc = qc.set_index_names([name])
+    return pd.DatetimeIndex(data=qc)
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5551,7 +5551,7 @@ def set_index_from_columns(
         for (
             ids
         ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
-            keys
+            keys, include_index=False
         ):
             # Error checking for missing labels is already done in frontend layer.
             index_column_snowflake_quoted_identifiers.append(ids[0])
@@ -5870,6 +5870,7 @@ def series_to_datetime(
         unit: Optional[str] = None,
         infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default,
         origin: DateTimeOrigin = "unix",
+        include_index: bool = False,
     ) -> "SnowflakeQueryCompiler":
         """
         Convert series to the datetime dtype.
@@ -5884,6 +5885,7 @@ def series_to_datetime(
             unit: to_datetime unit
             infer_datetime_format: to_datetime infer_datetime_format
             origin: to_datetime origin
+            include_index: If True, also convert index columns to datetime.
         Returns:
             SnowflakeQueryCompiler:
             QueryCompiler with a single data column converted to datetime dtype.
@@ -5896,12 +5898,16 @@ def series_to_datetime(
             to_snowflake_timestamp_format(format) if format is not None else None
         )
         id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type()
-        col_id = self._modin_frame.data_column_snowflake_quoted_identifiers[0]
-        sf_type = id_to_sf_type_map[col_id]
+        col_ids = []
+        if include_index:
+            col_ids = self._modin_frame.index_column_snowflake_quoted_identifiers
+        col_ids.extend(self._modin_frame.data_column_snowflake_quoted_identifiers)
 
-        if isinstance(sf_type, BooleanType):
-            # bool is not allowed in to_datetime (but note that bool is allowed by astype)
-            raise TypeError("dtype bool cannot be converted to datetime64[ns]")
+        for col_id in col_ids:
+            sf_type = id_to_sf_type_map[col_id]
+            if isinstance(sf_type, BooleanType):
+                # bool is not allowed in to_datetime (but note that bool is allowed by astype)
+                raise TypeError("dtype bool cannot be converted to datetime64[ns]")
 
         to_datetime_cols = {
             col_id: generate_timestamp_col(
@@ -5913,6 +5919,7 @@ def series_to_datetime(
                 unit="ns" if unit is None else unit,
                 origin=origin,
             )
+            for col_id in col_ids
         }
         return SnowflakeQueryCompiler(
             self._modin_frame.update_snowflake_quoted_identifiers_with_expressions(