SNOW-1855330, SNOW-1856158: Add support for DataFrame.from_dict, DataFrame.from_records
snowflakedb · Dec 13, 2024 · f77f93c · f77f93c
1 parent 3a66c84
commit f77f93c
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@
 - Added support for `Series.str.ljust` and `Series.str.rjust`.
 - Added support for `Series.str.center`.
 - Added support for `Series.str.pad`.
+- Added support for `DataFrame.from_dict` and `DataFrame.from_records`.
 
 
 ## 1.26.0 (2024-12-05)

diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst
@@ -193,9 +193,9 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``floordiv``                | P                               | ``level``                        |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``from_dict``               | N                               |                                  |                                                    |
+| ``from_dict``               | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``from_records``            | N                               |                                  |                                                    |
+| ``from_records``            | P                               |                                  | ``N`` if parameter ``data`` is set to a DataFrame  |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``ge``                      | P                               | ``level``                        |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -1686,12 +1686,151 @@ def floordiv():
     @classmethod
     def from_dict():
         """
-        Construct ``DataFrame`` from dict of array-like or dicts.
+        Construct DataFrame from dict of array-like or dicts.
+
+        Creates DataFrame object from dictionary by columns or by index allowing dtype specification.
+
+        Parameters
+        ----------
+        data : dict
+            Of the form {field : array-like} or {field : dict}.
+        orient : {'columns', 'index', 'tight'}, default 'columns'
+            The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. If 'tight', assume a dict with keys ['index', 'columns', 'data', 'index_names', 'column_names'].
+
+            .. versionadded:: 1.4.0
+               'tight' as an allowed value for the ``orient`` argument.
+
+        dtype : dtype, default None
+            Data type to force after DataFrame construction, otherwise infer.
+        columns : list, default None
+            Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'`` or ``orient='tight'``.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_records
+            DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame.
+        DataFrame
+            DataFrame object creation using constructor.
+        DataFrame.to_dict
+            Convert the DataFrame to a dictionary.
+
+        Examples
+        --------
+
+        By default the keys of the dict become the DataFrame columns:
+
+        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
+        >>> pd.DataFrame.from_dict(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Specify orient='index' to create the DataFrame using dictionary keys as rows:
+
+        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+        >>> pd.DataFrame.from_dict(data, orient='index')
+               0  1  2  3
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        When using the 'index' orientation, the column names can be specified manually:
+
+        >>> pd.DataFrame.from_dict(data, orient='index',
+        ...                        columns=['A', 'B', 'C', 'D'])
+               A  B  C  D
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        Specify orient='tight' to create the DataFrame using a 'tight' format:
+
+        >>> data = {'index': [('a', 'b'), ('a', 'c')],
+        ...         'columns': [('x', 1), ('y', 2)],
+        ...         'data': [[1, 3], [2, 4]],
+        ...         'index_names': ['n1', 'n2'],
+        ...         'column_names': ['z1', 'z2']}
+        >>> pd.DataFrame.from_dict(data, orient='tight') # doctest: +NORMALIZE_WHITESPACE
+        z1     x  y
+        z2     1  2
+        n1 n2
+        a  b   1  3
+           c   2  4
         """
 
     def from_records():
         """
-        Convert structured or record ndarray to ``DataFrame``.
+        Convert structured or record ndarray to DataFrame.
+
+        Creates a DataFrame object from a structured ndarray, sequence of tuples or dicts, or DataFrame.
+
+        Parameters
+        ----------
+        data : structured ndarray, sequence of tuples or dicts, or DataFrame
+            Structured input data.
+
+            .. deprecated:: 2.1.0
+                Passing a DataFrame is deprecated.
+
+            Snowpark pandas does not yet support passing a Snowpark pandas DataFrame here and raises ``NotImplementedError``.
+
+        index : str, list of fields, array-like
+            Field of array to use as the index, alternately a specific set of input labels to use.
+        exclude : sequence, default None
+            Columns or fields to exclude.
+        columns : sequence, default None
+            Column names to use. If the passed data do not have names associated with them, this argument provides names for the columns. Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA columns).
+        coerce_float : bool, default False
+            Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets.
+        nrows : int, default None
+            Number of rows to read if data is an iterator.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_dict
+            DataFrame from dict of array-like or dicts.
+        DataFrame
+            DataFrame object creation using constructor.
+
+        Examples
+        --------
+        Data can be provided as a structured ndarray:
+
+        >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
+        ...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
+        >>> pd.DataFrame.from_records(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Data can be provided as a list of dicts:
+
+        >>> data = [{'col_1': 3, 'col_2': 'a'},
+        ...         {'col_1': 2, 'col_2': 'b'},
+        ...         {'col_1': 1, 'col_2': 'c'},
+        ...         {'col_1': 0, 'col_2': 'd'}]
+        >>> pd.DataFrame.from_records(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Data can be provided as a list of tuples with corresponding columns:
+
+        >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
+        >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
         """
 
     def ge():

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py
@@ -406,17 +406,27 @@ def __rdivmod__(self, other):
 # The from_dict and from_records accessors are class methods and cannot be overridden via the
 # extensions module, as they need to be foisted onto the namespace directly because they are not
 # routed through getattr. To this end, we manually set DataFrame.from_dict to our new method.
-@dataframe_not_implemented()
+@classmethod
 def from_dict(
     cls, data, orient="columns", dtype=None, columns=None
 ):  # pragma: no cover # noqa: PR01, RT01, D200
-    pass  # pragma: no cover
+    """
+    Construct ``DataFrame`` from dict of array-like or dicts.
+    """
+    # Construction is delegated to ``native_pd.DataFrame.from_dict`` with the
+    # arguments forwarded unchanged; the result is then wrapped in ``DataFrame``.
+    # ``cls`` is not consulted, so the return value is always a ``DataFrame``.
+    return DataFrame(
+        native_pd.DataFrame.from_dict(
+            data=data,
+            orient=orient,
+            dtype=dtype,
+            columns=columns,
+        )
+    )
 
 
 DataFrame.from_dict = from_dict
 
 
-@dataframe_not_implemented()
+@classmethod
 def from_records(
     cls,
     data,
@@ -426,7 +436,23 @@ def from_records(
     coerce_float=False,
     nrows=None,
 ):  # pragma: no cover # noqa: PR01, RT01, D200
-    pass  # pragma: no cover
+    """
+    Convert structured or record ndarray to ``DataFrame``.
+    """
+    if isinstance(data, DataFrame):
+        raise NotImplementedError(
+            "Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'"
+        )
+    return DataFrame(
+        native_pd.DataFrame.from_records(
+            data=data,
+            index=index,
+            exclude=exclude,
+            columns=columns,
+            coerce_float=coerce_float,
+            nrows=nrows,
+        )
+    )
 
 
 DataFrame.from_records = from_records

diff --git a/tests/integ/modin/frame/test_from_dict.py b/tests/integ/modin/frame/test_from_dict.py
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+import modin.pandas as pd
+import pandas as native_pd
+
+from tests.integ.modin.utils import assert_frame_equal
+from tests.integ.utils.sql_counter import sql_count_checker
+
+
+@sql_count_checker(query_count=1)
+def test_from_dict_basic():
+    """Default orientation: the dict keys become the column labels."""
+    columns_by_name = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+    snow_df = pd.DataFrame.from_dict(columns_by_name)
+    expected_df = native_pd.DataFrame.from_dict(columns_by_name)
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=1)
+def test_from_dict_orient_index():
+    """With ``orient="index"`` the dict keys become the row labels."""
+    rows_by_label = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
+    snow_df = pd.DataFrame.from_dict(rows_by_label, orient="index")
+    expected_df = native_pd.DataFrame.from_dict(rows_by_label, orient="index")
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=1)
+def test_from_dict_orient_index_columns():
+    """With ``orient="index"`` explicit column labels can be supplied."""
+    rows_by_label = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
+    column_labels = ["A", "B", "C", "D"]
+
+    snow_df = pd.DataFrame.from_dict(
+        rows_by_label, orient="index", columns=column_labels
+    )
+    expected_df = native_pd.DataFrame.from_dict(
+        rows_by_label, orient="index", columns=column_labels
+    )
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=1)
+def test_from_dict_orient_index_tight():
+    """Exercise ``orient="tight"`` (despite "index" in the test name)."""
+    tight_payload = dict(
+        index=[("a", "b"), ("a", "c")],
+        columns=[("x", 1), ("y", 2)],
+        data=[[1, 3], [2, 4]],
+        index_names=["n1", "n2"],
+        column_names=["z1", "z2"],
+    )
+    snow_df = pd.DataFrame.from_dict(tight_payload, orient="tight")
+    expected_df = native_pd.DataFrame.from_dict(tight_payload, orient="tight")
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
diff --git a/tests/integ/modin/frame/test_from_records.py b/tests/integ/modin/frame/test_from_records.py
@@ -0,0 +1,61 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+import modin.pandas as pd
+import pandas as native_pd
+import numpy as np
+import pytest
+
+from tests.integ.modin.utils import assert_frame_equal
+from tests.integ.utils.sql_counter import sql_count_checker
+
+
+@sql_count_checker(query_count=1)
+def test_from_records_structured_ndarray():
+    """Records supplied as a NumPy structured array with named fields."""
+    record_dtype = [("col_1", "i4"), ("col_2", "U1")]
+    records = np.array(
+        [(3, "a"), (2, "b"), (1, "c"), (0, "d")],
+        dtype=record_dtype,
+    )
+    snow_df = pd.DataFrame.from_records(records)
+    expected_df = native_pd.DataFrame.from_records(records)
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=1)
+def test_from_records_list_of_dicts():
+    """Records supplied as a list of per-row dicts."""
+    records = [
+        {"col_1": number, "col_2": letter}
+        for number, letter in [(3, "a"), (2, "b"), (1, "c"), (0, "d")]
+    ]
+    snow_df = pd.DataFrame.from_records(records)
+    expected_df = native_pd.DataFrame.from_records(records)
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=1)
+def test_from_records_list_of_records():
+    """Records supplied as a list of tuples, without explicit column names."""
+    records = [(3, "a"), (2, "b"), (1, "c"), (0, "d")]
+    snow_df = pd.DataFrame.from_records(records)
+    expected_df = native_pd.DataFrame.from_records(records)
+
+    assert_frame_equal(snow_df, expected_df, check_dtype=False)
+
+
+@sql_count_checker(query_count=0)
+def test_from_records_neg():
+    """Passing a Snowpark pandas DataFrame as ``data`` must raise NotImplementedError."""
+    data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'",
+    ):
+        # Plain call statement: the original trailing comma wrapped the call in a
+        # discarded one-element tuple.
+        pd.DataFrame.from_records(data)
diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py
@@ -71,8 +71,6 @@ def test_unsupported_general(general_method, kwargs):
         ["combine", {"other": "", "func": ""}],
         ["combine_first", {"other": ""}],
         ["filter", {}],
-        ["from_dict", {"data": ""}],
-        ["from_records", {"data": ""}],
         ["hist", {}],
         ["infer_objects", {}],
         ["interpolate", {}],