Skip to content

Commit

Permalink
SNOW-1855330, SNOW-1856158: Add support for DataFrame.from_dict, Data…
Browse files Browse the repository at this point in the history
…Frame.from_records
  • Loading branch information
sfc-gh-helmeleegy committed Dec 13, 2024
1 parent 3a66c84 commit f77f93c
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- Added support for `Series.str.ljust` and `Series.str.rjust`.
- Added support for `Series.str.center`.
- Added support for `Series.str.pad`.
- Added support for `DataFrame.from_dict` and `DataFrame.from_records`.


## 1.26.0 (2024-12-05)
Expand Down
4 changes: 2 additions & 2 deletions docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``floordiv`` | P | ``level`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``from_dict`` | N | | |
| ``from_dict`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``from_records`` | N | | |
| ``from_records`` | P | | ``N`` if parameter ``data`` is set to a DataFrame |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``ge`` | P | ``level`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
143 changes: 141 additions & 2 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1686,12 +1686,151 @@ def floordiv():
@classmethod
def from_dict():
    """
    Construct DataFrame from dict of array-like or dicts.

    Creates DataFrame object from dictionary by columns or by index allowing dtype specification.

    Parameters
    ----------
    data : dict
        Of the form {field : array-like} or {field : dict}.
    orient : {‘columns’, ‘index’, ‘tight’}, default ‘columns’
        The “orientation” of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass ‘columns’ (default). Otherwise if the keys should be rows, pass ‘index’. If ‘tight’, assume a dict with keys [‘index’, ‘columns’, ‘data’, ‘index_names’, ‘column_names’].

        Added in version 1.4.0: ‘tight’ as an allowed value for the orient argument
    dtype : dtype, default None
        Data type to force after DataFrame construction, otherwise infer.
    columns : list, default None
        Column labels to use when orient='index'. Raises a ValueError if used with orient='columns' or orient='tight'.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.from_records
        DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame.
    DataFrame
        DataFrame object creation using constructor.
    DataFrame.to_dict
        Convert the DataFrame to a dictionary.

    Examples
    --------
    By default the keys of the dict become the DataFrame columns:

    >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
    >>> pd.DataFrame.from_dict(data)
       col_1 col_2
    0      3     a
    1      2     b
    2      1     c
    3      0     d

    Specify orient='index' to create the DataFrame using dictionary keys as rows:

    >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
    >>> pd.DataFrame.from_dict(data, orient='index')
           0  1  2  3
    row_1  3  2  1  0
    row_2  a  b  c  d

    When using the ‘index’ orientation, the column names can be specified manually:

    >>> pd.DataFrame.from_dict(data, orient='index',
    ...                        columns=['A', 'B', 'C', 'D'])
           A  B  C  D
    row_1  3  2  1  0
    row_2  a  b  c  d

    Specify orient='tight' to create the DataFrame using a ‘tight’ format:

    >>> data = {'index': [('a', 'b'), ('a', 'c')],
    ...         'columns': [('x', 1), ('y', 2)],
    ...         'data': [[1, 3], [2, 4]],
    ...         'index_names': ['n1', 'n2'],
    ...         'column_names': ['z1', 'z2']}
    >>> pd.DataFrame.from_dict(data, orient='tight')  # doctest: +NORMALIZE_WHITESPACE
    z1     x  y
    z2     1  2
    n1 n2
    a  b   1  3
       c   2  4
    """

def from_records():
    """
    Convert structured or record ndarray to DataFrame.

    Creates a DataFrame object from a structured ndarray, sequence of tuples or dicts, or DataFrame.

    Parameters
    ----------
    data : structured ndarray, sequence of tuples or dicts, or DataFrame
        Structured input data.

        Deprecated since version 2.1.0: Passing a DataFrame is deprecated.
    index : str, list of fields, array-like
        Field of array to use as the index, alternately a specific set of input labels to use.
    exclude : sequence, default None
        Columns or fields to exclude.
    columns : sequence, default None
        Column names to use. If the passed data do not have names associated with them, this argument provides names for the columns. Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA columns).
    coerce_float : bool, default False
        Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets.
    nrows : int, default None
        Number of rows to read if data is an iterator.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.from_dict
        DataFrame from dict of array-like or dicts.
    DataFrame
        DataFrame object creation using constructor.

    Examples
    --------
    Data can be provided as a structured ndarray:

    >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
    ...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
    >>> pd.DataFrame.from_records(data)
       col_1 col_2
    0      3     a
    1      2     b
    2      1     c
    3      0     d

    Data can be provided as a list of dicts:

    >>> data = [{'col_1': 3, 'col_2': 'a'},
    ...         {'col_1': 2, 'col_2': 'b'},
    ...         {'col_1': 1, 'col_2': 'c'},
    ...         {'col_1': 0, 'col_2': 'd'}]
    >>> pd.DataFrame.from_records(data)
       col_1 col_2
    0      3     a
    1      2     b
    2      1     c
    3      0     d

    Data can be provided as a list of tuples with corresponding columns:

    >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
    >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
       col_1 col_2
    0      3     a
    1      2     b
    2      1     c
    3      0     d
    """

def ge():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,17 +406,27 @@ def __rdivmod__(self, other):
# The from_dict and from_records accessors are class methods and cannot be overridden via the
# extensions module, as they need to be foisted onto the namespace directly because they are not
# routed through getattr. To this end, we manually set DataFrame.from_dict to our new method.
# NOTE: the former ``@dataframe_not_implemented()`` decorator and placeholder ``pass`` body are
# intentionally gone -- the method now has a real implementation, so it must not be routed
# through the "not implemented" path, and it is no longer excluded from coverage.
@classmethod
def from_dict(
    cls, data, orient="columns", dtype=None, columns=None
):  # noqa: PR01, RT01, D200
    """
    Construct ``DataFrame`` from dict of array-like or dicts.
    """
    # Delegate all argument handling and validation to native pandas, then wrap the
    # resulting native frame with this module's DataFrame constructor.
    return DataFrame(
        native_pd.DataFrame.from_dict(
            data=data,
            orient=orient,
            dtype=dtype,
            columns=columns,
        )
    )


# Class methods are not routed through getattr, so the override is installed on the class directly.
DataFrame.from_dict = from_dict


@dataframe_not_implemented()
@classmethod
def from_records(
cls,
data,
Expand All @@ -426,7 +436,23 @@ def from_records(
coerce_float=False,
nrows=None,
): # pragma: no cover # noqa: PR01, RT01, D200
pass # pragma: no cover
"""
Convert structured or record ndarray to ``DataFrame``.
"""
if isinstance(data, DataFrame):
raise NotImplementedError(
"Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'"
)
return DataFrame(
native_pd.DataFrame.from_records(
data=data,
index=index,
exclude=exclude,
columns=columns,
coerce_float=coerce_float,
nrows=nrows,
)
)


DataFrame.from_records = from_records
Expand Down
60 changes: 60 additions & 0 deletions tests/integ/modin/frame/test_from_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import modin.pandas as pd
import pandas as native_pd

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import sql_count_checker


@sql_count_checker(query_count=1)
def test_from_dict_basic():
    """Default orientation: the dict keys become the columns of the resulting frame."""
    payload = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}

    actual = pd.DataFrame.from_dict(payload)
    expected = native_pd.DataFrame.from_dict(payload)

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=1)
def test_from_dict_orient_index():
    """orient="index": the dict keys become the row labels of the resulting frame."""
    payload = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}

    actual = pd.DataFrame.from_dict(payload, orient="index")
    expected = native_pd.DataFrame.from_dict(payload, orient="index")

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=1)
def test_from_dict_orient_index_columns():
    """orient="index" combined with explicitly supplied column labels."""
    payload = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}

    # Each call receives its own freshly built label list, as in a literal-per-call style.
    actual = pd.DataFrame.from_dict(
        payload, orient="index", columns=list("ABCD")
    )
    expected = native_pd.DataFrame.from_dict(
        payload, orient="index", columns=list("ABCD")
    )

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=1)
def test_from_dict_orient_index_tight():
    """Despite the name, this exercises orient="tight" (multi-level index and columns)."""
    tight_payload = dict(
        [
            ("index", [("a", "b"), ("a", "c")]),
            ("columns", [("x", 1), ("y", 2)]),
            ("data", [[1, 3], [2, 4]]),
            ("index_names", ["n1", "n2"]),
            ("column_names", ["z1", "z2"]),
        ]
    )

    actual = pd.DataFrame.from_dict(tight_payload, orient="tight")
    expected = native_pd.DataFrame.from_dict(tight_payload, orient="tight")

    assert_frame_equal(actual, expected, check_dtype=False)
61 changes: 61 additions & 0 deletions tests/integ/modin/frame/test_from_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import modin.pandas as pd
import pandas as native_pd
import numpy as np
import pytest

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import sql_count_checker


@sql_count_checker(query_count=1)
def test_from_records_structured_ndarray():
    """Records supplied as a NumPy structured array with named, typed fields."""
    field_types = [("col_1", "i4"), ("col_2", "U1")]
    records = np.array([(3, "a"), (2, "b"), (1, "c"), (0, "d")], dtype=field_types)

    actual = pd.DataFrame.from_records(records)
    expected = native_pd.DataFrame.from_records(records)

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=1)
def test_from_records_list_of_dicts():
    """Records supplied as a list of per-row dicts keyed by column name."""
    records = [
        {"col_1": number, "col_2": letter}
        for number, letter in [(3, "a"), (2, "b"), (1, "c"), (0, "d")]
    ]

    actual = pd.DataFrame.from_records(records)
    expected = native_pd.DataFrame.from_records(records)

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=1)
def test_from_records_list_of_records():
    """Records supplied as a list of plain tuples, with no column names given."""
    records = [(3, "a"), (2, "b"), (1, "c"), (0, "d")]

    actual = pd.DataFrame.from_records(records)
    expected = native_pd.DataFrame.from_records(records)

    assert_frame_equal(actual, expected, check_dtype=False)


@sql_count_checker(query_count=0)
def test_from_records_neg():
    """Passing a DataFrame from ``modin.pandas`` as ``data`` must raise NotImplementedError."""
    data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    with pytest.raises(
        NotImplementedError,
        match="Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'",
    ):
        # Plain call statement: the original's trailing comma wrapped this in a throwaway tuple.
        pd.DataFrame.from_records(data)
2 changes: 0 additions & 2 deletions tests/unit/modin/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@ def test_unsupported_general(general_method, kwargs):
["combine", {"other": "", "func": ""}],
["combine_first", {"other": ""}],
["filter", {}],
["from_dict", {"data": ""}],
["from_records", {"data": ""}],
["hist", {}],
["infer_objects", {}],
["interpolate", {}],
Expand Down

0 comments on commit f77f93c

Please sign in to comment.