Skip to content

Commit

Permalink
add DataFrame.columns_iter
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Nov 20, 2023
1 parent d896e65 commit a03cf9c
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 28 deletions.
22 changes: 11 additions & 11 deletions spec/API_specification/dataframe_api/dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING, Any, Literal, NoReturn, Protocol

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from collections.abc import Iterator, Mapping, Sequence

from typing_extensions import Self

Expand Down Expand Up @@ -275,6 +275,10 @@ def schema(self) -> dict[str, DType]:
"""
...

def columns_iter(self) -> Iterator[Column]:
"""Return iterator over columns."""
...

def sort(
self,
*keys: str,
Expand Down Expand Up @@ -995,23 +999,19 @@ def persist(self) -> Self:
.. code-block:: python
df: DataFrame
features = []
result = df.std() > 0
result = result.persist()
for column_name in df.column_names:
if result.col(column_name).get_value(0):
features.append(column_name)
features = [col.name for col in result.columns_iter() if col.get_value(0)]
instead of this:
.. code-block:: python
df: DataFrame
features = []
for column_name in df.column_names:
# Do NOT call `persist` on a `DataFrame` within a for-loop!
# This may re-trigger the same computation multiple times
if df.persist().col(column_name).std() > 0:
features.append(column_name)
result = df.std() > 0
# Do NOT do this!
features = [
col.name for col in result.columns_iter() if col.get_value(0).persist()
]
"""
...
11 changes: 5 additions & 6 deletions spec/API_specification/examples/01_standardise_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any:
df = df_non_standard.__dataframe_consortium_standard__(api_version="2023.09-beta")

for column_name in df.column_names:
if column_name == "species":
continue
new_column = df.col(column_name)
new_column = (new_column - new_column.mean()) / new_column.std()
df = df.assign(new_column.rename(f"{column_name}_scaled"))
new_columns = [
((col - col.mean()) / col.std()).rename(f"{col.name}_scaled")
for col in df.columns_iter()
if col.name != "species"
]
df = df.assign(*new_columns)

return df.dataframe
6 changes: 3 additions & 3 deletions spec/API_specification/examples/04_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI:
namespace = df.__dataframe_namespace__()
df = df.select(
*[
col_name
for col_name in df.column_names
if isinstance(df.col(col_name).dtype, namespace.Int64)
col.name
for col in df.columns_iter()
if isinstance(col.dtype, namespace.Int64)
],
)
arr = df.to_array(namespace.Int64())
Expand Down
12 changes: 4 additions & 8 deletions spec/design_topics/execution_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,13 @@ not be supported in some cases.
For example, let's consider the following:
```python
df: DataFrame
features = []
for column_name in df.column_names:
if df.col(column_name).std() > 0:
features.append(column_name)
return features
features = [col.name for col in df.columns_iter() if col.std() > 0]
```
If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns
If `df` is a lazy dataframe, then the call `col.std() > 0` returns
a (ducktyped) Python boolean scalar. No issues so far. Problem is,
what happens when `if df.col(column_name).std() > 0` is evaluated?
what happens when `if col.std() > 0` is evaluated?

Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in
Under the hood, Python will call `(col.std() > 0).__bool__()` in
order to extract a Python boolean. This is a problem for "lazy" implementations,
as the laziness needs breaking in order to evaluate the above.

Expand Down

0 comments on commit a03cf9c

Please sign in to comment.