Skip to content

Commit

Permalink
Add DataFrame.iter_columns() and simplify (#326)
Browse files Browse the repository at this point in the history
* add DataFrame.column_iter

* iter_columns instead

* lint
  • Loading branch information
MarcoGorelli authored Dec 7, 2023
1 parent 21271f5 commit 8ecbea3
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 30 deletions.
23 changes: 12 additions & 11 deletions spec/API_specification/dataframe_api/dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING, Any, Literal, NoReturn, Protocol

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from collections.abc import Iterator, Mapping, Sequence

from typing_extensions import Self

Expand Down Expand Up @@ -275,6 +275,10 @@ def schema(self) -> dict[str, DType]:
"""
...

def iter_columns(self) -> Iterator[Column]:
"""Return iterator over columns."""
...

def sort(
self,
*keys: str,
Expand Down Expand Up @@ -905,23 +909,20 @@ def persist(self) -> Self:
.. code-block:: python
df: DataFrame
features = []
result = df.std() > 0
result = result.persist()
for column_name in df.column_names:
if result.col(column_name).get_value(0):
features.append(column_name)
features = [col.name for col in result.iter_columns() if col.get_value(0)]
instead of this:
.. code-block:: python
df: DataFrame
features = []
for column_name in df.column_names:
# Do NOT call `persist` on a `DataFrame` within a for-loop!
# This may re-trigger the same computation multiple times
if df.persist().col(column_name).std() > 0:
features.append(column_name)
result = df.std() > 0
features = [
# Do NOT do this! This will trigger execution of the entire
# pipeline for each element in the for-loop!
col.name for col in result.iter_columns() if col.get_value(0).persist()
]
"""
...
11 changes: 5 additions & 6 deletions spec/API_specification/examples/01_standardise_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any:
df = df_non_standard.__dataframe_consortium_standard__(api_version="2023.09-beta")

for column_name in df.column_names:
if column_name == "species":
continue
new_column = df.col(column_name)
new_column = (new_column - new_column.mean()) / new_column.std()
df = df.assign(new_column.rename(f"{column_name}_scaled"))
new_columns = [
((col - col.mean()) / col.std()).rename(f"{col.name}_scaled")
for col in df.iter_columns()
]
df = df.assign(*new_columns)

return df.dataframe
6 changes: 1 addition & 5 deletions spec/API_specification/examples/04_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,7 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI:
df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta").persist()
pdx = df.__dataframe_namespace__()
df = df.select(
*[
col_name
for col_name in df.column_names
if isinstance(df.col(col_name).dtype, pdx.Int64)
],
*[col.name for col in df.iter_columns() if isinstance(col.dtype, pdx.Int64)],
)
arr = df.to_array()
arr = some_array_function(arr)
Expand Down
12 changes: 4 additions & 8 deletions spec/design_topics/execution_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,13 @@ not be supported in some cases.
For example, let's consider the following:
```python
df: DataFrame
features = []
for column_name in df.column_names:
if df.col(column_name).std() > 0:
features.append(column_name)
return features
features = [col.name for col in df.iter_columns() if col.std() > 0]
```
If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns
If `df` is a lazy dataframe, then the call `col.std() > 0` returns
a (ducktyped) Python boolean scalar. No issues so far. The problem is:
what happens when `if df.col(column_name).std() > 0` is called?
what happens when `if col.std() > 0` is called?

Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in
Under the hood, Python will call `(col.std() > 0).__bool__()` in
order to extract a Python boolean. This is a problem for "lazy" implementations,
as the laziness needs breaking in order to evaluate the above.

Expand Down

0 comments on commit 8ecbea3

Please sign in to comment.