diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 4e423fd2..321f51f2 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Literal, NoReturn, Protocol if TYPE_CHECKING: - from collections.abc import Mapping, Sequence + from collections.abc import Iterator, Mapping, Sequence from typing_extensions import Self @@ -275,6 +275,10 @@ def schema(self) -> dict[str, DType]: """ ... + def iter_columns(self) -> Iterator[Column]: + """Return iterator over columns.""" + ... + def sort( self, *keys: str, @@ -905,23 +909,20 @@ def persist(self) -> Self: .. code-block:: python df: DataFrame - features = [] result = df.std() > 0 result = result.persist() - for column_name in df.column_names: - if result.col(column_name).get_value(0): - features.append(column_name) + features = [col.name for col in result.iter_columns() if col.get_value(0)] instead of this: .. code-block:: python df: DataFrame - features = [] - for column_name in df.column_names: - # Do NOT call `persist` on a `DataFrame` within a for-loop! - # This may re-trigger the same computation multiple times - if df.persist().col(column_name).std() > 0: - features.append(column_name) + result = df.std() > 0 + features = [ + # Do NOT do this! This will trigger execution of the entire + # pipeline for each element in the for-loop! + col.name for col in result.iter_columns() if col.get_value(0).persist() + ] """ ... 
diff --git a/spec/API_specification/examples/01_standardise_columns.py b/spec/API_specification/examples/01_standardise_columns.py index 808542e0..61c41c32 100644 --- a/spec/API_specification/examples/01_standardise_columns.py +++ b/spec/API_specification/examples/01_standardise_columns.py @@ -9,11 +9,10 @@ def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any: df = df_non_standard.__dataframe_consortium_standard__(api_version="2023.09-beta") - for column_name in df.column_names: - if column_name == "species": - continue - new_column = df.col(column_name) - new_column = (new_column - new_column.mean()) / new_column.std() - df = df.assign(new_column.rename(f"{column_name}_scaled")) + new_columns = [ + ((col - col.mean()) / col.std()).rename(f"{col.name}_scaled") + for col in df.iter_columns() + ] + df = df.assign(*new_columns) return df.dataframe diff --git a/spec/API_specification/examples/04_datatypes.py b/spec/API_specification/examples/04_datatypes.py index 660863bf..85b90cc3 100644 --- a/spec/API_specification/examples/04_datatypes.py +++ b/spec/API_specification/examples/04_datatypes.py @@ -12,11 +12,7 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI: df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta").persist() pdx = df.__dataframe_namespace__() df = df.select( - *[ - col_name - for col_name in df.column_names - if isinstance(df.col(col_name).dtype, pdx.Int64) - ], + *[col.name for col in df.iter_columns() if isinstance(col.dtype, pdx.Int64)], ) arr = df.to_array() arr = some_array_function(arr) diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index c81c7767..94f93a14 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -11,17 +11,13 @@ not be supported in some cases. 
For example, let's consider the following: ```python df: DataFrame -features = [] -for column_name in df.column_names: - if df.col(column_name).std() > 0: - features.append(column_name) -return features +features = [col.name for col in df.iter_columns() if col.std() > 0] ``` -If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns +If `df` is a lazy dataframe, then the call `col.std() > 0` returns a (ducktyped) Python boolean scalar. No issues so far. Problem is, -what happens when `if df.col(column_name).std() > 0` is called? +what happens when `if col.std() > 0` is evaluated? -Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in +Under the hood, Python will call `(col.std() > 0).__bool__()` in order to extract a Python boolean. This is a problem for "lazy" implementations, as the laziness needs breaking in order to evaluate the above.