Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make DataFrame.any_rowwise top-level, rename to _horizontal #324

Merged
merged 8 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 117 additions & 1 deletion spec/API_specification/dataframe_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""Function stubs and API documentation for the DataFrame API standard."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Literal

from .column_object import Column
from .dataframe_object import DataFrame
Expand Down Expand Up @@ -290,3 +290,119 @@ def date(year: int, month: int, day: int) -> Scalar:
... )
>>> df.filter(mask)
"""


def any_horizontal(*columns: Column, skip_nulls: bool = True) -> Column:
    """Reduction returns a Column.

    Differs from :meth:`DataFrame.any` in that the reduction happens
    for each row, rather than for each column.

    All the `columns` must have the same parent DataFrame.
    The return value has the same parent DataFrame as the input columns.

    Parameters
    ----------
    *columns : Column
        Boolean columns to reduce across, row by row.
    skip_nulls : bool
        Whether to skip null values.

    Returns
    -------
    Column
        Has the same parent DataFrame as the input columns.

    Raises
    ------
    ValueError
        If any of the columns is not boolean.

    Examples
    --------
    >>> df: DataFrame
    >>> ns = df.__dataframe_namespace__()
    >>> mask = ns.any_horizontal(
    ...     *[df.col(col_name) > 0 for col_name in df.column_names]
    ... )
    >>> df = df.filter(mask)
    """
    ...


def all_horizontal(*columns: Column, skip_nulls: bool = True) -> Column:
    """Reduction returns a Column.

    Differs from :meth:`DataFrame.all` in that the reduction happens
    for each row, rather than for each column.

    All the `columns` must have the same parent DataFrame.
    The return value has the same parent DataFrame as the input columns.

    Parameters
    ----------
    *columns : Column
        Boolean columns to reduce across, row by row.
    skip_nulls : bool
        Whether to skip null values.

    Returns
    -------
    Column
        Has the same parent DataFrame as the input columns.

    Raises
    ------
    ValueError
        If any of the columns is not boolean.

    Examples
    --------
    >>> df: DataFrame
    >>> ns = df.__dataframe_namespace__()
    >>> mask = ns.all_horizontal(
    ...     *[df.col(col_name) > 0 for col_name in df.column_names]
    ... )
    >>> df = df.filter(mask)
    """
    ...


def sorted_indices(
    *columns: Column,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Return row numbers which would sort according to given columns.

    If you need to sort the DataFrame, use :meth:`DataFrame.sort`.

    Parameters
    ----------
    *columns : Column
        Columns to sort by.
    ascending : Sequence[bool] or bool
        If `True`, sort by all columns in ascending order.
        If `False`, sort by all columns in descending order.
        If a sequence, it must be the same length as `columns`,
        and determines the direction with which to use each
        column to sort by.
    nulls_position : ``{'first', 'last'}``
        Whether null values should be placed at the beginning
        or at the end of the result.
        Note that the position of NaNs is unspecified and may
        vary based on the implementation.

    Returns
    -------
    Column
        The return value has the same parent DataFrame as the input columns.

    Raises
    ------
    ValueError
        If `ascending` is a sequence whose length differs from the
        number of `columns`.
    """
    ...


def unique_indices(*columns: Column, skip_nulls: bool = True) -> Column:
    """Return indices corresponding to unique values across selected columns.

    Parameters
    ----------
    *columns : Column
        Columns to consider when finding unique values.
    skip_nulls : bool
        Whether to skip null values. See Notes for how nulls are
        handled when this is ``False``.

    Returns
    -------
    Column
        Indices corresponding to unique values.

    Notes
    -----
    There are no ordering guarantees. In particular, if there are multiple
    indices corresponding to the same unique value(s), there is no guarantee
    about which one will appear in the result.
    If the original column(s) contain multiple `'NaN'` values, then
    only a single index corresponding to those values will be returned.
    Likewise for null values (if ``skip_nulls=False``).
    To get the unique values, you can do
    ``df.get_rows(ns.unique_indices(*columns))``, where
    ``ns = df.__dataframe_namespace__()``.
    """
    ...
2 changes: 1 addition & 1 deletion spec/API_specification/dataframe_api/column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def sort(
"""Sort column.

If you need the indices which would sort the column,
use :meth:`sorted_indices`.
use `sorted_indices`.

Parameters
----------
Expand Down
92 changes: 1 addition & 91 deletions spec/API_specification/dataframe_api/dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def sort(
"""Sort dataframe according to given columns.

If you only need the indices which would sort the dataframe, use
:meth:`sorted_indices`.
`sorted_indices`.

Parameters
----------
Expand Down Expand Up @@ -314,44 +314,6 @@ def sort(
"""
...

def sorted_indices(
    self,
    *keys: str,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Compute the row numbers that would put the rows in sorted order.

    When the sorted DataFrame itself is wanted, use :meth:`sort` instead.

    Parameters
    ----------
    *keys : str
        Names of the columns to sort by.
        When omitted, all columns are used as sort keys.
    ascending : Sequence[bool] or bool
        `True` sorts by every key in ascending order and `False` sorts
        by every key in descending order.
        A sequence gives the direction for each key individually and
        must be the same length as `keys`.
    nulls_position : ``{'first', 'last'}``
        Place null values at the beginning or at the end of the result.
        The position of NaNs is unspecified and may vary based on the
        implementation.

    Returns
    -------
    Column

    Raises
    ------
    ValueError
        If `keys` and `ascending` are sequences of different lengths.
    """
    ...

def __eq__(self, other: AnyScalar) -> Self: # type: ignore[override]
"""Compare for equality.

Expand Down Expand Up @@ -678,32 +640,6 @@ def all(self, *, skip_nulls: bool | Scalar = True) -> Self:
"""
...

def any_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column:
    """Reduction returns a Column.

    Differs from ``DataFrame.any`` in that the reduction happens
    for each row, rather than for each column.

    Parameters
    ----------
    skip_nulls : bool or Scalar
        Whether to skip null values.

    Returns
    -------
    Column

    Raises
    ------
    ValueError
        If any of the DataFrame's columns is not boolean.
    """
    ...

def all_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column:
    """Reduction returns a Column.

    Differs from ``DataFrame.all`` in that the reduction happens
    for each row, rather than for each column.

    Parameters
    ----------
    skip_nulls : bool or Scalar
        Whether to skip null values.

    Returns
    -------
    Column

    Raises
    ------
    ValueError
        If any of the DataFrame's columns is not boolean.
    """
    ...

def min(self, *, skip_nulls: bool | Scalar = True) -> Self:
    """Reduction returns a 1-row DataFrame.

    Parameters
    ----------
    skip_nulls : bool or Scalar
        Whether to skip null values.
    """
    ...
Expand Down Expand Up @@ -804,32 +740,6 @@ def is_nan(self) -> Self:
"""
...

def unique_indices(self, *keys: str, skip_nulls: bool | Scalar = True) -> Column:
    """Return indices corresponding to unique values across selected columns.

    Parameters
    ----------
    *keys : str
        Column names to consider when finding unique values.
        If not specified, all columns are considered.
    skip_nulls : bool or Scalar
        Whether to skip null values. See Notes for how nulls are
        handled when this is ``False``.

    Returns
    -------
    Column
        Indices corresponding to unique values.

    Notes
    -----
    There are no ordering guarantees. In particular, if there are multiple
    indices corresponding to the same unique value(s), there is no guarantee
    about which one will appear in the result.
    If the original column(s) contain multiple `'NaN'` values, then
    only a single index corresponding to those values will be returned.
    Likewise for null values (if ``skip_nulls=False``).
    To get the unique values, you can do ``df.get_rows(df.unique_indices(*keys))``.
    """
    ...

def fill_nan(self, value: float | NullType | Scalar, /) -> Self:
"""Fill ``nan`` values with the given fill value.

Expand Down
29 changes: 29 additions & 0 deletions spec/API_specification/dataframe_api/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,35 @@ def is_dtype(self, dtype: DType, kind: str | tuple[str, ...]) -> bool:
def date(self, year: int, month: int, day: int) -> Scalar:
    """Stub mirroring the top-level ``date`` function."""
    ...

def any_horizontal(self, *columns: Column, skip_nulls: bool = True) -> Column:
    """Stub mirroring the top-level ``any_horizontal`` function."""
    ...

def all_horizontal(self, *columns: Column, skip_nulls: bool = True) -> Column:
    """Stub mirroring the top-level ``all_horizontal`` function."""
    ...

def sorted_indices(
    self,
    *columns: Column,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Stub mirroring the top-level ``sorted_indices`` function."""
    ...

def unique_indices(self, *columns: Column, skip_nulls: bool = True) -> Column:
    """Stub mirroring the top-level ``unique_indices`` function."""
    ...


DType = Union[
Namespace.Bool,
Expand Down
28 changes: 28 additions & 0 deletions spec/API_specification/examples/06_horizontal_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Example of how to use a horizontal function.

Horizontal functions are functions that take multiple columns as input and return a
single column as output.

Examples include:
- `any_horizontal`
- `all_horizontal`

These can be accessed by first using ``__dataframe_namespace__`` to get the
namespace object, and then calling the function on the namespace object and passing
the ``Column``s as positional arguments (unpack an iterable with ``*``).
"""
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from dataframe_api.typing import SupportsDataFrameAPI


def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI:
df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta")
ns = df.__dataframe_namespace__()
df = df.filter(
ns.any_horizontal(*[df.col(col_name) > 0 for col_name in df.column_names]),
)
Comment on lines +25 to +27
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with #326, this could be simplified to

Suggested change
df = df.filter(
ns.any_horizontal(*[df.col(col_name) > 0 for col_name in df.column_names]),
)
df = df.filter(ns.any_horizontal(*[col > 0 for col in df.columns_iter()]))

return df.dataframe