diff --git a/docs/source/best_practices.md b/docs/source/best_practices.md
index 5dbea51..cba8e06 100644
--- a/docs/source/best_practices.md
+++ b/docs/source/best_practices.md
@@ -1,7 +1,7 @@
 # Best Practices for describing tabular data transformations
 
 Many data science and machine learning projects work with tabular data. Popular ways of describing transformations
-of such tabular data with python are using pandas, polars, or SQL. This section provides best practices for working
+of such tabular data in python are pandas, polars, and SQL. This section provides best practices for working
 with tabular data in python.
 
 * [beware the flatfile & embrace working with entities](/examples/best_practices_entities)
@@ -10,4 +10,4 @@ with tabular data in python.
 ```{toctree}
 /examples/best_practices_entities
 /examples/best_practices_sql_polars
-```
\ No newline at end of file
+```
diff --git a/docs/source/database_testing.md b/docs/source/database_testing.md
index 0f5abf4..d735ce6 100644
--- a/docs/source/database_testing.md
+++ b/docs/source/database_testing.md
@@ -1,11 +1,11 @@
 # Database testing
 
 Relational databases are quite effective for analyzing medium-sized tabular data. You can leave the data in the database
-and just describe the transformation in python. All that needs to be exchanged between python and the database is the
-SQL string that is executed within the database as a `CREATE TABLE ... AS SELECT ...` statement. The database can
+and just describe the transformation in python. All that needs to be exchanged between python and the database is the
+SQL string that is executed within the database as a `CREATE TABLE ... AS SELECT ...` statement. The database can
 execute the query in an optimal and parallelized way.
 
-In practice, a relational database is already running somewhere and all you need is a connection URL and access
+In practice, a relational database is already running somewhere, and all you need is a connection URL and access
 credentials. See [Table Backends](table_backends.md) for a list of currently supported databases.
 
 The following example shows how to launch a postgres database in a container with docker-compose and how to work with it
diff --git a/docs/source/examples.md b/docs/source/examples.md
index 8779334..50790ac 100644
--- a/docs/source/examples.md
+++ b/docs/source/examples.md
@@ -17,4 +17,4 @@ Some examples how to use pydiverse.transform:
 /examples/window_functions
 /database_testing
 /examples/duckdb_polars_parquet
-```
\ No newline at end of file
+```
diff --git a/docs/source/examples/aggregations.md b/docs/source/examples/aggregations.md
index 5047b3f..c7d5104 100644
--- a/docs/source/examples/aggregations.md
+++ b/docs/source/examples/aggregations.md
@@ -15,4 +15,21 @@ tbl1 >> group_by(tbl1.a) >> summarize(sum_b=sum(b)) >> show()
 ```
 Typical aggregation functions are `sum()`, `mean()`, `count()`, `min()`, `max()`, `any()`, and `all()`.
 These functions can be used in the `summarize()` verb. They can also be used as [window functions](/examples/window_functions) in the `mutate()` verb in case aggregated
-values shall be projected back to the rows of the original table expression.
\ No newline at end of file
+values shall be projected back to the rows of the original table expression.
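+
+As a minimal sketch of both uses (assuming the column method `sum()` accepts the same
+`partition_by` argument as the other window functions documented here):
+
+```python
+import pydiverse.transform as pdt
+from pydiverse.transform.extended import *
+
+tbl1 = pdt.Table(dict(a=[1, 1, 2], b=[4, 5, 6]))
+
+# aggregation in summarize: one row per group
+tbl1 >> group_by(tbl1.a) >> summarize(sum_b=tbl1.b.sum()) >> show()
+
+# the same aggregation as a window function in mutate: the group sum is
+# projected back onto every row of the original table
+tbl1 >> mutate(sum_b=tbl1.b.sum(partition_by=tbl1.a)) >> show()
+```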
diff --git a/docs/source/examples/best_practices_entities.md b/docs/source/examples/best_practices_entities.md
index e80cfc3..65748f1 100644
--- a/docs/source/examples/best_practices_entities.md
+++ b/docs/source/examples/best_practices_entities.md
@@ -1,24 +1,45 @@
 # Best Practices: Beware the flatfile & embrace working with entities
 
-In DataFrame libraries, joining different tables is often either cumbersome or slow. As a consequence, many data
+In DataFrame libraries, joining different tables is often either cumbersome or slow. As a consequence, many data
 pipelines bring their main pieces of information together in one big table called a flatfile. While this might be nice
-for quick exploration of the data, it causes several problems for long term maintenance and speed of adding new
+for quick exploration of the data, it causes several problems for long-term maintenance and the speed of adding new
 features:
 
-1. The number of columns grows very large and may become hard to overlook by the users that don't know all the prefixes
-   and suffixes by heart.
-2. Associated information with 1:n relationship either are duplicated (wasting space),
-   or written to an array column (reducing flexibility for further joins), or simply make
+1. The number of columns grows very large and may become hard to navigate for users who don't know all the prefixes
+   and suffixes by heart.
+2. Associated information with a 1:n relationship is either duplicated (wasting space),
+   or written to an array column (reducing flexibility for further joins), or simply makes
    it prohibitively hard to add features at a certain granularity.
-3. In case a table is historized, storing rows for each version of a data field, the table size grows quadratic with
+3. In case a table is historized, storing rows for each version of a data field, the table size grows quadratically with
    the number of columns.
 
 The other alternative is to keep column groups with similar subject-matter meaning or similar data sources together in
 separate tables called entities. Especially when creating data transformation code programmatically with a nice syntax,
 it can be made quite easy to work with typical groups of entities with code in the background joining underlying tables.
 
-Often flatfiles are created before feature engineering. Due to the large number of features (columns), it becomes
+Often, flatfiles are created before feature engineering. Due to the large number of features (columns), it becomes
 necessary to build automatic tools for executing the code for each feature in the correct order and to avoid wasteful
-execution. However, when using entity granularity (column groups of similar origin), it is more manageable to manually
-wire all feature engineering computations. It is even very valuable code to see how the different computation steps /
+execution. However, when using entity granularity (column groups of similar origin), it is more manageable to manually
+wire all feature engineering computations. It is even very valuable to see in code how the different computation steps /
 entities build on each other. This makes tracking down problems much easier in debugging and gives new-joiners a chance
-to step through the code.
\ No newline at end of file
+to step through the code.
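+
+As a minimal sketch (entity and column names here are invented for illustration, and we assume
+`group_by` accepts multiple columns), keeping a `customer` and an `order` entity in separate tables
+might look like this:
+
+```python
+import pydiverse.transform as pdt
+from pydiverse.transform.extended import *
+
+# two entities with a 1:n relationship, kept in separate tables
+customer = pdt.Table(dict(id=[1, 2], segment=["a", "b"]), name="customer")
+order = pdt.Table(dict(customer_id=[1, 1, 2], amount=[10.0, 20.0, 5.0]), name="order")
+
+# the join happens only where a feature needs both entities
+(
+    customer
+    >> left_join(order, customer.id == order.customer_id)
+    >> group_by(customer.id, customer.segment)
+    >> summarize(total_amount=order.amount.sum())
+    >> show()
+)
+```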
diff --git a/docs/source/examples/best_practices_sql_polars.md b/docs/source/examples/best_practices_sql_polars.md
index b53fd9e..be520ec 100644
--- a/docs/source/examples/best_practices_sql_polars.md
+++ b/docs/source/examples/best_practices_sql_polars.md
@@ -1,26 +1,26 @@
 # Best Practices: start sql, finish polars
 
-At the beginning of a data pipeline, there is typically the biggest amount of data touched with rather simple
-operations: Data is combined, encodings are converted/harmonized, simple aggregations and computations are performed,
-and data is heavily filtered. These operations lend themselves very well to using a powerful database, and converting
-transformations to SQL `CREATE TABLE ... AS SELECT ...` statements. This way, the data stays within the database and
+At the beginning of a data pipeline, the largest amount of data is typically touched with rather simple
+operations: Data is combined, encodings are converted/harmonized, simple aggregations and computations are performed,
+and data is heavily filtered. These operations lend themselves very well to using a powerful database and converting
+transformations to SQL `CREATE TABLE ... AS SELECT ...` statements. This way, the data stays within the database and
 the communication-heavy operations can be performed efficiently (i.e. parallelized) right where the data is stored.
 
-Towards the end of the pipeline, the vast open source ecosystem of training libraries, evaluation, and
+Towards the end of the pipeline, the vast open source ecosystem of training libraries, evaluation, and
 visualization tools is needed, which is best interfaced with classical Polars / Pandas DataFrames in memory.
 
-In the middle with feature engineering, there is still a large part of logic, that is predominantly simple enough for
-typical SQL expressiveness with some exceptions. Thus, it is super helpful if we can jump between SQL and Polars for
-performance reasons, but stay within the same pydiverse.transform syntax for describing transformations for the most
-part.
+In the middle, with feature engineering, there is still a large amount of logic that is predominantly simple enough for
+typical SQL expressiveness, with some exceptions. Thus, it is super helpful if we can jump between SQL and Polars for
+performance reasons, but stay within the same pydiverse.transform syntax for describing transformations for the most
+part.
 
-When moving code to production it is often the case that prediction calls are done with much less data than during
+When moving code to production, it is often the case that prediction calls are done with much less data than during
 training. Thus it might not be worth setting up a sophisticated database technology in that case. Pydiverse.transform
-allows to take code written for SQL execution during training and use the exact same code for executing on Polars for
-production. In the long run, we also want to be able to generate ONNX graphs from transform code to make long term
+allows taking code written for SQL execution during training and running the exact same code on Polars in
+production. In the long run, we also want to be able to generate ONNX graphs from transform code to make long-term
 reliable deployments even easier.
 
-The aim of pydiverse.transform is not feature completeness but rather versatility, ease of use, and very predictable
+The aim of pydiverse.transform is not feature completeness but rather versatility, ease of use, and very predictable
 and reliable behavior.
 Thus it should always integrate nicely with other ways of writing data transformations. Together
-with [pydiverse.pipedag](https://pydiversepipedag.readthedocs.io/en/latest/), this interoperability is made even much
-easier.
\ No newline at end of file
+with [pydiverse.pipedag](https://pydiversepipedag.readthedocs.io/en/latest/), this interoperability is made even
+easier.
diff --git a/docs/source/examples/duckdb_polars_parquet.md b/docs/source/examples/duckdb_polars_parquet.md
index 8a69582..a1d79a1 100644
--- a/docs/source/examples/duckdb_polars_parquet.md
+++ b/docs/source/examples/duckdb_polars_parquet.md
@@ -10,7 +10,7 @@ tbl = pdt.Table(dict(x=[1, 2, 3], y=[4, 5, 6]), name="A")
 tbl2 = pdt.Table(dict(x=[2, 3], z=["b", "c"]), name="B") >> collect(DuckDb())
 
 out = (
-    tbl >> collect(DuckDb()) >> left_join(tbl2, tbl.x == tbl2.x) >> show_query()
+    tbl >> collect(DuckDb()) >> left_join(tbl2, tbl.x == tbl2.x) >> show_query()
     >> collect(Polars()) >> mutate(z=tbl.x + tbl.y) >> show()
 )
 
@@ -21,4 +21,4 @@ df2 = out >> export(Polars(lazy=False))
 print(type(df2))
 ```
 
-In the future, it is also intended to allow both DuckDB and Polars backends to read and write Parquet files.
\ No newline at end of file
+In the future, it is also intended to allow both DuckDB and Polars backends to read and write Parquet files.
diff --git a/docs/source/examples/joining.md b/docs/source/examples/joining.md
index 1e38b89..8fd05b1 100644
--- a/docs/source/examples/joining.md
+++ b/docs/source/examples/joining.md
@@ -50,7 +50,7 @@ shape: (3, 4)
 
 For DataFrame libraries, it is quite common that a join combines all columns of both tables, so the user can then pick
 the columns of interest for further expressions. In SQL, the act of joining is actually not bringing in any new columns.
-It only adds the columns of the joined tables to the namespace of usable columns in expressions of the `mutate` and
+It only adds the columns of the joined tables to the namespace of usable columns in expressions of the `mutate` and
 `summarize` verbs.
 
 In pydiverse.transform, the empty `select()` verb can be used to hide all columns of a table. But all columns can still
@@ -64,13 +64,13 @@ tbl1 = pdt.Table(dict(a=["a", "b", "c"], b=[1, 2, 3]))
 tbl2 = pdt.Table(dict(a=["a", "b", "b", "d"], c=[1.1, 2.2, 2.3, 4.4]), name="tbl2")
 
 (
-    tbl1
+    tbl1
     >> left_join(tbl2 >> select(), tbl1.a == tbl2.a) >> show()
     >> mutate(d=tbl1.b + tbl2.c) >> show()
 )
 ```
 
-*dplyr* has also a verb called `transmute` which is very similar to `mutate`, but removes/hides all columns which were
+*dplyr* also has a verb called `transmute` which is very similar to `mutate`, but removes/hides all columns which were
 not specified in the `mutate` call. This can be easily implemented in pydiverse.transform in user space:
 
 ```python
@@ -85,4 +85,4 @@ def transmute(tbl, **kwargs):
 
 tbl1 = pdt.Table(dict(a=["a", "b", "c"], b=[1, 2, 3]))
 tbl1 >> transmute(a=tbl1.a, b_sqr=tbl1.b * tbl1.b) >> show()
-```
\ No newline at end of file
+```
diff --git a/docs/source/examples/window_functions.md b/docs/source/examples/window_functions.md
index e4e057c..3b49924 100644
--- a/docs/source/examples/window_functions.md
+++ b/docs/source/examples/window_functions.md
@@ -1,15 +1,15 @@
 # Window functions
 
-Pydiverse.transform offers window functions with the `mutate()` verb.
-Window functions are functions that operate on a set of rows related to the current row.
-They can be computed independently on groups, can use the order of rows, and can be computed only on a filtered
-subset of the table. The most simple window function is `shift(n)` which shifts a column by `n` rows. Defining an
+Pydiverse.transform offers window functions with the `mutate()` verb.
+Window functions are functions that operate on a set of rows related to the current row.
+They can be computed independently on groups, can use the order of rows, and can be computed only on a filtered
+subset of the table. The simplest window function is `shift(n)`, which shifts a column by `n` rows. Defining an
 ordering is very important for this operation.
 
-There are two notations which define the grouping and arranging in a different way.
+There are two notations which define the grouping and ordering in different ways.
 The first is explicitly defining the `partition_by`, `order_by`, and `filter` arguments of the window function.
-The second makes use of existing verbs like `group_by()` and `arrange()`. However, an additional verb `ungroup()` tells
-that no `summarize()` will follow but rather that `group_by()` arguments should be used as `partition_by` and `arrange()`
+The second makes use of existing verbs like `group_by()` and `arrange()`. However, an additional verb `ungroup()` signals
+that no `summarize()` will follow, but rather that `group_by()` arguments should be used as `partition_by` parameters and `arrange()`
 arguments as `arrange` parameters to window functions.
 
 ```python
@@ -22,4 +22,4 @@ tbl1 = pdt.Table(dict(a=[1, 1, 2, 2, 2, 3], b=[4, 5, 8, 7, 6, 9]))
 out1 = tbl1 >> mutate(b_shift=tbl1.b.shift(1, partition_by=tbl1.a, arrange=-tbl1.b, filter=tbl1.b < 8)) >> show()
 out2 = tbl1 >> group_by(tbl1.a) >> arrange(-tbl1.b) >> mutate(b_shift=tbl1.b.shift(1, filter=tbl1.b < 8)) >> ungroup() >> show()
 assert_frame_equal(out1 >> arrange(-tbl1.b) >> export(Polars()), out2 >> export(Polars()))
-```
\ No newline at end of file
+```
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 6361dae..208db72 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -3,7 +3,7 @@
 ## Installation
 pydiverse.transform is distributed on [PyPi](https://pypi.org/project/pydiverse-transform/) and
 [Conda-Forge](https://anaconda.org/conda-forge/pydiverse-transform).
-To use it, just install it with pip, conda, or pixi. Polars is installed as a dependency. We recommend also installing
+To use it, just install it with pip, conda, or pixi. Polars is installed as a dependency. We recommend also installing
 duckdb since it is used in example code:
 
 ```shell
@@ -32,11 +32,11 @@ tbl3 = pdt.Table(pd.DataFrame(dict(x=[1, 2, 3], y=[4, 5, 6])))
 
 ### The pipe operator and printing
 
 For doing something with a table or for describing a data transformation, we use the pipe operator `>>`. The individual
-operations within the pipe are called verbs. The pipe operator is used to chain verbs together. We call them verbs
-because they do something.
+operations within the pipe are called verbs. The pipe operator is used to chain verbs together. We call them verbs
+because they do something.
 
 The `show` verb can be used to print a table. However, the python print function does pretty much the same:
-
+
 ```python
 import pydiverse.transform as pdt
 from pydiverse.transform import show
@@ -48,7 +48,7 @@ tbl >> show()  # same as `print(tbl)` but can be used inside a pipe
 ```
 
 ### Importing verbs
 
 Some verbs are extremely valuable in debugging (like `show`), but they might not actually be used in the final code.
-Thus it is recommended to always import them with a wildcard import even though you might need to disable warnings for
+Thus it is recommended to always import them with a wildcard import, even though you might need to disable warnings in
 your favorite linter:
 
 ```python
@@ -59,8 +59,8 @@ tbl = pdt.Table(dict(x=[1, 2, 3], y=[4, 5, 6]))
 
 tbl >> show()  # same as `print(tbl)` but can be used inside a pipe
 ```
 
-For more convenience you might even consider to use `from pydiverse.transform.common import *` and if you don't mind
-that some standard python functions like `filter` are overwritten in your scope, you can use:
+For more convenience, you might even consider using `from pydiverse.transform.common import *` and, if you don't mind
+that some standard python functions like `filter` are overwritten in your scope, you can use:
 `from pydiverse.transform.extended import *`
 
 ### Simple transformations
@@ -82,14 +82,14 @@ assert_frame_equal(out1 >> export(Polars()), out2 >> export(Polars()))
 
 ### Referencing, selecting and deselecting columns
 
 If column names are python identifiers, they can be referenced with `tbl.column_name`. If they are not, they can be
-referenced with `tbl["column name"]`. Alternatively, columns can also be referenced by their name with `C.colum_name`.
+referenced with `tbl["column name"]`. Alternatively, columns can also be referenced by their name with `C.column_name`.
 
 Even though it is very common in DataFrame libraries to only reference column names independent of their origin, it is
 discouraged to do this in pydiverse.transform since it is very nice to show the reader from which source table a column
 originated, and pydiverse.transform can provide better error messages in case errors can be foreseen
 simply by analyzing types within expressions.
 
-The `select` verb is used to select columns and the `drop` verb is used to drop
-them. Please bear in mind that `select` and `drop` only hide columns and that they can still be used in subsequent
+The `select` verb is used to select columns and the `drop` verb is used to drop
+them. Please bear in mind that `select` and `drop` only hide columns and that they can still be used in subsequent
 `mutate`/`filter`/`group_by` expressions:
 
 ```python
@@ -98,15 +98,15 @@ from pydiverse.transform.extended import *
 
 tbl = pdt.Table(dict(x=[1, 2, 3], y=[4, 5, 6]))
 
 (
-    tbl
-    >> mutate(z=tbl.x + tbl.y) >> select(tbl.y, C.z) >> show()
+    tbl
+    >> mutate(z=tbl.x + tbl.y) >> select(tbl.y, C.z) >> show()
     >> drop(tbl.y) >> show()
 )
 ```
 
 ### Ingesting Data from SQL Database
 
-You can reference a SQL Table within a database by providing its name and using a sqlalchemy engine:
+You can reference a SQL table within a database by providing its name and using a SQLAlchemy engine:
 `tbl = pdt.Table("my_tbl", SqlAlchemy(engine, schema="transform"))`
 
 ```python
@@ -131,7 +131,7 @@ with tempfile.TemporaryDirectory() as temp_dir:
 
 Output of show_query():
 ```sql
-SELECT my_tbl.a AS a, my_tbl.b AS b
+SELECT my_tbl.a AS a, my_tbl.b AS b
 FROM transform.my_tbl AS my_tbl
 ```
 
@@ -146,4 +146,4 @@ shape: (1, 2)
 ╞═════╪═════╡
 │ a   ┆ 1   │
 └─────┴─────┘
-```
\ No newline at end of file
+```
diff --git a/docs/source/table_backends.md b/docs/source/table_backends.md
index e232845..6b84980 100644
--- a/docs/source/table_backends.md
+++ b/docs/source/table_backends.md
@@ -53,4 +53,3 @@ with tempfile.TemporaryDirectory() as temp_dir:
         conn.commit()
 example(engine)
 ```
-
diff --git a/src/pydiverse/transform/_internal/backend/polars.py b/src/pydiverse/transform/_internal/backend/polars.py
index 6ad9474..28be5c8 100644
--- a/src/pydiverse/transform/_internal/backend/polars.py
+++ b/src/pydiverse/transform/_internal/backend/polars.py
@@ -167,7 +167,7 @@ def compile_col_expr(expr: ColExpr, name_in_df: dict[UUID, str]) -> pl.Expr:
         # TODO: currently, count is the only aggregation function where we don't want
         # to return null for cols containing only null values. If this happens for more
         # aggregation functions, make this configurable in e.g. the operator spec.
-        if expr.op.ftype == Ftype.AGGREGATE and expr.op != ops.len:
+        if expr.op.ftype == Ftype.AGGREGATE and expr.op != ops.count_star:
             # In `sum` / `any` and other aggregation functions, polars puts a
             # default value (e.g. 0, False) for empty columns, but we want to put
             # Null in this case to let the user decide about the default value via
@@ -668,7 +668,7 @@ def _str_slice(x, offset, length):
     def _count(x):
         return x.count().cast(pl.Int64)
 
-    @impl(ops.len)
+    @impl(ops.count_star)
     def _len():
         return pl.len().cast(pl.Int64)
diff --git a/src/pydiverse/transform/_internal/backend/sql.py b/src/pydiverse/transform/_internal/backend/sql.py
index 4629ac8..d3d56e6 100644
--- a/src/pydiverse/transform/_internal/backend/sql.py
+++ b/src/pydiverse/transform/_internal/backend/sql.py
@@ -888,7 +888,7 @@ def _all(x):
     def _count(x=None):
         return sqa.func.count(x)
 
-    @impl(ops.len)
+    @impl(ops.count_star)
     def _len():
         return sqa.func.count()
diff --git a/src/pydiverse/transform/_internal/ops/ops/aggregation.py b/src/pydiverse/transform/_internal/ops/ops/aggregation.py
index e1bd9a0..134d3e9 100644
--- a/src/pydiverse/transform/_internal/ops/ops/aggregation.py
+++ b/src/pydiverse/transform/_internal/ops/ops/aggregation.py
@@ -44,4 +44,6 @@ def __init__(
 
 count = Aggregation("count", Signature(D, return_type=Int()))
 
-len = Aggregation("len", Signature(return_type=Int()), generate_expr_method=False)
+count_star = Aggregation(
+    "count", Signature(return_type=Int()), generate_expr_method=False
+)
diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py
index 7a927eb..260895c 100644
--- a/src/pydiverse/transform/_internal/pipe/functions.py
+++ b/src/pydiverse/transform/_internal/pipe/functions.py
@@ -84,6 +84,14 @@ def coalesce(arg: ColExpr, *args: ColExpr) -> ColExpr:
     return ColFn(ops.coalesce, arg, *args)
 
 
+def count(
+    *,
+    partition_by: Col | ColName | Iterable[Col | ColName] | None = None,
+    filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None,
+) -> ColExpr[Int]:
+    return ColFn(ops.count_star, partition_by=partition_by, filter=filter)
+
+
 def dense_rank(
     *,
     partition_by: Col | ColName | Iterable[Col | ColName] | None = None,
@@ -148,14 +156,6 @@ def min(arg: ColExpr, *args: ColExpr) -> ColExpr:
     return ColFn(ops.horizontal_min, arg, *args)
 
 
-def len(
-    *,
-    partition_by: Col | ColName | Iterable[Col | ColName] | None = None,
-    filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None,
-) -> ColExpr[Int]:
-    return ColFn(ops.len, partition_by=partition_by, filter=filter)
-
-
 def rank(
     *,
     partition_by: Col | ColName | Iterable[Col | ColName] | None = None,
diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py
index ffbde6a..6bc2a7e 100644
--- a/src/pydiverse/transform/_internal/tree/col_expr.py
+++ b/src/pydiverse/transform/_internal/tree/col_expr.py
@@ -924,7 +924,7 @@ def __init__(self, op: Operator, *args: ColExpr, **kwargs: list[ColExpr | Order]
 
         if filters := self.context_kwargs.get("filter"):
             if len(self.args) == 0:
-                assert self.op.name == "len"
+                assert self.op == ops.count_star
             else:
                 self.args[0] = CaseExpr(
                     [
diff --git a/src/pydiverse/transform/extended.py b/src/pydiverse/transform/extended.py
index 2a85a99..f7d2ad8 100644
--- a/src/pydiverse/transform/extended.py
+++ b/src/pydiverse/transform/extended.py
@@ -6,6 +6,7 @@
     all,
     any,
     coalesce,
+    count,
     dense_rank,
     lit,
     max,
@@ -22,6 +23,7 @@
 __all__ = __common + [
     "any",
     "all",
+    "count",
     "sum",
     "filter",
     "coalesce",
diff --git a/tests/test_backend_equivalence/test_ops/test_functions.py b/tests/test_backend_equivalence/test_ops/test_functions.py
index 9a4df0f..f03b942 100644
--- a/tests/test_backend_equivalence/test_ops/test_functions.py
+++ b/tests/test_backend_equivalence/test_ops/test_functions.py
@@ -14,7 +14,7 @@ def test_count(df4):
         lambda t: t
         >> mutate(**{col.name + "_count": col.count() for col in t})
         >> mutate(o=LiteralCol(0).count(filter=t.col3 == 2))
-        >> mutate(u=pdt.len(), v=pdt.len(filter=t.col4 > 0)),
+        >> mutate(u=pdt.count(), v=pdt.count(filter=t.col4 > 0)),
     )
diff --git a/tests/test_backend_equivalence/test_slice_head.py b/tests/test_backend_equivalence/test_slice_head.py
index 837125b..9b62930 100644
--- a/tests/test_backend_equivalence/test_slice_head.py
+++ b/tests/test_backend_equivalence/test_slice_head.py
@@ -137,7 +137,7 @@ def test_with_group_by(df3):
         >> slice_head(1)
         >> alias()
         >> group_by(C.col1)
-        >> mutate(x=pdt.len()),
+        >> mutate(x=pdt.count()),
     )
 
     assert_result_equal(
@@ -159,7 +159,7 @@ def test_with_group_by(df3):
         >> slice_head(4)
         >> alias()
         >> group_by(C.key)
-        >> summarize(x=pdt.len()),
+        >> summarize(x=pdt.count()),
     )
 
 
@@ -170,7 +170,7 @@ def test_with_summarize(df3):
         >> arrange(*t)
         >> slice_head(4)
         >> alias()
-        >> summarize(count=pdt.len()),
+        >> summarize(count=pdt.count()),
     )
 
     assert_result_equal(
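
A quick usage sketch of the renamed `pdt.count()` (based on the signature added in `functions.py` above; the example table is invented):

```python
import pydiverse.transform as pdt
from pydiverse.transform.extended import *

tbl = pdt.Table(dict(g=[1, 1, 2], x=[4.0, None, 6.0]))

# number of rows per group (counts rows, not non-null values)
tbl >> group_by(tbl.g) >> summarize(n=pdt.count()) >> show()

# as a window function, using the partition_by / filter keyword arguments
tbl >> mutate(n=pdt.count(partition_by=tbl.g, filter=tbl.x > 4.0)) >> show()

# the column method count() is unchanged and counts non-null values
tbl >> summarize(n_x=tbl.x.count()) >> show()
```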