From d8492e7701cddd5de44615ea2e155a79b80dd1ba Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Sun, 14 Jul 2024 01:22:20 +0200 Subject: [PATCH 1/3] Try out mkdocs-material --- .github/workflows/docs.yml | 23 + .readthedocs.yml | 14 - docs/Makefile | 20 - docs/api-documentation.md | 3 + .../development.rst => development.md} | 51 +- docs/examples/company-data.md | 128 +++ docs/examples/dates.md | 156 ++++ .../exploration.md} | 223 ++--- docs/examples/twitch.md | 276 ++++++ docs/{source => }/examples/twitch_process.py | 0 docs/{source => }/examples/twitch_report.html | 0 .../examples/twitch_specification.py | 0 docs/{source => }/examples/twitch_upload.py | 0 .../{source => }/examples/twitch_version1.csv | 0 .../{source => }/examples/twitch_version2.csv | 0 docs/{source => }/examples/twitchdata.csv | 0 ...getting_started.rst => getting-started.md} | 129 ++- docs/index.md | 23 + docs/installation.md | 15 + docs/make.bat | 35 - docs/{source/motivation.rst => motivation.md} | 17 +- docs/{source => }/report_failing_query1.png | Bin docs/{source => }/report_failing_query2.png | Bin docs/source/conf.py | 44 - docs/source/examples/example.rst | 151 --- docs/source/examples/example_dates.rst | 169 ---- docs/source/examples/example_twitch.rst | 332 ------- docs/source/examples/examples.rst | 7 - docs/source/index.rst | 23 - docs/source/installation.rst | 14 - docs/source/testing.rst | 323 ------- docs/testing.md | 298 ++++++ mkdocs.yml | 68 ++ pixi.lock | 880 ++++++++++-------- pixi.toml | 13 +- 35 files changed, 1675 insertions(+), 1760 deletions(-) create mode 100644 .github/workflows/docs.yml delete mode 100644 .readthedocs.yml delete mode 100644 docs/Makefile create mode 100644 docs/api-documentation.md rename docs/{source/development.rst => development.md} (66%) create mode 100644 docs/examples/company-data.md create mode 100644 docs/examples/dates.md rename docs/{source/examples/example_exploration.rst => examples/exploration.md} (55%) create mode 100644 docs/examples/twitch.md rename docs/{source => }/examples/twitch_process.py (100%) rename docs/{source => }/examples/twitch_report.html (100%) rename docs/{source => }/examples/twitch_specification.py (100%) rename docs/{source => }/examples/twitch_upload.py (100%) rename docs/{source => }/examples/twitch_version1.csv (100%) rename docs/{source => }/examples/twitch_version2.csv (100%) rename docs/{source => }/examples/twitchdata.csv (100%) rename docs/{source/getting_started.rst => getting-started.md} (72%) create mode 100644 docs/index.md create mode 100644 docs/installation.md delete mode 100644 docs/make.bat rename docs/{source/motivation.rst => motivation.md} (68%) rename docs/{source => }/report_failing_query1.png (100%) rename docs/{source => }/report_failing_query2.png (100%) delete mode 100644 docs/source/conf.py delete mode 100644 docs/source/examples/example.rst delete mode 100644 docs/source/examples/example_dates.rst delete mode 100644 docs/source/examples/example_twitch.rst delete mode 100644 docs/source/examples/examples.rst delete mode 100644 docs/source/index.rst delete mode 100644 docs/source/installation.rst delete mode 100644 docs/source/testing.rst create mode 100644 docs/testing.md create mode 100644 mkdocs.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..527937cb --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,23 @@ +name: Docs +on: + pull_request: + push: + branches: + - main +permissions: + contents: write +jobs: + docs: + runs-on: ubuntu-latest + 
steps: + - name: Checkout branch + uses: actions/checkout@v4 + - name: Set up pixi + uses: prefix-dev/setup-pixi@ba3bb36eb2066252b2363392b7739741bb777659 + with: + environments: docs + - name: Build docs + run: pixi run -e docs docs-build + - name: Deploy docs + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + run: pixi run -e docs mkdocs gh-deploy --force diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index f2e6c0ae..00000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,14 +0,0 @@ -version: 2 -build: - os: ubuntu-20.04 - tools: - python: mambaforge-latest - commands: - - mamba install -c conda-forge -c nodefaults pixi - - pixi run -e docs postinstall - - pixi run -e docs docs - - pixi run -e docs readthedocs -sphinx: - configuration: docs/source/conf.py -formats: - - pdf diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d0c3cbf1..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api-documentation.md b/docs/api-documentation.md new file mode 100644 index 00000000..23122637 --- /dev/null +++ b/docs/api-documentation.md @@ -0,0 +1,3 @@ +# API Documentation + +::: datajudge diff --git a/docs/source/development.rst b/docs/development.md similarity index 66% rename from docs/source/development.rst rename to docs/development.md index 6c2395ca..595cb5cf 100644 --- a/docs/source/development.rst +++ b/docs/development.md @@ -1,55 +1,52 @@ -Development -=========== +# Development ``datajudge`` development relies on [pixi](https://pixi.sh/latest/). In order to work on ``datajudge``, you can create a development environment as follows: -:: - - git clone https://github.com/Quantco/datajudge - cd datajudge - pixi run postinstall +```bash +git clone https://github.com/Quantco/datajudge +cd datajudge +pixi run postinstall +``` Unit tests can be run by executing -:: - - pixi run test +```bash +pixi run test +``` Integration tests are run against a specific backend at a time. As of now, we provide helper scripts to spin up either a Postgres or MSSQL backend. To run integration tests against Postgres, first start a docker container with a Postgres database: -:: - - ./start_postgres.sh +```bash +./start_postgres.sh +``` In your current environment, install the ``psycopg2`` package. After this, you may execute integration tests as follows: -:: - - pixi run -e postgres-py38 test +```bash +pixi run -e postgres-py38 test +``` Analogously, for MSSQL, run -:: - - ./start_mssql.sh +```bash +./start_mssql.sh +``` and -:: - - pixi run -e mssql-py310 test +```bash +pixi run -e mssql-py310 test +``` or - -:: - - pixi run -e mssql-py310 test_freetds - +```bash +pixi run -e mssql-py310 test_freetds +``` depending on the driver you'd like to use. 
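
For reference, the dockerized Postgres instance spun up by ``./start_postgres.sh`` is the same one used by the Twitch example in these docs. A connection to it can be created along the following lines (a sketch; ``DB_ADDR`` and the credentials mirror the helper script's defaults):

```python
import os

import sqlalchemy as sa

# The host can be overridden via the DB_ADDR environment variable,
# e.g. when the database does not run on localhost.
address = os.environ.get("DB_ADDR", "localhost")
connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge"
engine = sa.create_engine(connection_string)
```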
diff --git a/docs/examples/company-data.md b/docs/examples/company-data.md
new file mode 100644
index 00000000..1090840d
--- /dev/null
+++ b/docs/examples/company-data.md
@@ -0,0 +1,128 @@
# Company data

To get started, we will create a sample database using sqlite that contains a list of companies.

The table "companies_archive" contains three entries:

**companies_archive**

| id | name    | num_employees |
|----|---------|---------------|
| 1  | QuantCo | 90            |
| 2  | Google  | 140,000       |
| 3  | BMW     | 110,000       |

While "companies" contains an additional entry:

**companies**

| id | name    | num_employees |
|----|---------|---------------|
| 1  | QuantCo | 100           |
| 2  | Google  | 150,000       |
| 3  | BMW     | 120,000       |
| 4  | Apple   | 145,000       |

```python
import sqlalchemy as sa

eng = sa.create_engine('sqlite:///example.db')

with eng.connect() as con:
    con.execute("CREATE TABLE companies (id INTEGER PRIMARY KEY, name TEXT, num_employees INTEGER)")
    con.execute("INSERT INTO companies (name, num_employees) VALUES ('QuantCo', 100), ('Google', 150000), ('BMW', 120000), ('Apple', 145000)")
    con.execute("CREATE TABLE companies_archive (id INTEGER PRIMARY KEY, name TEXT, num_employees INTEGER)")
    con.execute("INSERT INTO companies_archive (name, num_employees) VALUES ('QuantCo', 90), ('Google', 140000), ('BMW', 110000)")
```

As an example, we will run 4 tests on this table:

1. Does the table "companies" contain a column named "name"?
2. Does the table "companies" contain at least 1 entry with the name "QuantCo"?
3. Does the column "num_employees" of the "companies" table have all positive values?
4. Does the column "name" of the table "companies" contain at least all the values of
   the corresponding column in "companies_archive"?

```python
import pytest
import sqlalchemy as sa

from datajudge import (
    Condition,
    WithinRequirement,
    BetweenRequirement,
)
from datajudge.pytest_integration import collect_data_tests


# We create a Requirement, within a table. This object will contain
# all the constraints we want to test on the specified table.
# To test another table or test the same table against another table,
# we would create another Requirement object.
companies_req = WithinRequirement.from_table(
    db_name="example", schema_name=None, table_name="companies"
)

# Constraint 1: Does the table "companies" contain a column named "name"?
companies_req.add_column_existence_constraint(columns=["name"])

# Constraint 2: Does the table "companies" contain at least 1 entry with the name "QuantCo"?
condition = Condition(raw_string="name = 'QuantCo'")
companies_req.add_n_rows_min_constraint(n_rows_min=1, condition=condition)

# Constraint 3: Does the column "num_employees" of the "companies" table have all
# positive values?
companies_req.add_numeric_min_constraint(column="num_employees", min_value=1)

# We create a new Requirement, this time between different tables.
# Concretely, we intend to test constraints between the table "companies"
# and the table "companies_archive".
companies_between_req = BetweenRequirement.from_tables(
    db_name1="example",
    schema_name1=None,
    table_name1="companies",
    db_name2="example",
    schema_name2=None,
    table_name2="companies_archive",
)

# Constraint 4: Does the column "name" of the table "companies" contain at least all
# the values of the corresponding column in "companies_archive"?
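# A constant_max_missing_fraction of 0 means we tolerate no violations:
# every "name" value present in "companies_archive" must also show up
# in "companies".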
companies_between_req.add_row_superset_constraint(
    columns1=['name'], columns2=['name'], constant_max_missing_fraction=0
)

# collect_data_tests expects a pytest fixture with the name
# "datajudge_engine" that is a SQLAlchemy engine

@pytest.fixture()
def datajudge_engine():
    return sa.create_engine("sqlite:///example.db")

# We gather our distinct Requirements in a list.
requirements = [companies_req, companies_between_req]

# "collect_data_tests" takes all requirements and turns their respective
# Constraints into individual tests. pytest will be able to pick
# up these tests.
test_constraint = collect_data_tests(requirements)
```

Saving this file as ``specification.py`` and running ``$ pytest specification.py``
will verify that all constraints are satisfied. The output you see in the terminal
should be similar to this:

```
=================================== test session starts ===================================
...
collected 4 items

specification.py::test_constraint[ColumnExistence::companies] PASSED                [ 25%]
specification.py::test_constraint[NRowsMin::companies] PASSED                       [ 50%]
specification.py::test_constraint[NumericMin::companies] PASSED                     [ 75%]
specification.py::test_constraint[RowSuperset::companies|companies_archive] PASSED  [100%]

==================================== 4 passed in 0.31s ====================================
```

You can also generate a formatted HTML report by using the ``--html=report.html`` flag.
diff --git a/docs/examples/dates.md b/docs/examples/dates.md
new file mode 100644
index 00000000..897c18ad
--- /dev/null
+++ b/docs/examples/dates.md
@@ -0,0 +1,156 @@
# Dates

This example concerns itself with expressing ``Constraint``\s against data revolving
around dates. While date ``Constraint``\s between tables exist, we will only illustrate
``Constraint``\s on a single table and reference values here. As a consequence, we will
only use ``WithinRequirement``, as opposed to ``BetweenRequirement``.

Concretely, we will assume a table containing prices for a given product of id 1.
Importantly, these prices are valid for a certain date range only. More precisely,
we assume that the price for a product - identified via the ``product_id`` column -
is indicated in the ``price`` column, the date from which it is valid - the date
itself included - in ``date_from`` and the date until when it is valid - the date
itself included - in the ``date_to`` column.

Such a table might look as follows:

**prices**

| product_id | price | date_from | date_to  |
|------------|-------|-----------|----------|
| 1          | 13.99 | 22/01/01  | 22/01/10 |
| 1          | 14.5  | 22/01/11  | 22/01/17 |
| 1          | 13.37 | 22/01/16  | 22/01/31 |

Given this table, we would like to ensure - for the sake of illustration -
that 6 constraints are satisfied:

1. All values from column ``date_from`` should be in January 2022.
2. All values from column ``date_to`` should be in January 2022.
3. The minimum value in column ``date_from`` should be the first of January 2022.
4. The maximum value in column ``date_to`` should be the 31st of January 2022.
5. There is no gap between ``date_from`` and ``date_to``. In other words, every date
   of January has to be assigned to at least one row for a given product.
6. There is no overlap between ``date_from`` and ``date_to``. In other words, every
   date of January has to be assigned to at most one row for a given product.

Assuming that such a table exists in a database, we can write a specification against it.
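
If you would like to follow along locally, such a table can be created with sqlite,
analogously to the company data example. The following is merely a sketch for
illustration: the column types and the ``'YYYYMMDD'`` date literals are assumptions
chosen to match the reference values used in the specification below.

```python
import sqlalchemy as sa

eng = sa.create_engine('sqlite:///example.db')

with eng.connect() as con:
    # Dates are stored as 'YYYYMMDD' strings, matching the reference values below.
    con.execute("CREATE TABLE prices (product_id INTEGER, price REAL, date_from TEXT, date_to TEXT)")
    con.execute("INSERT INTO prices (product_id, price, date_from, date_to) VALUES (1, 13.99, '20220101', '20220110'), (1, 14.5, '20220111', '20220117'), (1, 13.37, '20220116', '20220131')")
```

With such a table in place, the specification reads as follows: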
```python
import pytest
import sqlalchemy as sa

from datajudge import WithinRequirement
from datajudge.pytest_integration import collect_data_tests

# We create a Requirement, within a table. This object will contain
# all the constraints we want to test on the specified table.
# To test another table or test the same table against another table,
# we would create another Requirement object.
prices_req = WithinRequirement.from_table(
    db_name="example", schema_name=None, table_name="prices"
)

# Constraint 1:
# All values from column date_from should be in January 2022.
prices_req.add_date_between_constraint(
    column="date_from",
    lower_bound="'20220101'",
    upper_bound="'20220131'",
    # We don't tolerate any violations of the constraint:
    min_fraction=1,
)

# Constraint 2:
# All values from column date_to should be in January 2022.
prices_req.add_date_between_constraint(
    column="date_to",
    lower_bound="'20220101'",
    upper_bound="'20220131'",
    # We don't tolerate any violations of the constraint:
    min_fraction=1,
)

# Constraint 3:
# The minimum value in column date_from should be the first of January 2022.

# Ensure that the minimum is smaller or equal to the reference value min_value.
prices_req.add_date_min_constraint(column="date_from", min_value="'20220101'")
# Ensure that the minimum is greater or equal to the reference value min_value.
prices_req.add_date_min_constraint(
    column="date_from",
    min_value="'20220101'",
    use_upper_bound_reference=True,
)

# Constraint 4:
# The maximum value in column date_to should be the 31st of January 2022.

# Ensure that the maximum is greater or equal to the reference value max_value.
prices_req.add_date_max_constraint(column="date_to", max_value="'20220131'")
# Ensure that the maximum is smaller or equal to the reference value max_value.
prices_req.add_date_max_constraint(
    column="date_to",
    max_value="'20220131'",
    use_upper_bound_reference=True,
)

# Constraint 5:
# There is no gap between date_from and date_to. In other words, every date
# of January has to be assigned to at least one row for a given product.
prices_req.add_date_no_gap_constraint(
    start_column="date_from",
    end_column="date_to",
    # We don't want a gap of price date ranges for a given product.
    # For different products, we allow arbitrary date gaps.
    key_columns=["product_id"],
    # As indicated in prose, date_from and date_to are included in ranges.
    end_included=True,
    # Again, we don't expect any violations of our constraint.
    max_relative_violations=0,
)

# Constraint 6:
# There is no overlap between date_from and date_to. In other words, every date
# of January has to be assigned to at most one row for a given product.
prices_req.add_date_no_overlap_constraint(
    start_column="date_from",
    end_column="date_to",
    # We want no overlap of price date ranges for a given product.
    # For different products, we allow arbitrary date overlaps.
    key_columns=["product_id"],
    # As indicated in prose, date_from and date_to are included in ranges.
    end_included=True,
    # Again, we don't expect any violations of our constraint.
    max_relative_violations=0,
)

@pytest.fixture()
def datajudge_engine():
    # TODO: Insert actual connection string
    return sa.create_engine("your_db://")

# We gather our single Requirement in a list.
requirements = [prices_req]

# "collect_data_tests" takes all requirements and turns their respective
# Constraints into individual tests. pytest will be able to pick
# up these tests.
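# As in the company data example, saving this specification as e.g.
# specification.py and running `pytest specification.py` will execute
# one test per constraint added above.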
test_constraint = collect_data_tests(requirements)
```

Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist
in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``.
As the names suggest, these can operate in 'two date dimensions'.

For example, let's assume a table with four date columns, representing two
ranges in distinct dimensions, respectively:

* ``date_from``: Date from when a price is valid
* ``date_to``: Date until when a price is valid
* ``date_definition_from``: Date when a price definition was inserted
* ``date_definition_to``: Date until when a price definition was used

Analogously to the unidimensional scenario illustrated here, one might care
for certain constraints in two dimensions.
diff --git a/docs/source/examples/example_exploration.rst b/docs/examples/exploration.md
similarity index 55%
rename from docs/source/examples/example_exploration.rst
rename to docs/examples/exploration.md
index cfee2734..33375a1a 100644
--- a/docs/source/examples/example_exploration.rst
+++ b/docs/examples/exploration.md
@@ -1,6 +1,4 @@
-Example: Exploration
-====================
-
+# Exploration

While datajudge seeks to tackle the use case of expressing and evaluating tests against
data, its fairly generic inner workings allow for using it in a rather explorative
workflow as well.

Let's first clarify terminology by exemplifying both scenarios. A person wishing to test
data might ask the question

> Has the number of rows not grown too much from version 1 of the table to version 2 of the table?

whereas a person wishing to explore the data might ask the question

> By how much has the number of rows grown from version 1 to version 2 of the table?

Put differently, a test typically revolves around a binary outcome while an exploration
usually doesn't.

the hood. Importantly, ``Constraint`` s typically come with a ``retrieve`` method as well
as ``get_factual_value`` and ``get_target_value`` methods. The latter returns either the
value retrieved from the second ``DataSource`` in the case of a ``BetweenRequirement`` or the
key reference value in the case of a ``WithinRequirement``.

Moreover, as is the case when using datajudge for testing purposes, these approaches rely
-on a `sqlalchemy engine <https://docs.sqlalchemy.org/en/14/core/connections.html>`_. The
+on a [sqlalchemy engine](https://docs.sqlalchemy.org/en/14/core/connections.html). The
latter is the gateway to the database at hand.

## Example 1: Comparing numbers of rows

Assume we have two tables in the same database called ``table1`` and ``table2``.
Now we would like to compare their numbers of rows. Naturally, we would like to retrieve
the respective numbers of rows before we can compare them. For this purpose we create
a ``BetweenRequirement`` referring to both tables and add a ``NRowsEquality``
``Constraint`` onto it.

-
-.. code-block:: python
-
-    import sqlalchemy as sa
-    from datajudge import BetweenRequirement
-
-    engine = sa.create_engine(your_connection_string)
-    req = BetweenRequirement.from_tables(
-        db_name,
-        schema_name,
-        "table1",
-        db_name,
-        schema_name,
-        "table2",
-    )
-    req.add_n_rows_equality_constraint()
-    n_rows1 = req[0].get_factual_value(engine)
-    n_rows2 = req[0].get_target_value(engine)
-
+```python
+import sqlalchemy as sa
+from datajudge import BetweenRequirement
+
+engine = sa.create_engine(your_connection_string)
+req = BetweenRequirement.from_tables(
+    db_name,
+    schema_name,
+    "table1",
+    db_name,
+    schema_name,
+    "table2",
+)
+req.add_n_rows_equality_constraint()
+n_rows1 = req[0].get_factual_value(engine)
+n_rows2 = req[0].get_target_value(engine)
+```

Note that here, we access the first (and only) ``Constraint`` that has been added to
the ``BetweenRequirement`` by writing ``req[0]``. ``Requirement`` s are sequences of
``Constraint`` s.

Once the numbers of rows are retrieved, we can compare them as we wish. For instance, we
could compute the absolute and relative growth (or loss) of numbers of rows from
``table1`` to ``table2``:

-.. code-block:: python
-
-    absolute_change = abs(n_rows2 - n_rows1)
-    relative_change = (absolute_change) / n_rows1 if n_rows1 != 0 else None
-
+```python
+absolute_change = abs(n_rows2 - n_rows1)
+relative_change = absolute_change / n_rows1 if n_rows1 != 0 else None
+```

Importantly, many datajudge staples, such as ``Condition`` s, can be used, too. We shall
see this in our next example.

## Example 2: Investigating unique values

In this example we will suppose that there is a table called ``table`` consisting of
several columns. Two of its columns are supposed to be called ``col_int`` and
``col_varchar``. We are now interested in the unique values in these two columns
combined. Put differently, we are wondering:

> Which unique pairs of values in ``col_int`` and ``col_varchar`` have we encountered?

To add to the mix, we will moreover only be interested in tuples in which ``col_int`` has a
value of larger than 10.

As before, we will start off by creating a ``Requirement``. Since we are only dealing
with a single table this time, we will create a ``WithinRequirement``.

+```python
+import sqlalchemy as sa
+from datajudge import WithinRequirement, Condition
+
+engine = sa.create_engine(your_connection_string)
+
+req = WithinRequirement.from_table(
+    db_name,
+    schema_name,
+    "table",
+)
+
+condition = Condition(raw_string="col_int >= 10")
+
+req.add_uniques_equality_constraint(
+    columns=["col_int", "col_varchar"],
+    uniques=[],  # This is really just a placeholder.
+    condition=condition,
+)
+uniques = req[0].get_factual_value(engine)
+```

If one were to investigate this ``uniques`` variable further, one could, e.g., see the
following:

-
-.. code-block:: python
-
-    ([(10, 'hi10'), (11, 'hi11'), (12, 'hi12'), (13, 'hi13'), (14, 'hi14'), (15, 'hi15'), (16, 'hi16'), (17, 'hi17'), (18, 'hi18'), (19, 'hi19')], [1, 100, 12, 1, 7, 8, 1, 1, 1337, 1])
-
+```python
+([(10, 'hi10'), (11, 'hi11'), (12, 'hi12'), (13, 'hi13'), (14, 'hi14'), (15, 'hi15'), (16, 'hi16'), (17, 'hi17'), (18, 'hi18'), (19, 'hi19')], [1, 100, 12, 1, 7, 8, 1, 1, 1337, 1])
+```

This becomes easier to parse when inspecting the underlying ``retrieve`` method of the
``UniquesEquality`` ``Constraint``: the first value of the tuple corresponds to the list

can use the fact that ``retrieve`` methods typically return an actual result or value
as well as the sqlalchemy selections that led to said result or value. We can use these
selections and compile them to a standard, textual SQL query:

-.. code-block:: python
-
-    values, selections = req[0].retrieve(engine, constraint.ref)
-    print(str(selections[0].compile(engine, compile_kwargs={"literal_binds": True}))
-
+```python
+values, selections = req[0].retrieve(engine, req[0].ref)
+print(str(selections[0].compile(engine, compile_kwargs={"literal_binds": True})))
+```

In the case from above, this would return the following query:

-.. code-block:: sql
-
-    SELECT
-        anon_1.col_int,
-        anon_1.col_varchar,
-        count(*) AS count_1
-    FROM
-        (SELECT
-            tempdb.dbo.table.col_int AS col_int,
-            tempdb.dbo.table.col_varchar AS col_varchar
-        FROM
-            tempdb.dbo.table WITH (NOLOCK)
-        WHERE col_int >= 10) AS anon_1
-    GROUP BY anon_1.col_int, anon_1.col_varchar
+```sql
+SELECT
+    anon_1.col_int,
+    anon_1.col_varchar,
+    count(*) AS count_1
+FROM
+    (SELECT
+        tempdb.dbo.table.col_int AS col_int,
+        tempdb.dbo.table.col_varchar AS col_varchar
+    FROM
+        tempdb.dbo.table WITH (NOLOCK)
+    WHERE col_int >= 10) AS anon_1
+GROUP BY anon_1.col_int, anon_1.col_varchar
+```

## Example 3: Comparing column structure

While we often care about value tuples of given columns, i.e. rows, it can also provide
meaningful insights to compare the column structure of two tables. In particular, we

method to retrieve the values of interest for the first table passed to the
``BetweenRequirement`` and the ``get_target_value`` method for the second table passed
to the ``BetweenRequirement``.

-.. code-block:: python
-
-    import sqlalchemy as sa
-    from datajudge import BetweenRequirement
-
-    engine = sa.create_engine(your_connection_string)
-
-    req = BetweenRequirement.from_tables(
-        db_name,
-        schema_name,
-        "table1",
-        db_name,
-        schema_name,
-        "table2",
-    )
-
-    req.add_column_subset_constraint()
-
-    columns1 = req[0].get_factual_value(engine)
-    columns2 = req[0].get_target_value(engine)
-
-    print(f"Columns present in both: {set(columns1) & set(columns2)}")
-    print(f"Columns present in only table1: {set(columns1) - set(columns2)}")
-    print(f"Columns present in only table2: {set(columns2) - set(columns1)}")
+```python
+import sqlalchemy as sa
+from datajudge import BetweenRequirement
+
+engine = sa.create_engine(your_connection_string)
+
+req = BetweenRequirement.from_tables(
+    db_name,
+    schema_name,
+    "table1",
+    db_name,
+    schema_name,
+    "table2",
+)
+
+req.add_column_subset_constraint()
+
+columns1 = req[0].get_factual_value(engine)
+columns2 = req[0].get_target_value(engine)
+
+print(f"Columns present in both: {set(columns1) & set(columns2)}")
+print(f"Columns present in only table1: {set(columns1) - set(columns2)}")
+print(f"Columns present in only table2: {set(columns2) - set(columns1)}")
+```

This could, for instance, result in the following printout:

-.. code-block::
-
-    Columns present in both: {'col_varchar', 'col_int'}
-    Columns present in only table1: set()
-    Columns present in only table2: {'col_date'}
-
+```
+Columns present in both: {'col_varchar', 'col_int'}
+Columns present in only table1: set()
+Columns present in only table2: {'col_date'}
+```

Now, we can investigate the types of the columns present in both tables:

-.. code-block:: python
-
-    for column in set(columns1) & set(columns2):
-        req.add_column_type_constraint(column1=column, column2=column)
-    type1 = req[0].get_factual_value(engine)
-    type2 = req[0].get_target_value(engine)
-    print(f"Column '{column}' has type '{type1}' in table1 and type '{type2}' in table2.")
-
+```python
+for column in set(columns1) & set(columns2):
+    req.add_column_type_constraint(column1=column, column2=column)
+    # The constraint just added is the last element of the Requirement.
+    type1 = req[-1].get_factual_value(engine)
+    type2 = req[-1].get_target_value(engine)
+    print(f"Column '{column}' has type '{type1}' in table1 and type '{type2}' in table2.")
+```

Depending on the underlying database management system and data, the output of this
could for instance be:

-.. code-block::
-
-    Column 'col_varchar' has type 'varchar' in table1 and type 'varchar' in table2.
-    Column 'col_int' has type 'integer' in table1 and type 'integer' in table2.
+```
+Column 'col_varchar' has type 'varchar' in table1 and type 'varchar' in table2.
+Column 'col_int' has type 'integer' in table1 and type 'integer' in table2.
+```
diff --git a/docs/examples/twitch.md b/docs/examples/twitch.md
new file mode 100644
index 00000000..2e7a6534
--- /dev/null
+++ b/docs/examples/twitch.md
@@ -0,0 +1,276 @@
# Dumps of Twitch data

This example is based on data capturing statistics and properties of popular Twitch channels.
The setup is such that we have two data sets 'of the same kind' but from different points in time.

In other words, a 'version' of the data set represents a temporal notion.
For example, version 1 might stem from the end of March and version 2 from the end of April.
Moreover, we will assume that the first, version 1, has been vetted and approved with the
help of manual investigation and domain knowledge. The second data set, version 2, has just been
made available. We would like to use it but can't be sure of its validity just yet.
As a consequence, we would like to assess the quality of the data in version 2.

In order to have a Postgres database instance to begin with, it might be useful to use our
[script](https://github.com/Quantco/datajudge/blob/main/start_postgres.sh), spinning up
a dockerized Postgres database:

```bash
./start_postgres.sh
```

The original data set can be found on [kaggle](https://www.kaggle.com/datasets/aayushmishra1512/twitchdata).
For the sake of this tutorial, we slightly process it and provide two versions of it.
One can either recreate this by executing this
[processing script](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_process.py)
oneself on the original data or download our processed files (
[version 1](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_version1.csv)
and
[version 2](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_version2.csv))
right away.

Once both versions of the data exist, they can be uploaded to the database. We provide an
[uploading script](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_upload.py)
creating and populating one table per version of the data in a Postgres database. It resembles the
following:

```python
import os

import pandas as pd
import sqlalchemy as sa

# df_v1 and df_v2 are pandas DataFrames holding the two versions of the data,
# e.g. read via pd.read_csv from twitch_version1.csv and twitch_version2.csv.
address = os.environ.get("DB_ADDR", "localhost")
connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge"
engine = sa.create_engine(connection_string)
df_v2.to_sql("twitch_v2", engine, schema="public", if_exists="replace")
df_v1.to_sql("twitch_v1", engine, schema="public", if_exists="replace")
```

Once the tables are stored in a database, we can actually write a ``datajudge``
specification against them. But first, we'll have a look at what the data roughly
looks like by investigating a random sample of four rows:

**A sample of the data**

| channel  | watch_time | stream_time | peak_viewers | average_viewers | followers | followers_gained | views_gained | partnered | mature | language   |
|----------|------------|-------------|--------------|-----------------|-----------|------------------|--------------|-----------|--------|------------|
| xQcOW    | 6196161750 | 215250      | 222720       | 27716           | 3246298   | 1734810          | 93036735     | True      | False  | English    |
| summit1g | 6091677300 | 211845      | 310998       | 25610           | 5310163   | 1374810          | 89705964     | True      | False  | English    |
| Gaules   | 5644590915 | 515280      | 387315       | 10976           | 1767635   | 1023779          | 102611607    | True      | True   | Portuguese |
| ESL_CSGO | 3970318140 | 517740      | 300575       | 7714            | 3944850   | 703986           | 106546942    | True      | False  | English    |

Note that we expect both version 1 and version 2 to follow this structure. Due to them
being assembled at different points in time, merely their rows should differ.

Now let's write an actual specification, expressing our expectations against the data.
First, we need to make sure a connection to the database can be established at test execution
time. How this is done exactly depends on how you set up your database. When using our
default setup via ``./start_postgres.sh``, this would look as follows:

```python
import os
import pytest
import sqlalchemy as sa


@pytest.fixture(scope="module")
def datajudge_engine():
    address = os.environ.get("DB_ADDR", "localhost")
    connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge"
    return sa.create_engine(connection_string)
```

Once a way to connect to the database is defined, we want to declare our data sources and
express expectations against them.
In this example, we have two tables in the same database -
one table per version of the Twitch data.

Yet, let's start with a straightforward example only using version 2. We want to use our
domain knowledge that the values of the ``language`` column should only contain letters
and have a length strictly larger than 0.

```python
from datajudge import WithinRequirement


# Postgres' default database.
db_name = "tempdb"
# Postgres' default schema.
schema_name = "public"

within_requirement = WithinRequirement.from_table(
    table_name="twitch_v2",
    schema_name=schema_name,
    db_name=db_name,
)
within_requirement.add_varchar_regex_constraint(
    column="language",
    regex="^[a-zA-Z]+$",
)
```

Done! Now onto comparisons between the table representing the approved version 1 of the
data and the to-be-assessed version 2 of the data.

```python
from datajudge import BetweenRequirement, Condition

between_requirement_version = BetweenRequirement.from_tables(
    db_name1=db_name,
    db_name2=db_name,
    schema_name1=schema_name,
    schema_name2=schema_name,
    table_name1="twitch_v1",
    table_name2="twitch_v2",
)
between_requirement_version.add_column_subset_constraint()
between_requirement_version.add_column_superset_constraint()
columns = ["channel", "partnered", "mature"]
between_requirement_version.add_row_subset_constraint(
    columns1=columns, columns2=columns, constant_max_missing_fraction=0
)
between_requirement_version.add_row_matching_equality_constraint(
    matching_columns1=["channel"],
    matching_columns2=["channel"],
    comparison_columns1=["language"],
    comparison_columns2=["language"],
    max_missing_fraction=0,
)

between_requirement_version.add_ks_2sample_constraint(
    column1="average_viewers",
    column2="average_viewers",
    significance_level=0.05,
)
between_requirement_version.add_uniques_equality_constraint(
    columns1=["language"],
    columns2=["language"],
)
```

Now having compared the 'same kind of data' between version 1 and version 2,
we may as well compare 'different kind of data' within version 2, as a means of
a sanity check. This sanity check consists of checking whether the mean
``average_viewers`` value of mature channels deviates by at most 10% from
the overall mean.

```python
between_requirement_columns = BetweenRequirement.from_tables(
    db_name1=db_name,
    db_name2=db_name,
    schema_name1=schema_name,
    schema_name2=schema_name,
    table_name1="twitch_v2",
    table_name2="twitch_v2",
)

between_requirement_columns.add_numeric_mean_constraint(
    column1="average_viewers",
    column2="average_viewers",
    condition1=None,
    condition2=Condition(raw_string="mature IS TRUE"),
    max_absolute_deviation=0.1,
)
```

Lastly, we need to collect all of our requirements in a list and make sure
``pytest`` can find them by calling ``collect_data_tests``.
```python
from datajudge.pytest_integration import collect_data_tests

requirements = [
    within_requirement,
    between_requirement_version,
    between_requirement_columns,
]
test_func = collect_data_tests(requirements)
```

If we then test these expectations against the data by running

```bash
pytest twitch_specification.py
```

-- where ``twitch_specification.py`` contains all of the code outlined before (you can find it
[here](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_specification.py))
-- we see that the new version of the data is not quite on par with what we'd expect:

```bash
$ pytest twitch_specification.py
================================== test session starts ===================================
platform darwin -- Python 3.10.5, pytest-7.1.2, pluggy-1.0.0
rootdir: /Users/kevin/Code/datajudge/docs/source/examples
plugins: html-3.1.1, cov-3.0.0, metadata-2.0.2
collected 8 items

twitch_specification.py F.....FF                                                    [100%]

======================================== FAILURES ========================================
____________________ test_func[VarCharRegex::tempdb.public.twitch_v2] ____________________

constraint =
datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge)

    @pytest.mark.parametrize(
        "constraint", all_constraints, ids=Constraint.get_description
    )
    def test_constraint(constraint, datajudge_engine):
        test_result = constraint.test(datajudge_engine)
>       assert test_result.outcome, test_result.failure_message
E       AssertionError: tempdb.public.twitch_v2's column(s) 'language' breaks regex
        '^[a-zA-Z]+$' in 0.045454545454545456 > 0.0 of the cases. In absolute terms, 1
        of the 22 samples violated the regex. Some counterexamples consist of the
        following: ['Sw3d1zh'].

../../../src/datajudge/pytest_integration.py:25: AssertionError
____________ test_func[UniquesEquality::public.twitch_v1 | public.twitch_v2] _____________

constraint =
datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge)

    @pytest.mark.parametrize(
        "constraint", all_constraints, ids=Constraint.get_description
    )
    def test_constraint(constraint, datajudge_engine):
        test_result = constraint.test(datajudge_engine)
>       assert test_result.outcome, test_result.failure_message
E       AssertionError: tempdb.public.twitch_v1's column(s) 'language' doesn't have
        the element(s) '{'Sw3d1zh'}' when compared with the reference values.

../../../src/datajudge/pytest_integration.py:25: AssertionError
______________ test_func[NumericMean::public.twitch_v2 | public.twitch_v2] _______________

constraint =
datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge)

    @pytest.mark.parametrize(
        "constraint", all_constraints, ids=Constraint.get_description
    )
    def test_constraint(constraint, datajudge_engine):
        test_result = constraint.test(datajudge_engine)
>       assert test_result.outcome, test_result.failure_message
E       AssertionError: tempdb.public.twitch_v2's column(s) 'average_viewers' has
        mean 4734.9780000000000000, deviating more than 0.1 from
        tempdb.public.twitch_v2's column(s) 'average_viewers''s
        3599.9826086956521739. Condition on second table: WHERE mature IS TRUE

../../../src/datajudge/pytest_integration.py:25: AssertionError
================================ short test summary info =================================
FAILED twitch_specification.py::test_func[VarCharRegex::tempdb.public.twitch_v2] - Asse...
FAILED twitch_specification.py::test_func[UniquesEquality::public.twitch_v1 | public.twitch_v2]
FAILED twitch_specification.py::test_func[NumericMean::public.twitch_v2 | public.twitch_v2]
============================== 3 failed, 5 passed in 1.52s ===============================
```

Alternatively, you can also look at these test results in
[this html report](./twitch_report.html)
generated by
[pytest-html](https://github.com/pytest-dev/pytest-html).

Hence we see that we might not want to blindly trust version 2 of the data as is. Rather, we might need
to investigate what is wrong with the data, what caused this and how to fix it.

Concretely, what exactly do we learn from the error messages?

* The column ``language`` now has a row with value ``'Sw3d1zh'``. This breaks two of our
  constraints. The ``VarCharRegex`` constraint compared the column's values to a regular
  expression. The ``UniquesEquality`` constraint expected the unique values of the
  ``language`` column to not have changed between version 1 and version 2.
* The mean value of ``average_viewers`` of ``mature`` channels is substantially - more
  than our 10% tolerance - lower than the global mean.
diff --git a/docs/source/examples/twitch_process.py b/docs/examples/twitch_process.py
similarity index 100%
rename from docs/source/examples/twitch_process.py
rename to docs/examples/twitch_process.py
diff --git a/docs/source/examples/twitch_report.html b/docs/examples/twitch_report.html
similarity index 100%
rename from docs/source/examples/twitch_report.html
rename to docs/examples/twitch_report.html
diff --git a/docs/source/examples/twitch_specification.py b/docs/examples/twitch_specification.py
similarity index 100%
rename from docs/source/examples/twitch_specification.py
rename to docs/examples/twitch_specification.py
diff --git a/docs/source/examples/twitch_upload.py b/docs/examples/twitch_upload.py
similarity index 100%
rename from docs/source/examples/twitch_upload.py
rename to docs/examples/twitch_upload.py
diff --git a/docs/source/examples/twitch_version1.csv b/docs/examples/twitch_version1.csv
similarity index 100%
rename from docs/source/examples/twitch_version1.csv
rename to docs/examples/twitch_version1.csv
diff --git a/docs/source/examples/twitch_version2.csv b/docs/examples/twitch_version2.csv
similarity index 100%
rename from docs/source/examples/twitch_version2.csv
rename to docs/examples/twitch_version2.csv
diff --git a/docs/source/examples/twitchdata.csv b/docs/examples/twitchdata.csv
similarity index 100%
rename from docs/source/examples/twitchdata.csv
rename to docs/examples/twitchdata.csv
diff --git a/docs/source/getting_started.rst b/docs/getting-started.md
similarity index 72%
rename from docs/source/getting_started.rst
rename to docs/getting-started.md
index af41dc89..81ebd5db 100644
--- a/docs/source/getting_started.rst
+++ b/docs/getting-started.md
@@ -1,9 +1,6 @@
-Getting Started
-===============
+# Getting Started

-
-Glossary
---------
+## Glossary

- A ``DataSource`` represents a way to retrieve data from a database. Typically, this corresponds to a table in the database. Yet, it could also be a more elaborate object. See the section on 'Alternative ``DataSource`` s' for more detail.
@@ -17,41 +14,38 @@ Glossary
- turns these ``Requirement`` s' ``Constraint`` s into individual tests
- can be 'tested' by pytest

-Creating a specification
------------------------
+## Creating a specification

In order to get going, you might want to use the following snippet in a new python file.
This file will represent a specification.

```python
import pytest
import sqlalchemy as sa
from datajudge.pytest_integration import collect_data_tests

@pytest.fixture(scope="module")
def datajudge_engine():
    # TODO: Adapt connection string to database at hand.
    return sa.create_engine("your_connection_string")

# TODO: Insert Requirement objects to list.
requirements = []

test_constraints = collect_data_tests(requirements)
```

This file will eventually serve as an input to pytest. More on that in the section
'Testing a specification'.

-In case you haven't worked with sqlalchemy engines before, you might need to install drivers to connect to your database. You might want to install snowflake-sqlalchemy when using Snowflake, pyscopg when using Postgres and platform-specific drivers (`Windows `_, `Linux `_, `macOS `_) when using MSSQL.
+In case you haven't worked with sqlalchemy engines before, you might need to install drivers to connect to your database. You might want to install snowflake-sqlalchemy when using Snowflake, psycopg2 when using Postgres and platform-specific drivers ([Windows](https://docs.microsoft.com/en-us/sql/connect/odbc/windows/microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15), [Linux](https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server?view=sql-server-ver15), [macOS](https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15)) when using MSSQL.

-Specifying Constraints
----------------------
+## Specifying Constraints

In order to discover possible ``Constraint`` s, please investigate the ``_add_*_constraint`` methods
-for :class:`~datajudge.requirements.BetweenRequirement` and :class:`~datajudge.requirements.WithinRequirement` respectively.
+for [`BetweenRequirement`][datajudge.requirements.BetweenRequirement] and [`WithinRequirement`][datajudge.requirements.WithinRequirement] respectively.

These methods are meant to be mostly self-documenting through the usage of expressive
parameters.

see ``tests/unit/test_condition.py``.

Many ``Constraint`` s have optional ``columns`` parameters. If no argument is given, all
available columns will be used.

-Defining limitations of change
------------------------------
+## Defining limitations of change

``BetweenRequirement`` s allow for ``Constraint`` s expressing the limitation of a loss or
gain. For example, the ``NRowsMinGain`` ``Constraint`` expresses by how much the number of
rows must at least grow from the first ``DataSource`` to the second.
In the example of ``NRowsMinGain`` ,

Generally, such relative limitations can be defined in two ways:

The former would translate to

```
#rows_table_2 > (1 + min_relative_gain) * #rows_table_1
```

while the latter would translate to

```
date_growth := (max_date_table_2 - min_date_table_2) / (max_date_table_1 - min_date_table_1)
#rows_table_2 > (1 + date_growth) * #rows_table_1
```

In the latter case a date column must be passed during the instantiation of the
``BetweenRequirement``. Moreover, the ``date_range_*`` arguments must be passed in the
respective ``add_*_constraint`` method. When using date ranges as an indicator of change,
the ``constant_max_*`` argument can safely be ignored. Additionally, a buffer on top of
the date growth can be added with the help of the ``date_range_gain_deviation`` parameter:

```
date_growth := (max_date_table_2 - min_date_table_2) / (max_date_table_1 - min_date_table_1)
#rows_table_2 > (1 + date_growth + date_range_gain_deviation) * #rows_table_1
```

This example revolving around ``NRowsMinGain`` generalizes to many ``Constraint`` s
concerned with growth, gain, loss or shrinkage limitations.

-Testing a specification
-----------------------
+## Testing a specification

In order to test whether the ``Constraint`` s expressed in a specification hold true, you
can simply run

```bash
pytest your_specification.py
```

This will produce results directly in your terminal. If you prefer to additionally
generate a report, you can run

```bash
pytest your_specification.py --html=your_report.html
```

As the testing relies on [pytest](https://docs.pytest.org/en/latest), all of `pytest`'s
features can be used. More on this in the article on [testing](testing.md).

-Test information
-----------------
+## Test information

When calling a ``Constraint``'s ``test`` method, a ``TestResult`` is returned. The latter
comes with a ``logging_message`` field. This field comprises information about the test
failure, the constraint at hand as well as the underlying database queries.

Depending on the use case at hand, it might make sense to rely on this information
for logging or data investigation purposes. Again, more on this in the article on
[testing](testing.md).

-Assertion Message Styling
--------------------------
+## Assertion Message Styling

Constraints can use styling to increase the readability of their assertion messages.
The styling can be set independently of the platform and converted to e.g. ANSI color
codes for command line output or CSS color tags for HTML reports.
The styling tags describe use cases and not concrete colors, so formatters can use
arbitrary color palettes, and these are not fixed by the constraint.

The following table lists all the supported codes, along with their descriptions and
examples of how they can be used:

### Supported styling codes
-.. list-table:: Supported styling codes
-   :header-rows: 1
-
-   * - Code
-     - Description
-     - Example
-   * - `numMatch`
-     - Indicates the part of a number that matches the expected value.
-     - `[numMatch]3.141[/numMatch]`
-   * - `numDiff`
-     - Indicates the part of a number that differs.
-     - `[numDiff]6[/numDiff]`

| Code | Description | Example |
|------|-------------|---------|
| `numMatch` | Indicates the part of a number that matches the expected value. | `[numMatch]3.141[/numMatch]` |
| `numDiff` | Indicates the part of a number that differs. | `[numDiff]6[/numDiff]` |

-Alternative DataSources
---------------------------
+## Alternative DataSources

-A ``Requirement`` is instantiated with either one or two fixed ``DataSource`` s.
+A ``Requirement`` is instantiated with either one or two fixed ``DataSource``s.

While the most typical example of a ``DataSource`` would be a table in a database,
``datajudge`` allows for other ``DataSource`` s as well. These are often derived from
primitive tables of a database.

TODO: FIX TABLE

Typically, a user does not need to instantiate a corresponding ``DataSource`` themselves.
Rather, this is taken care of by using the appropriate constructor for ``WithinRequirement``
or ``BetweenRequirement``.

Note that in principle, several tables can be combined to make up a single ``DataSource``.
Yet, most of the time when trying to compare two tables, it is more convenient to create a
``BetweenRequirement`` and use the ``from_tables`` constructor.

-Column capitalization
---------------------
+## Column capitalization

Different database management systems handle the capitalization of entities, such as
column names, differently. For the time being:

- Postgres: ``datajudge`` expects lowercase column names.
- Snowflake: ``datajudge`` will lowercase independently of the capitalization provided.

The Snowflake behavior is due to an upstream [bug](https://github.com/snowflakedb/snowflake-sqlalchemy/issues/157)
in snowflake-sqlalchemy. This behavior is subject to change.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..cd1e913b
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,23 @@
# datajudge

``datajudge`` allows for assessing whether data from a database complies with reference information.

While meant to be as agnostic to concrete database management systems as possible, ``datajudge`` currently explicitly supports:

- Postgres
- MSSQL
- Snowflake

[API Documentation](api-documentation.md)

## Contents

- [Installation](installation.md)
- [Getting Started](getting-started.md)
- [Testing](testing.md)
- [Motivation](motivation.md)
- Examples:
    - [Company data](examples/company-data.md)
    - [Dates](examples/dates.md)
    - [Exploration](examples/exploration.md)
    - [Dumps of Twitch data](examples/twitch.md)
- [Development](development.md)
- [API Reference](api-documentation.md)
diff --git a/docs/installation.md b/docs/installation.md
new file mode 100644
index 00000000..d841d527
--- /dev/null
+++ b/docs/installation.md
@@ -0,0 +1,15 @@
# Installation

To install, execute

```bash
pip install datajudge
```

or, to install it into a conda or pixi environment,

```bash
pixi add datajudge
# or
conda install datajudge -c conda-forge
```
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 9534b018..00000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found.
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/source/motivation.rst b/docs/motivation.md similarity index 68% rename from docs/source/motivation.rst rename to docs/motivation.md index b4368bdf..fe6695b4 100644 --- a/docs/source/motivation.rst +++ b/docs/motivation.md @@ -1,15 +1,12 @@ -Motivation -========== +# Motivation Ensuring data quality is of great importance for many use cases. ``datajudge`` seeks to make this convenient. -``datajudge`` allows for the expression of expectations held against data stored in databases. In particular, it allows for comparing different ``DataSource`` s. Yet, it also comes with functionalities to compare data from a single ``DataSource`` to fixed reference values derived from explicit domain knowledge. +``datajudge`` allows for the expression of expectations held against data stored in databases. In particular, it allows for comparing different ``DataSource``s. Yet, it also comes with functionalities to compare data from a single ``DataSource`` to fixed reference values derived from explicit domain knowledge. Not trying to reinvent the wheel, ``datajudge`` relies on ``pytest`` to execute the data expectations. - -Comparisons between DataSources -------------------------------- +## Comparisons between DataSources The data generating process can be obscure for a variety of reasons. In such scenarios one might ask the questions of @@ -18,14 +15,12 @@ The data generating process can be obscure for a variety of reasons. In such sce In both cases one might want to compare different data -- either from different points in time or from different transformation steps -- to each other. +## Why not Great Expectations? -Why not Great Expectations? ---------------------------- - -The major selling point is to be able to conveniently express expectations **between** different ``DataSource`` s. Great Expectations, in contrast, focuses on expectations against a single ``DataSource``. +The major selling point is to be able to conveniently express expectations **between** different ``DataSource``s. Great Expectations, in contrast, focuses on expectations against a single ``DataSource``. 
Moreover, some users have pointed out the following advantages: -- lots of 'query writing' is taken care of by having tailored ``Constraint`` s +- lots of 'query writing' is taken care of by having tailored ``Constraint``s - easier and faster onboarding - assertion messages with counterexamples and other context information, speeding up the data debugging process diff --git a/docs/source/report_failing_query1.png b/docs/report_failing_query1.png similarity index 100% rename from docs/source/report_failing_query1.png rename to docs/report_failing_query1.png diff --git a/docs/source/report_failing_query2.png b/docs/report_failing_query2.png similarity index 100% rename from docs/source/report_failing_query2.png rename to docs/report_failing_query2.png diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index f41c8011..00000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,44 +0,0 @@ -# Configuration file for the Sphinx documentation builder. - -# -- Project information -project = "datajudge" -copyright = "(C) 2022 QuantCo Inc." -author = "QuantCo Inc." - -release = "1.0" -version = "1.0.0" - -extensions = [ - "numpydoc", - "sphinxcontrib.apidoc", - "sphinx.ext.autodoc", -] - - -apidoc_module_dir = "../../src/datajudge" -apidoc_output_dir = "api" -apidoc_separate_modules = True -apidoc_excluded_paths = [ - "../../src/datajudge/db_access.py", - "../../src/datajudge/constraints", - # Requirements should be part of the exposed API documentation. - # Yet, they are already exposed via the top-level module. - "../../src/datajudge/requirements.py", -] -apidoc_extra_args = ["--implicit-namespaces"] - -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] -html_theme = "sphinx_rtd_theme" -html_static_path = ["_static"] - -autodoc_default_options = { - "members": True, - "member-order": "bysource", - "inherited-members": True, - "undoc-members": True, -} -autodoc_typehints = "description" - -# Copied from https://stackoverflow.com/questions/65198998/sphinx-warning-autosummary-stub-file-not-found-for-the-methods-of-the-class-c/ -# Also tested numpydoc_class_members_toctree = False but it does still create a TOC -numpydoc_show_class_members = False diff --git a/docs/source/examples/example.rst b/docs/source/examples/example.rst deleted file mode 100644 index c7663077..00000000 --- a/docs/source/examples/example.rst +++ /dev/null @@ -1,151 +0,0 @@ -Example: Company data -===================== - - -To get started, we will create a sample database using sqlite that contains a list of companies. - -The table "companies_archive" contains three entries: - -.. list-table:: companies_archive - :header-rows: 1 - - * - id - - name - - num_employees - * - 1 - - QuantCo - - 90 - * - 2 - - Google - - 140,000 - * - 3 - - BMW - - 110,000 - -While "companies" contains an additional entry: - -.. list-table:: companies - :header-rows: 1 - - * - id - - name - - num_employees - * - 1 - - QuantCo - - 100 - * - 2 - - Google - - 150,000 - * - 3 - - BMW - - 120,000 - * - 4 - - Apple - - 145,000 - - -.. 
code-block:: python - - import sqlalchemy as sa - - eng = sa.create_engine('sqlite:///example.db') - - with eng.connect() as con: - con.execute("CREATE TABLE companies (id INTEGER PRIMARY KEY, name TEXT, num_employees INTEGER)") - con.execute("INSERT INTO companies (name, num_employees) VALUES ('QuantCo', 100), ('Google', 150000), ('BMW', 120000), ('Apple', 145000)") - con.execute("CREATE TABLE companies_archive (id INTEGER PRIMARY KEY, name TEXT, num_employees INTEGER)") - con.execute("INSERT INTO companies_archive (name, num_employees) VALUES ('QuantCo', 90), ('Google', 140000), ('BMW', 110000)") - - -As an example, we will run 4 tests on this table: - -1. Does the table "companies" contain a column named "name"? -2. Does the table "companies" contain at least 1 entry with the name "QuantCo"? -3. Does the column "num_employees" of the "companies" table have all positive values? -4. Does the column "name" of the table "companies" contain at least all the values of - the corresponding column in "companies_archive"? - -.. code-block:: python - - import pytest - import sqlalchemy as sa - - from datajudge import ( - Condition, - WithinRequirement, - BetweenRequirement, - ) - from datajudge.pytest_integration import collect_data_tests - - - # We create a Requirement, within a table. This object will contain - # all the constraints we want to test on the specified table. - # To test another table or test the same table against another table, - # we would create another Requirement object. - companies_req = WithinRequirement.from_table( - db_name="example", schema_name=None, table_name="companies" - ) - - # Constraint 1: Does the table "companies" contain a column named "name"? - companies_req.add_column_existence_constraint(columns=["name"]) - - # Constraint 2: Does the table "companies" contain at least 1 entry with the name "QuantCo"? - condition = Condition(raw_string="name = 'QuantCo'") - companies_req.add_n_rows_min_constraint(n_rows_min=1, condition=condition) - - # Constraint 3: Does the column "num_employees" of the "companies" table have all - # positive values? - companies_req.add_numeric_min_constraint(column="num_employees", min_value=1) - - # We create a new Requirement, this time between different tables. - # Concretely, we intent to test constraints between the table "companies" - # and the table "companies_archive". - companies_between_req = BetweenRequirement.from_tables( - db_name1="example", - schema_name1=None, - table_name1="companies", - db_name2="example", - schema_name2=None, - table_name2="companies_archive", - ) - - # Constraint 4: Does the column "name" of the table "companies" contain at least all - # the values of the corresponding column in "companies_archive"? - companies_between_req.add_row_superset_constraint( - columns1=['name'], columns2=['name'], constant_max_missing_fraction=0 - ) - - # collect_data_tests expects a pytest fixture with the name - # "datajudge_engine" that is a SQLAlchemy engine - - @pytest.fixture() - def datajudge_engine(): - return sa.create_engine("sqlite:///example.db") - - # We gather our distinct Requirements in a list. - requirements = [companies_req, companies_between_req] - - # "collect_data_tests" takes all requirements and turns their respective - # Constraints into individual tests. pytest will be able to pick - # up these tests. - test_constraint = collect_data_tests(requirements) - - -Saving this file as ``specification.py`` and running ``$ pytest specification.py`` -will verify that all constraints are satisfied. 
The output you see in the terminal -should be similar to this: - -.. code-block:: - - =================================== test session starts =================================== - ... - collected 4 items - - specification.py::test_constraint[ColumnExistence::companies] PASSED [ 25%] - specification.py::test_constraint[NRowsMin::companies] PASSED [ 50%] - specification.py::test_constraint[NumericMin::companies] PASSED [ 75%] - specification.py::test_constraint[RowSuperset::companies|companies_archive] PASSED [100%] - - ==================================== 4 passed in 0.31s ==================================== - -You can also use a formatted html report using the ``--html=report.html`` flag. diff --git a/docs/source/examples/example_dates.rst b/docs/source/examples/example_dates.rst deleted file mode 100644 index 702e5791..00000000 --- a/docs/source/examples/example_dates.rst +++ /dev/null @@ -1,169 +0,0 @@ -Example: Dates -============== - -This example concerns itself with expressing ``Constraint``\s against data revolving -around dates. While date ``Constraint``\s between tables exist, we will only illustrate -``Constraint``\s on a single table and reference values here. As a consequence, we will -only use ``WithinRequirement``, as opposed to ``BetweenRequirement``. - -Concretely, we will assume a table containing prices for a given product of id 1. -Importantly, these prices are valid for a certain date range only. More precisely, -we assume that the price for a product - identified via the ``preduct_id`` column -- is indicated in the ``price`` column, the date from which it is valid - the date -itself included - in ``date_from`` and the the until when it is valid - the date -itself included - in the ``date_to`` column. - -Such a table might look as follows: - -.. list-table:: prices - :header-rows: 1 - - * - product_id - - price - - date_from - - date_to - * - 1 - - 13.99 - - 22/01/01 - - 22/01/10 - * - 1 - - 14.5 - - 22/01/11 - - 22/01/17 - * - 1 - - 13.37 - - 22/01/16 - - 22/01/31 - -Given this table, we would like to ensure - for the sake of illustrational purposes - -that 6 constraints are satisfied: - -1. All values from column ``date_from`` should be in January 2022. -2. All values from column ``date_to`` should be in January 2022. -3. The minimum value in column ``date_from`` should be the first of January 2022. -4. The maximum value in column ``date_to`` should be the 31st of January 2022. -5. There is no gap between ``date_from`` and ``date_to``. In other words, every date - of January has to be assigned to at least one row for a given product. -6. There is no overlap between ``date_from`` and ``date_to``. In other words, every - date of January has to be assigned to at most one row for a given product. - - -Assuming that such a table exists in database, we can write a specification against it. - -.. code-block:: python - - import pytest - import sqlalchemy as sa - - from datajudge import WithinRequirement - from datajudge.pytest_integration import collect_data_tests - - # We create a Requirement, within a table. This object will contain - # all the constraints we want to test on the specified table. - # To test another table or test the same table against another table, - # we would create another Requirement object. - prices_req = WithinRequirement.from_table( - db_name="example", schema_name=None, table_name="prices" - ) - - # Constraint 1: - # All values from column date_from should be in January 2022. 
- prices_req.add_date_between_constraint( - column="date_from", - lower_bound="'20220101'", - upper_bound="'20220131'", - # We don't tolerate any violations of the constraint: - min_fraction=1, - ) - - # Constraint 2: - # All values from column date_to should be in January 2022. - prices_req.add_date_between_constraint( - column="date_to", - lower_bound="'20220101'", - upper_bound="'20220131'", - # We don't tolerate any violations of the constraint: - min_fraction=1, - ) - - # Constraint 3: - # The minimum value in column date_from should be the first of January 2022. - - # Ensure that the minimum is smaller or equal the reference value min_value. - prices_req.add_date_min_constraint(column="date_from", min_value="'20220101'") - # Ensure that the minimum is greater or equal the reference value min_value. - prices_req.add_date_min_constraint( - column="date_from", - min_value="'20220101'", - use_upper_bound_reference=True, - ) - - # Constraint 4: - # The maximum value in column date_to should be the 31st of January 2022. - - # Ensure that the maximum is greater or equal the reference value max_value. - prices_req.add_date_max_constraint(column="date_to", max_value="'20220131'") - # Ensure that the maximum is smaller or equal the reference value max_value. - prices_req.add_date_max_constraint( - column="date_to", - max_value="'20220131'", - use_upper_bound_reference=True, - ) - - # Constraint 5: - # There is no gap between date_from and date_to. In other words, every date - # of January has to be assigned to at least one row for a given product. - prices_req.add_date_no_gap_constraint( - start_column="date_from", - end_column="date_to", - # We don't want a gap of price date ranges for a given product. - # For different products, we allow arbitrary date gaps. - key_columns=["product_id"], - # As indicated in prose, date_from and date_to are included in ranges. - end_included=True, - # Again, we don't expect any violations of our constraint. - max_relative_violations=0, - ) - - # Constraint 6: - # There is no overlap between date_from and date_to. In other words, every - # of January has to be assigned to at most one row for a given product. - princes_req.add_date_no_overlap_constraint( - start_column="date_from", - end_column="date_to", - # We want no overlap of price date ranges for a given product. - # For different products, we allow arbitrary date overlaps. - key_columns=["product_id"], - # As indicated in prose, date_from and date_to are included in ranges. - end_included=True, - # Again, we don't expect any violations of our constraint. - max_relative_violations=0, - ) - - @pytest.fixture() - def datajudge_engine(): - # TODO: Insert actual connection string - return sa.create_engine("your_db://") - - # We gather our single Requirement in a list. - requirements = [prices_req] - - # "collect_data_tests" takes all requirements and turns their respective - # Constraints into individual tests. pytest will be able to pick - # up these tests. - test_constraint = collect_data_tests(requirements) - -Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist -in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``. -As the names suggest, these can operate in 'two date dimensions'. 
- -For example, let's assume a table with four date columns, representing two -ranges in distinct dimensions, respectively: - -* ``date_from``: Date from when a price is valid -* ``date_to``: Date until when a price is valid -* ``date_definition_from``: Date when a price definition was inserted -* ``date_definition_to``: Date until when a price definition was used - -Analogously to the unidimensional scenario illustrated here, one might care -for certain constraints in two dimensions. diff --git a/docs/source/examples/example_twitch.rst b/docs/source/examples/example_twitch.rst deleted file mode 100644 index 2666831c..00000000 --- a/docs/source/examples/example_twitch.rst +++ /dev/null @@ -1,332 +0,0 @@ -Example: Dumps of Twitch data -============================= - -This example is based on data capturing statistics and properties of popular Twitch channels. -The setup is such that we have two data sets 'of the same kind' but from different points in time. - -In other words, a 'version' of the data set represents a temporal notion. -For example, version 1 might stem from end of March and version 2 from end of April. -Moreover, we will assume that the first, version 1, has been vetted and approved with the -help of manual investigation and domain knowledge. The second data set, version 2, has just been -made available. We would like to use it but can't be sure of its validity just yet. As a consequence -we would like to assess the quality of the data in version 2. - -In order to have a database Postgres instance to begin with, it might be useful to use our -`script `_, spinning up -a dockerized Postgres database: - -.. code-block:: console - - $ ./start_postgres.sh - - -The original data set can be found on `kaggle `_. -For the sake of this tutorial, we slightly process it and provide two versions of it. -One can either recreate this by executing this -`processing script `_ -oneself on the original data or download our processed files ( -`version 1 `_ -and -`version 2 `_) -right away. - -Once both version of the data exist, they can be uploaded to the tabase. We provide an -`uploading script `_ -creating and populating one table per version of the data in a Postgres database. It resembles the -following: - -.. code-block:: python - - address = os.environ.get("DB_ADDR", "localhost") - connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge" - engine = sa.create_engine(connection_string) - df_v2.to_sql("twitch_v2", engine, schema="public", if_exists="replace") - df_v1.to_sql("twitch_v1", engine, schema="public", if_exists="replace") - - -Once the tables are stored in a database, we can actually write a ``datajudge`` -specification against them. But first, we'll have a look at what the data roughly -looks like by investigating a random sample of four rows: - -.. 
list-table:: A sample of the data - :header-rows: 1 - - * - channel - - watch_time - - stream_time - - peak_viewers - - average_viewers - - followers - - followers_gained - - views_gained - - partnered - - mature - - language - * - xQcOW - - 6196161750 - - 215250 - - 222720 - - 27716 - - 3246298 - - 1734810 - - 93036735 - - True - - False - - English - * - summit1g - - 6091677300 - - 211845 - - 310998 - - 25610 - - 5310163 - - 1374810 - - 89705964 - - True - - False - - English - * - Gaules - - 5644590915 - - 515280 - - 387315 - - 10976 - - 1767635 - - 1023779 - - 102611607 - - True - - True - - Portuguese - * - ESL_CSGO - - 3970318140 - - 517740 - - 300575 - - 7714 - - 3944850 - - 703986 - - 106546942 - - True - - False - - English - -Note that we expect both version 1 and version 2 to follow this structure. Due to them -being assembled at different points in time, merely their rows shows differ. - - -Now let's write an actual specification, expressing our expectations against the data. -First, we need to make sure a connection to the database can be established at test execution -time. How this is done exactly depends on how you set up your database. When using our -default setup with running, this would look as follows: - -.. code-block:: python - - import os - import pytest - import sqlalchemy as sa - - - @pytest.fixture(scope="module") - def datajudge_engine(): - address = os.environ.get("DB_ADDR", "localhost") - connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge" - return sa.create_engine(connection_string) - -Once a way to connect to the database is defined, we want to declare our data sources and -express expectations against them. In this example, we have two tables in the same database - -one table per version of the Twitch data. - - -Yet, let's start with a straightforward example only using version 2. We want to use our -domain knowledge that constrains the values of the ``language`` column only to contain letters -and have a length strictly larger than 0. - - -.. code-block:: python - - from datajudge import WithinRequirement - - - # Postgres' default database. - db_name = "tempdb" - # Postgres' default schema. - schema_name = "public" - - within_requirement = WithinRequirement.from_table( - table_name="twitch_v2", - schema_name=schema_name, - db_name=db_name, - ) - within_requirement.add_varchar_regex_constraint( - column="language", - regex="^[a-zA-Z]+$", - ) - - -Done! Now onto comparisons between the table representing the approved version 1 of the -data and the to be assessed version 2 of the data. - -.. 
code-block:: python - - from datajudge import BetweenRequirement, Condition - - between_requirement_version = BetweenRequirement.from_tables( - db_name1=db_name, - db_name2=db_name, - schema_name1=schema_name, - schema_name2=schema_name, - table_name1="twitch_v1", - table_name2="twitch_v2", - ) - between_requirement_version.add_column_subset_constraint() - between_requirement_version.add_column_superset_constraint() - columns = ["channel", "partnered", "mature"] - between_requirement_version.add_row_subset_constraint( - columns1=columns, columns2=columns, constant_max_missing_fraction=0 - ) - between_requirement_version.add_row_matching_equality_constraint( - matching_columns1=["channel"], - matching_columns2=["channel"], - comparison_columns1=["language"], - comparison_columns2=["language"], - max_missing_fraction=0, - ) - - between_requirement_version.add_ks_2sample_constraint( - column1="average_viewers", - column2="average_viewers", - significance_level=0.05, - ) - between_requirement_version.add_uniques_equality_constraint( - columns1=["language"], - columns2=["language"], - ) - - -Now having compared the 'same kind of data' between version 1 and version 2, -we may as well compare 'different kind of data' within version 2, as a means of -a sanity check. This sanity check consists of checking whether the mean -``average_viewer`` value of mature channels should deviate at most 10% from -the overall mean. - -.. code-block:: python - - between_requirement_columns = BetweenRequirement.from_tables( - db_name1=db_name, - db_name2=db_name, - schema_name1=schema_name, - schema_name2=schema_name, - table_name1="twitch_v2", - table_name2="twitch_v2", - ) - - between_requirement_columns.add_numeric_mean_constraint( - column1="average_viewers", - column2="average_viewers", - condition1=None, - condition2=Condition(raw_string="mature IS TRUE"), - max_absolute_deviation=0.1, - ) - - -Lastly, we need to collect all of our requirements in a list and make sure -``pytest`` can find them by calling ``collect_data_tests``. - - -.. code-block:: python - - from datajudge.pytest_integration import collect_data_tests - requirements = [ - within_requirement, - between_requirement_version, - between_requirement_columns, - ] - test_func = collect_data_tests(requirements) - -If we then test these expectations against the data by running -``$ pytest specification.py`` -- where ``specification.py`` -contains all of the code outlined before (you can find it -`here `_ ) --- we see that the new version of the data is -not quite on par with what we'd expect: - -.. 
code-block:: console - - $ pytest twitch_specification.py - ================================== test session starts =================================== - platform darwin -- Python 3.10.5, pytest-7.1.2, pluggy-1.0.0 - rootdir: /Users/kevin/Code/datajudge/docs/source/examples - plugins: html-3.1.1, cov-3.0.0, metadata-2.0.2 - collected 8 items - - twitch_specification.py F.....FF [100%] - - ======================================== FAILURES ======================================== - ____________________ test_func[VarCharRegex::tempdb.public.twitch_v2] ____________________ - - constraint = - datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge) - - @pytest.mark.parametrize( - "constraint", all_constraints, ids=Constraint.get_description - ) - def test_constraint(constraint, datajudge_engine): - test_result = constraint.test(datajudge_engine) - > assert test_result.outcome, test_result.failure_message - E AssertionError: tempdb.public.twitch_v2's column(s) 'language' breaks regex - '^[a-zA-Z]+$' in 0.045454545454545456 > 0.0 of the cases. In absolute terms, 1 - of the 22 samples violated the regex. Some counterexamples consist of the - following: ['Sw3d1zh']. - - ../../../src/datajudge/pytest_integration.py:25: AssertionError - ____________ test_func[UniquesEquality::public.twitch_v1 | public.twitch_v2] _____________ - - constraint = - datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge) - - @pytest.mark.parametrize( - "constraint", all_constraints, ids=Constraint.get_description - ) - def test_constraint(constraint, datajudge_engine): - test_result = constraint.test(datajudge_engine) - > assert test_result.outcome, test_result.failure_message - E AssertionError: tempdb.public.twitch_v1's column(s) 'language' doesn't have - the element(s) '{'Sw3d1zh'}' when compared with the reference values. - - ../../../src/datajudge/pytest_integration.py:25: AssertionError - ______________ test_func[NumericMean::public.twitch_v2 | public.twitch_v2] _______________ - - constraint = - datajudge_engine = Engine(postgresql://datajudge:***@localhost:5432/datajudge) - - @pytest.mark.parametrize( - "constraint", all_constraints, ids=Constraint.get_description - ) - def test_constraint(constraint, datajudge_engine): - test_result = constraint.test(datajudge_engine) - > assert test_result.outcome, test_result.failure_message - E AssertionError: tempdb.public.twitch_v2's column(s) 'average_viewers' has - mean 4734.9780000000000000, deviating more than 0.1 from - tempdb.public.twitch_v2's column(s) 'average_viewers''s - 3599.9826086956521739. Condition on second table: WHERE mature IS TRUE - - ../../../src/datajudge/pytest_integration.py:25: AssertionError - ================================ short test summary info ================================= - FAILED twitch_specification.py::test_func[VarCharRegex::tempdb.public.twitch_v2] - Asse... - FAILED twitch_specification.py::test_func[UniquesEquality::public.twitch_v1 | public.twitch_v2] - FAILED twitch_specification.py::test_func[NumericMean::public.twitch_v2 | public.twitch_v2] - ============================== 3 failed, 5 passed in 1.52s =============================== - -Alternatively, you can also look at these test results in -`this html report `_ -generated by -`pytest-html `_. - -Hence we see that we might not want to blindly trust version 2 of the data as is. Rather, we might need -to investigate what is wrong with the data, what this has been caused by and how to fix it. 
- -Concretely, what exactly do we learn from the error messages? - -* The column ``language`` now has a row with value ``'Sw3d1zh'``. This break two of our - constraints. The ``VarCharRegex`` constraint compared the columns' values to a regular - expression. The ``UniquesEquality`` constraint expected the unique values of the - ``language`` column to not have changed between version 1 and version 2. -* The mean value of ``average_viewers`` of ``mature`` channels is substantially - more - than our 10% tolerance - lower than the global mean. diff --git a/docs/source/examples/examples.rst b/docs/source/examples/examples.rst deleted file mode 100644 index 236cbef5..00000000 --- a/docs/source/examples/examples.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. toctree:: - :maxdepth: 2 - - Example: Companies - Example: Twitch - Example: Dates - Example: Exploration diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index d7917ce6..00000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -Welcome to datajudges's documentation! -====================================== - -``datajudge`` allows for assessing whether data from database complies with reference information. - -While meant to be as agnostic to concrete database management systems as possible, ``datajudge`` currently explicitly supports: - -- Postgres -- MSSQL -- Snowflake - -Contents --------- - -.. toctree:: - - installation - Getting Started - testing - motivation - Examples - development - API Reference diff --git a/docs/source/installation.rst b/docs/source/installation.rst deleted file mode 100644 index 7311cf59..00000000 --- a/docs/source/installation.rst +++ /dev/null @@ -1,14 +0,0 @@ -Installation -============ - -To install, execute - -:: - - pip install datajudge - -or from a conda environment - -:: - - conda install datajudge -c conda-forge diff --git a/docs/source/testing.rst b/docs/source/testing.rst deleted file mode 100644 index f6b44cee..00000000 --- a/docs/source/testing.rst +++ /dev/null @@ -1,323 +0,0 @@ -Testing -======= - -While ``datajudge`` allows to express expectations via specifications, ``Requirement`` s -and ``Constraint`` s, the execution of tests is delegated to pytest. As a -consequence, one may use any functionalities that pytest has to offer. Here, -we want to illustrate some of these advanced functionalities that might turn out useful. - -Yet, it should be noted that for most intents and purposes, using ``datajudge`` 's helper -function :func:`~datajudge.pytest_integration.collect_data_tests` is a good starting -point. It should work out of the box and hides some complexity. For exemplary applications -see, the -:doc:`companies example ` or the -:doc:`twitch example `. - -Throughout this article we will not rely on ``collect_data_tests``. Instead we will more -explicitly create a mechanism turning a List of ``Requirement`` objects into something -that can be tested by pytest manually. Importantly, we want every ``Constraint`` of every -``Requirement`` to be tested independently of each other. For instance, we would not like -one failing test to halt all others. - -Many of these approaches rely on adapting pytest's ``conftest.py``. If you are not familiar -with this concept, you might want to read up on it -`in the pytest docs `_. - -Subselection ------------- - -Most often one might want to run all tests defined by a specification. 
- -Yet, for example after believing to have fixed a data problem, one might simply -want to test whether a single test, which had previously been failing, succeeds -at last. - -Another example for when one would like to test a subset of tests is if the data at -hand is not available in its entirety. Rather, it could be that one would like -to run a subset of the test suite against a subsample of the typical dataset. - -In this section, we present two approaches to do a subselection of tests. - -Ex-post: subselecting generated tests -************************************* - -Instead of merely running ``$ pytest specification.py`` one may add pytests's -``-k`` flag and specify the ``Constraint`` (s) one cares about. - -Importantly, every ``Constraint`` object can be identified via a name. If one wants -to figure out how this string is built, please refer to the implementation of -:meth:`~datajudge.constraints.base.Constraint.get_description`. -Otherwise, one could also just run all of the tests once and investigate -the resulting test report to find the relevant names. - -When only caring about the ``UniquesEquality`` constraint in our -:doc:`twitch example `. -one might for instance use the following prefix the filter for it: - -.. code-block:: console - - $ pytest twitch_specification.py -k "UniquesEquality::public.twitch_v1" - -Ex-ante: Defining categories of tests -************************************* - -Another option to subselect a certain set of tests is by use of -`pytest markers `_. -The following is one way of using markers in conjunction with ``datajudge``. - -In this particular illustration we'll allow for two markers: - -* ``basic``: indicating that only truly fundamental tests should be run -* ``all``: indicating that any available test should be run - -For that matter we'll add a bit of pytest magic to the respective ``conftest.py``. - -.. code-block:: python - :caption: ``conftest.py`` - - def pytest_generate_tests(metafunc): - if "basic_constraint" in metafunc.fixturenames: - metafunc.parametrize( - "basic_constraint", - # Find these functions in specification.py. - metafunc.module.get_basic_constraints(), - ids=metafunc.module.idfn, - ) - if "constraint" in metafunc.fixturenames: - metafunc.parametrize( - "constraint", - # Find these functions in specification.py. - metafunc.module.get_all_constraints(), - ids=metafunc.module.idfn, - ) - - -Moreover, we'll have to register these markers in pytest's ``pytest.ini`` file. -You can read more about these files -`here `_. - - -.. code-block:: - :caption: ``pytest.ini`` - - [pytest] - addopts = --strict-markers - markers = basic: basic specification - all: entire specification - -Once that is taken care of, one can adapt one's specification as follows: - -.. code-block:: python - :caption: ``specification.py`` - - def get_basic_requirements() -> List[Requirement]: - # Create relevant Requirement objects and respective Constraints. - # ... - - return requirements - - def get_advanced_requirements() -> List[Requirement]: - # Create relevant Requirement objects and respective Constraints. - # ... - - return requirements - - def get_basic_constraints() -> List[Constraint]: - return [constraint for requirement in get_basic_requirements() for constraint in requirement] - - def get_all_constraints() -> List[Constraint]: - all_requirements = get_basic_requirements() + get_advanced_requirements() - return [constraint for requirement in all_requirements for constraint in requirement] - - # Function used in conftest.py. 
- # Given a constraint, returns an identifier used to refer to it as a test. - def idfn(constraint): - return constraint.get_description() - - @pytest.mark.basic - def test_basic_constraint(basic_constraint: Constraint, datajudge_engine): - test_result = basic_constraint.test(datajudge_engine) - assert test_result.outcome, test_result.failure_message - - @pytest.mark.all - def test_all_constraint(constraint: Constraint, datajudge_engine): - test_result = constraint.test(datajudge_engine) - assert test_result.outcome, test_result.failure_message - -Once these changes are taken care of, one may run - -.. code-block:: console - - $ pytest specification.py -m basic - -to only test the basic ``Requirement`` s or - -.. code-block:: console - - $ pytest specification.py -m all - -to test all ``Requirement`` s. - - -Using parameters in a specification ------------------------------------ - -A given specification might rely on identifiers such as database names or -table names. Moreover it might be that, e.g. when iterating from one version -of the data to another, these names change. - -In other words, it could be that the logic should remain unchanged while pointers -to data might change. Therefore, one might just as well consider -those pointers or identifiers as parameters of the specification. - -For the sake of concreteness, we will assume here that we wish frame two -identifiers as parameters: - -* ``new_db``: the name of the 'new database' -* ``old_db``: the name of the 'old database' - -In light of that we will again adapt pytest's ``conftest.py``: - -.. code-block:: python - :caption: ``conftest.py`` - - def pytest_addoption(parser): - parser.addoption("--new_db", action="store", help="name of the new database") - parser.addoption("--old_db", action="store", help="name of the old database") - - - def pytest_generate_tests(metafunc): - params = { - "db_name_new": metafunc.config.option.new_db, - "db_name_old": metafunc.config.option.old_db, - } - metafunc.parametrize( - "constraint", - metafunc.module.get_constraints(params), - ids=metafunc.module.idfn, - ) - -Now, we can make the creation of our ``Requirement`` s and ``Constraint`` s -dependent on these parameters: - -.. code-block:: python - :caption: ``specification.py`` - - def get_requirements(params): - between_requirement = BetweenRequirement.from_tables( - db_name1=params["old_db"], - db_name2=params["new_db"], - # ... - ) - # ... - return requirements - - def get_constraints(params): - return [ - constraint for requirement in get_requirements(params) for constraint in requirement - ] - - def idfn(constraint): - return constraint.get_description() - - def test_constraint(constraint, datajudge_engine): - test_result = constraint.test(datajudge_engine) - assert test_result.outcome, test_result.failure_message - -Once the specification is defined to be dependent on such parameters, they can -simply be passed via CLI: - -.. code-block:: console - - $ pytest specification.py --new_db=db_v1 --old_db=db_v2 - -Html reports ------------- - -By default, running ``pytest`` tests will output test results to one's respective shell. -Alternatively, one might want to generate an html report summarizing and expanding on -all test results. This can be advantageous for - -* Sharing test results with colleagues -* Archiving and tracking test results over time -* Make underlying sql queries conveniently accessible - -Concretely, such an html report can be generated by -`pytest-html `_. 
Once installed, using it is as simple
-as appending ``--html=myreport.html`` to the pytest call.
-
-In our twitch example, this generates `this html report `_.
-
-
-Retrieving queries
-------------------
-
-Usually we not only care about knowing whether there is a problem with the data
-at hand and what it is. Rather, we would also like to fix it as fast and
-conveniently as possible.
-
-For that matter, ``datajudge`` makes the queries it uses to assert testing predicates
-available via the :class:`datajudge.constraints.base.TestResult`
-class. Hence, if a test is failing, the user can jumpstart the investigation of the
-problem by reusing and potentially adapting the underlying queries.
-
-Instead of simply running ``assert constraint.test(engine).outcome``, one may add
-the ``TestResult`` 's ``logging_message`` to e.g. a ``logger`` or add it to pytest
-``extra``:
-
-.. code-block:: python
-
-    from pytest_html import extras
-
-    def test_constraint(constraint: Constraint, engine, extra):
-        test_result = constraint.test(engine)
-        message = test_result.logging_message
-
-        if not test_result.outcome:
-            # Send to logger.
-            logger.info(message)
-            # Add to html report.
-            extra.append(
-                extras.extra(
-                    content=message,
-                    format_type="text",
-                    name="failing_query",
-                    mime_type="text/plain",
-                    extension="sql",
-                )
-            )
-
-        assert test_result.outcome
-
-
-Such a ``logging_message`` - with ready to execute sql queries - can look as follows:
-
-.. code-block:: sql
-
-    /*
-    Failure message:
-    tempdb.public.twitch_v1's column(s) 'language' doesn't have the
-    element(s) '{'Sw3d1zh'}' when compared with the reference values.
-    */
-
-     --Factual queries:
-     SELECT anon_1.language, count(*) AS count_1
-    FROM (SELECT public.twitch_v1.language AS language
-    FROM public.twitch_v1) AS anon_1 GROUP BY anon_1.language
-
-    -- Target queries:
-     SELECT anon_1.language, count(*) AS count_1
-    FROM (SELECT public.twitch_v2.language AS language
-    FROM public.twitch_v2) AS anon_1 GROUP BY anon_1.language
-
-
-If using a mechanism - as previously outlined - to forward these messages to
-an html report, this can look as follows:
-
-
-.. image:: report_failing_query1.png
-    :width: 800
-
-
-.. image:: report_failing_query2.png
-    :width: 800
diff --git a/docs/testing.md b/docs/testing.md
new file mode 100644
index 00000000..fc17c6bb
--- /dev/null
+++ b/docs/testing.md
@@ -0,0 +1,298 @@
+# Testing
+
+While ``datajudge`` allows one to express expectations via specifications, ``Requirement``s
+and ``Constraint``s, the execution of tests is delegated to pytest. As a
+consequence, one may use any functionality that pytest has to offer. Here,
+we want to illustrate some of these advanced functionalities that might turn out useful.
+
+Yet, it should be noted that for most intents and purposes, using ``datajudge``'s helper
+function ``datajudge.pytest_integration.collect_data_tests`` is a good starting
+point. It works out of the box and hides some complexity. For exemplary applications,
+see the
+[companies example](examples/company-data.md) or the [twitch example](examples/twitch.md).
+
+Throughout this article we will not rely on ``collect_data_tests``. Instead, we will
+explicitly create a mechanism turning a list of ``Requirement`` objects into something
+that can be tested by pytest manually. Importantly, we want every ``Constraint`` of every
+``Requirement`` to be tested independently of each other. For instance, we would not like
+one failing test to halt all others.
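+
+A minimal sketch of such a mechanism - assuming that ``requirements`` is a list of
+``Requirement`` objects assembled in the same module and that a ``datajudge_engine``
+fixture exists, as in the examples - gathers all constraints and parametrizes a
+single test function over them:
+
+```python
+import pytest
+
+from datajudge.constraints.base import Constraint
+
+# Assumption: `requirements` is a list of Requirement objects defined above.
+all_constraints = [
+    constraint for requirement in requirements for constraint in requirement
+]
+
+
+@pytest.mark.parametrize(
+    "constraint", all_constraints, ids=Constraint.get_description
+)
+def test_constraint(constraint, datajudge_engine):
+    test_result = constraint.test(datajudge_engine)
+    assert test_result.outcome, test_result.failure_message
+```
+
+Passing ``ids=Constraint.get_description`` gives every generated test a readable
+name, which the subselection described below relies on.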
+
+Many of these approaches rely on adapting pytest's ``conftest.py``. If you are not familiar
+with this concept, you might want to read up on it
+[in the pytest docs](https://docs.pytest.org/en/latest/writing_plugins.html#conftest-py-plugins).
+
+## Subselection
+
+Most often, one might want to run all tests defined by a specification.
+
+Yet, after presumably fixing a data problem, for example, one might simply
+want to test whether a single test, which had previously been failing, succeeds
+at last.
+
+Another scenario in which one might want to run only a subset of tests is when the data at
+hand is not available in its entirety. Rather, it could be that one would like
+to run a subset of the test suite against a subsample of the typical dataset.
+
+In this section, we present two approaches to subselecting tests.
+
+### Ex-post: subselecting generated tests
+
+Instead of merely running ``$ pytest specification.py``, one may add pytest's
+``-k`` flag and specify the ``Constraint``(s) one cares about.
+
+Importantly, every ``Constraint`` object can be identified via a name. If one wants
+to figure out how this string is built, please refer to the implementation of
+``datajudge.constraints.base.Constraint.get_description``.
+Otherwise, one could also just run all of the tests once and investigate
+the resulting test report to find the relevant names.
+
+When only caring about the ``UniquesEquality`` constraint in our
+[twitch example](examples/twitch.md),
+one might for instance use the following prefix to filter for it:
+
+```bash
+pytest twitch_specification.py -k "UniquesEquality::public.twitch_v1"
+```
+
+### Ex-ante: Defining categories of tests
+
+Another option to subselect a certain set of tests is by use of
+[pytest markers](https://docs.pytest.org/en/latest/example/markers.html).
+The following is one way of using markers in conjunction with ``datajudge``.
+
+In this particular illustration, we'll allow for two markers:
+
+* ``basic``: indicating that only truly fundamental tests should be run
+* ``all``: indicating that any available test should be run
+
+For that matter, we'll add a bit of pytest magic to the respective ``conftest.py``.
+
+```python title="conftest.py"
+def pytest_generate_tests(metafunc):
+    if "basic_constraint" in metafunc.fixturenames:
+        metafunc.parametrize(
+            "basic_constraint",
+            # Find these functions in specification.py.
+            metafunc.module.get_basic_constraints(),
+            ids=metafunc.module.idfn,
+        )
+    if "constraint" in metafunc.fixturenames:
+        metafunc.parametrize(
+            "constraint",
+            # Find these functions in specification.py.
+            metafunc.module.get_all_constraints(),
+            ids=metafunc.module.idfn,
+        )
+```
+
+Moreover, we'll have to register these markers in pytest's ``[tool.pytest.ini_options]`` section of ``pyproject.toml``.
+You can read more about pytest configuration files [here](https://docs.pytest.org/en/latest/customize.html).
+
+```toml title="pyproject.toml"
+[tool.pytest.ini_options]
+addopts = "--strict-markers"
+markers = [
+    "basic: basic specification",
+    "all: entire specification",
+]
+```
+
+Once that is taken care of, one can adapt one's specification as follows:
+
+```python title="specification.py"
+def get_basic_requirements() -> List[Requirement]:
+    # Create relevant Requirement objects and respective Constraints.
+    # ...
+    return requirements
+
+def get_advanced_requirements() -> List[Requirement]:
+    # Create relevant Requirement objects and respective Constraints.
+    # ...
+    return requirements
+
+def get_basic_constraints() -> List[Constraint]:
+    return [constraint for requirement in get_basic_requirements() for constraint in requirement]
+
+def get_all_constraints() -> List[Constraint]:
+    all_requirements = get_basic_requirements() + get_advanced_requirements()
+    return [constraint for requirement in all_requirements for constraint in requirement]
+
+# Function used in conftest.py.
+# Given a constraint, returns an identifier used to refer to it as a test.
+def idfn(constraint):
+    return constraint.get_description()
+
+@pytest.mark.basic
+def test_basic_constraint(basic_constraint: Constraint, datajudge_engine):
+    test_result = basic_constraint.test(datajudge_engine)
+    assert test_result.outcome, test_result.failure_message
+
+@pytest.mark.all
+def test_all_constraint(constraint: Constraint, datajudge_engine):
+    test_result = constraint.test(datajudge_engine)
+    assert test_result.outcome, test_result.failure_message
+```
+
+Once these changes are taken care of, one may run
+
+```bash
+pytest specification.py -m basic
+```
+
+to only test the basic ``Requirement``s or
+
+```bash
+pytest specification.py -m all
+```
+
+to test all ``Requirement``s.
+
+## Using parameters in a specification
+
+A given specification might rely on identifiers such as database names or
+table names. Moreover, it might be that, e.g. when iterating from one version
+of the data to another, these names change.
+
+In other words, it could be that the logic should remain unchanged while pointers
+to data might change. Therefore, one might just as well consider
+those pointers or identifiers as parameters of the specification.
+
+For the sake of concreteness, we will assume here that we wish to frame two
+identifiers as parameters:
+
+* ``new_db``: the name of the 'new database'
+* ``old_db``: the name of the 'old database'
+
+In light of that, we will again adapt pytest's ``conftest.py``:
+
+```python title="conftest.py"
+def pytest_addoption(parser):
+    parser.addoption("--new_db", action="store", help="name of the new database")
+    parser.addoption("--old_db", action="store", help="name of the old database")
+
+
+def pytest_generate_tests(metafunc):
+    # Note: these keys need to match what get_constraints in
+    # specification.py expects.
+    params = {
+        "new_db": metafunc.config.option.new_db,
+        "old_db": metafunc.config.option.old_db,
+    }
+    metafunc.parametrize(
+        "constraint",
+        metafunc.module.get_constraints(params),
+        ids=metafunc.module.idfn,
+    )
+```
+
+Now, we can make the creation of our ``Requirement``s and ``Constraint``s
+dependent on these parameters:
+
+```python title="specification.py"
+def get_requirements(params):
+    between_requirement = BetweenRequirement.from_tables(
+        db_name1=params["old_db"],
+        db_name2=params["new_db"],
+        # ...
+    )
+    # ...
+    return requirements
+
+def get_constraints(params):
+    return [
+        constraint for requirement in get_requirements(params) for constraint in requirement
+    ]
+
+def idfn(constraint):
+    return constraint.get_description()
+
+def test_constraint(constraint, datajudge_engine):
+    test_result = constraint.test(datajudge_engine)
+    assert test_result.outcome, test_result.failure_message
+```
+
+Once the specification is defined to be dependent on such parameters, they can
+simply be passed via the CLI:
+
+```bash
+pytest specification.py --new_db=db_v1 --old_db=db_v2
+```
+
+## HTML reports
+
+By default, running ``pytest`` tests will output test results to one's respective shell.
+Alternatively, one might want to generate an HTML report summarizing and expanding on
+all test results.
This can be advantageous for
+
+* Sharing test results with colleagues
+* Archiving and tracking test results over time
+* Making underlying SQL queries conveniently accessible
+
+Concretely, such an HTML report can be generated by
+[pytest-html](https://github.com/pytest-dev/pytest-html). Once installed, using it is as simple
+as appending ``--html=myreport.html`` to the pytest call.
+
+In our twitch example, this generates [this HTML report](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_report.html).
+
+## Retrieving queries
+
+Usually, we not only care about knowing whether there is a problem with the data
+at hand and what it is. Rather, we would also like to fix it as fast and
+conveniently as possible.
+
+For that matter, ``datajudge`` makes the queries it uses to assert testing predicates
+available via the ``datajudge.constraints.base.TestResult``
+class. Hence, if a test is failing, the user can jumpstart the investigation of the
+problem by reusing and potentially adapting the underlying queries.
+
+Instead of simply running ``assert constraint.test(engine).outcome``, one may add
+the ``TestResult``'s ``logging_message`` to, e.g., a ``logger`` or add it to pytest's
+``extra``:
+
+```python
+from pytest_html import extras
+
+def test_constraint(constraint: Constraint, engine, extra):
+    test_result = constraint.test(engine)
+    message = test_result.logging_message
+
+    if not test_result.outcome:
+        # Send to logger.
+        logger.info(message)
+        # Add to html report.
+        extra.append(
+            extras.extra(
+                content=message,
+                format_type="text",
+                name="failing_query",
+                mime_type="text/plain",
+                extension="sql",
+            )
+        )
+
+    assert test_result.outcome
+```
+
+Such a ``logging_message`` - with ready-to-execute SQL queries - can look as follows:
+
+```sql
+/*
+Failure message:
+tempdb.public.twitch_v1's column(s) 'language' doesn't have the
+element(s) '{'Sw3d1zh'}' when compared with the reference values.
+*/
+
+-- Factual queries:
+SELECT anon_1.language, count(*) AS count_1
+FROM (SELECT public.twitch_v1.language AS language
+FROM public.twitch_v1) AS anon_1 GROUP BY anon_1.language
+
+-- Target queries:
+SELECT anon_1.language, count(*) AS count_1
+FROM (SELECT public.twitch_v2.language AS language
+FROM public.twitch_v2) AS anon_1 GROUP BY anon_1.language
+```
+
+If using a mechanism - as previously outlined - to forward these messages to
+an HTML report, this can look as follows:
+
+![report failing query 1](./report_failing_query1.png)
+![report failing query 2](./report_failing_query2.png)
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..2a129df3
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,68 @@
+site_name: datajudge
+site_description: Assessing whether data from a database complies with reference information.
+site_url: https://quantco.github.io/datajudge +theme: + name: material + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + primary: deep purple + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to system preference + primary: deep purple + features: + - content.action.edit + - search.suggest + - search.highlight + - content.code.annotate + - content.code.copy + icon: + repo: fontawesome/brands/github-alt + edit: material/pencil +repo_name: quantco/datajudge +repo_url: https://github.com/quantco/datajudge +edit_uri: edit/main/docs/ +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: [src] + options: + unwrap_annotated: true + show_symbol_type_heading: true + docstring_style: numpy + docstring_section_style: spacy + separate_signature: true + merge_init_into_class: true + +nav: + - installation.md + - getting-started.md + - testing.md + - motivation.md + - Examples: + - examples/company-data.md + - examples/twitch.md + - examples/dates.md + - examples/exploration.md + - development.md + - api-documentation.md +markdown_extensions: + - admonition + - pymdownx.highlight + - pymdownx.superfences + - pymdownx.inlinehilite diff --git a/pixi.lock b/pixi.lock index 4a020157..90ece7f7 100644 --- a/pixi.lock +++ b/pixi.lock @@ -2005,7 +2005,7 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/babel-2.14.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h30efb56_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda @@ -2013,17 +2013,19 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.16.0-py312hf06ca03_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-unix_pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/docutils-0.20.1-py312h7900ff3_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-3.9.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-core-3.9.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/greenlet-3.0.3-py312h30efb56_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/griffe-0.47.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.1.0-pyhd8ed1ab_0.tar.bz2 - conda: 
https://conda.anaconda.org/conda-forge/noarch/hpack-4.0.0-pyh9f0ad1d_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.0.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.4-pyhd8ed1ab_0.conda @@ -2044,41 +2046,43 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-h4ab18f5_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/make-4.3-hd18ef5c_1.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-2.1.5-py312h98912ed_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mergedeep-1.3.4-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-1.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-autorefs-1.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-get-deps-0.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-9.5.28-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-extensions-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-0.25.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-python-1.10.5-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.0.0-py312h22e1c76_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4ab18f5_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-24.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pbr-6.0.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/paginate-0.5.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pymdown-extensions-10.8.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-8.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.4-h194c7f8_0_cpython.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.12-4_cp312.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.1-py312h98912ed_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyyaml-env-tag-0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.5.15-py312h9a8786e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/scipy-1.14.0-py312hc2bc53b_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-7.3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-autodoc-typehints-2.2.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx_rtd_theme-2.0.0-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-apidoc-0.3.0-py_1.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jquery-4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/sqlalchemy-2.0.31-py312h9a8786e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-w-1.0.0-pyhd8ed1ab_0.tar.bz2 @@ -2086,13 +2090,15 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.12.2-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.2-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/watchdog-4.0.1-py312h7900ff3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.22.0-py312h5b18bf6_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda osx-arm64: - - conda: https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/babel-2.14.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.1.0-py312h9f69965_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-h93a5062_5.conda @@ -2100,17 +2106,19 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cffi-1.16.0-py312h8e38eb3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-unix_pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/docutils-0.20.1-py312h81bd7bf_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-3.9.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-core-3.9.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/greenlet-3.0.3-py312h20a0b95_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/griffe-0.47.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.1.0-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.0.0-pyh9f0ad1d_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.0.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.4-pyhd8ed1ab_0.conda @@ -2126,41 +2134,43 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libsqlite-3.46.0-hfb93653_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-hfb2fe0b_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/llvm-openmp-18.1.8-hde57baf_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/make-4.3-he57ea6c_1.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/markupsafe-2.1.5-py312he37b823_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mergedeep-1.3.4-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-1.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-autorefs-1.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-get-deps-0.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-9.5.28-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-extensions-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-0.25.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-python-1.10.5-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/ncurses-6.5-hb89a1cb_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-2.0.0-py312hb544834_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.3.1-hfb2fe0b_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-24.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pbr-6.0.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/paginate-0.5.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pymdown-extensions-10.8.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-8.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.4-h30c5eda_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python_abi-3.12-4_cp312.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.1-py312h02f2b3b_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyyaml-env-tag-0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h92ec313_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/regex-2024.5.15-py312h7e5086c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/scipy-1.14.0-py312h14ffa8f_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-7.3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-autodoc-typehints-2.2.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx_rtd_theme-2.0.0-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-apidoc-0.3.0-py_1.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jquery-4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/sqlalchemy-2.0.31-py312h7e5086c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h5083fa2_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-w-1.0.0-pyhd8ed1ab_0.tar.bz2 @@ -2168,13 +2178,15 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.12.2-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.2-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/watchdog-4.0.1-py312h7e5086c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/xz-5.2.6-h57fd34a_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h3422bc3_2.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstandard-0.22.0-py312h721a963_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.6-hb46c0d2_0.conda win-64: - - conda: https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/babel-2.14.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/brotli-python-1.1.0-py312h53d5487_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-hcfcfb64_5.conda @@ -2182,17 +2194,19 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cffi-1.16.0-py312he70551f_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-win_pyh7428d3b_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/docutils-0.20.1-py312h2e8e312_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.0-pyhd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-3.9.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/flit-core-3.9.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/greenlet-3.0.3-py312h53d5487_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/griffe-0.47.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.1.0-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.0.0-pyh9f0ad1d_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.0.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.0.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2024.2.0-h57928b3_978.conda @@ -2207,47 +2221,43 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.46.0-h2466b09_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libxml2-2.12.7-h283a6d9_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/win-64/make-4.3-h3d2af85_1.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/markupsafe-2.1.5-py312he70551f_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mergedeep-1.3.4-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-1.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-autorefs-1.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-get-deps-0.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-9.5.28-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-extensions-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-0.25.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-python-1.10.5-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/mkl-2024.1.0-h66d3029_692.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/numpy-2.0.0-py312h49bc9c5_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/openssl-3.3.1-h2466b09_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-24.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pbr-6.0.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/paginate-0.5.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/pthreads-win32-2.9.1-hfa6e2cd_3.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pymdown-extensions-10.8.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh0701188_6.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-8.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.12.4-h889d299_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/python_abi-3.12-4_cp312.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/pyyaml-6.0.1-py312he70551f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyyaml-env-tag-0.1-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/regex-2024.5.15-py312h4389bb4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/scipy-1.14.0-py312h1f4e10d_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-70.1.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-7.3.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-autodoc-typehints-2.2.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx_rtd_theme-2.0.0-pyha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-apidoc-0.3.0-py_1.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jquery-4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/sqlalchemy-2.0.31-py312h4389bb4_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/tbb-2021.12.0-hc790b64_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h5226925_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2 @@ -2260,9 +2270,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h8a93ad2_20.conda - conda: 
https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.40.33810-ha82c5b3_20.conda - conda: https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.40.33810-h3bf8584_20.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/watchdog-4.0.1-py312h2e8e312_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/win_inet_pton-1.1.0-pyhd8ed1ab_6.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/win-64/xz-5.2.6-h8d14728_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/win-64/yaml-0.2.5-h8ffe710_2.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstandard-0.22.0-py312h7606c53_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.6-h0ea2cb4_0.conda @@ -9304,21 +9316,6 @@ packages: license_family: APACHE size: 12730 timestamp: 1667935912504 -- kind: conda - name: alabaster - version: 0.7.16 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda - sha256: fd39ad2fabec1569bbb0dfdae34ab6ce7de6ec09dcec8638f83dad0373594069 - md5: def531a3ac77b7fb8c21d17bb5d0badb - depends: - - python >=3.9 - license: BSD-3-Clause - license_family: BSD - size: 18365 - timestamp: 1704848898483 - kind: conda name: asn1crypto version: 1.5.1 @@ -9334,6 +9331,21 @@ packages: license_family: MIT size: 81077 timestamp: 1647369241204 +- kind: conda + name: astunparse + version: 1.6.3 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_0.tar.bz2 + sha256: e5173d1ed038038e24c0623f0219dc587ee8663cf7efa737e7075128edbc6c60 + md5: 000b6f68a0bfaba800ced7500c11780f + depends: + - python >=3.6 + - six >=1.6.1,<2.0 + license: BSD-3-Clause AND PSF-2.0 + size: 15539 + timestamp: 1610696401707 - kind: conda name: async-timeout version: 4.0.3 @@ -10241,6 +10253,39 @@ packages: license_family: MIT size: 46597 timestamp: 1698833765762 +- kind: conda + name: click + version: 8.1.7 + build: unix_pyh707e725_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-unix_pyh707e725_0.conda + sha256: f0016cbab6ac4138a429e28dbcb904a90305b34b3fe41a9b89d697c90401caec + md5: f3ad426304898027fc619827ff428eca + depends: + - __unix + - python >=3.8 + license: BSD-3-Clause + license_family: BSD + size: 84437 + timestamp: 1692311973840 +- kind: conda + name: click + version: 8.1.7 + build: win_pyh7428d3b_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-win_pyh7428d3b_0.conda + sha256: 90236b113b9a20041736e80b80ee965167f9aac0468315c55e2bad902d673fb0 + md5: 3549ecbceb6cd77b91a105511b7d0786 + depends: + - __win + - colorama + - python >=3.8 + license: BSD-3-Clause + license_family: BSD + size: 85051 + timestamp: 1692312207348 - kind: conda name: colorama version: 0.4.6 @@ -11079,6 +11124,21 @@ packages: license_family: GPL size: 2728420 timestamp: 1712512328692 +- kind: conda + name: ghp-import + version: 2.1.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2 + sha256: 097d9b4c946b195800bc68f68393370049238509b08ef828c06fbf481bbc139c + md5: 6d8d61116031a3f5b1f32e7899785866 + depends: + - python >=3.6 + - python-dateutil >=2.8.1 + license: 
LicenseRef-Tumbolia-Public + size: 15504 + timestamp: 1651585848291 - kind: conda name: gnutls version: 3.7.9 @@ -11611,6 +11671,23 @@ packages: license_family: MIT size: 199149 timestamp: 1703202186890 +- kind: conda + name: griffe + version: 0.47.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/griffe-0.47.0-pyhd8ed1ab_0.conda + sha256: 870cd50f0fbc9b8b7f9866b82f9ebdf395fa768d5c824ae748d7ebc89830ca3b + md5: 7ccc670475bc540c67a9281f2122efee + depends: + - astunparse >=1.6 + - colorama >=0.4 + - python >=3.8 + license: MIT + license_family: MIT + size: 93598 + timestamp: 1718730712927 - kind: conda name: grpcio version: 1.62.2 @@ -12286,21 +12363,6 @@ packages: license_family: BSD size: 52718 timestamp: 1713279497047 -- kind: conda - name: imagesize - version: 1.4.1 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2 - sha256: c2bfd7043e0c4c12d8b5593de666c1e81d67b83c474a0a79282cc5c4ef845460 - md5: 7de5386c8fea29e76b303f37dde4c352 - depends: - - python >=3.4 - license: MIT - license_family: MIT - size: 10164 - timestamp: 1656939625410 - kind: conda name: importlib-metadata version: 8.0.0 @@ -14051,48 +14113,21 @@ packages: size: 31928 timestamp: 1608166099896 - kind: conda - name: make - version: '4.3' - build: h3d2af85_1 - build_number: 1 - subdir: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/make-4.3-h3d2af85_1.tar.bz2 - sha256: f31b00c710df71f2f75c641272ecb1f9bd1e15a5a77510055120641215487fbb - md5: c3be283d3d278c379b50137a2a17f869 - depends: - - m2w64-gcc-libs - license: GPL-3.0-or-later - license_family: GPL - size: 6245358 - timestamp: 1602706995515 -- kind: conda - name: make - version: '4.3' - build: hd18ef5c_1 - build_number: 1 - subdir: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/make-4.3-hd18ef5c_1.tar.bz2 - sha256: 4a5fe7c80bb0de0015328e2d3fc8db1736f528cb1fd53cd0d5527e24269a4f7c - md5: 4049ebfd3190b580dffe76daed26155a + name: markdown + version: '3.6' + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/markdown-3.6-pyhd8ed1ab_0.conda + sha256: fce1fde00359696983989699c00f9891194c4ebafea647a8d21b7e2e3329b56e + md5: 06e9bebf748a0dea03ecbe1f0e27e909 depends: - - libgcc-ng >=7.5.0 - license: GPL-3.0-or-later - license_family: GPL - size: 518896 - timestamp: 1602706451788 -- kind: conda - name: make - version: '4.3' - build: he57ea6c_1 - build_number: 1 - subdir: osx-arm64 - url: https://conda.anaconda.org/conda-forge/osx-arm64/make-4.3-he57ea6c_1.tar.bz2 - sha256: a011e3e1c4caec821eb4213d0a0154d39e5f81a44d2e8bafe6f84e7840c3909e - md5: 1939d04ef89e38fde652ee8c669e092f - license: GPL-3.0-or-later - license_family: GPL - size: 253227 - timestamp: 1602706492919 + - importlib-metadata >=4.4 + - python >=3.6 + license: BSD-3-Clause + license_family: BSD + size: 78331 + timestamp: 1710435316163 - kind: conda name: markdown-it-py version: 3.0.0 @@ -14544,6 +14579,175 @@ packages: license: Apache-2.0 AND BSD-3-Clause size: 700003 timestamp: 1719607261614 +- kind: conda + name: mergedeep + version: 1.3.4 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mergedeep-1.3.4-pyhd8ed1ab_0.tar.bz2 + sha256: 41ad8c16876820981adfc6e17a62935c950214bd9a9bb092e6aaefdc89a33f0b + md5: 1a160a3cab5cb6bd46264b52cd6f69a2 + depends: + - python >=3.6 + license: MIT + license_family: MIT + size: 9598 + timestamp: 
1612711404414 +- kind: conda + name: mkdocs + version: 1.6.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocs-1.6.0-pyhd8ed1ab_0.conda + sha256: d15180924550ece7ef30c1868af23b4a3a02d650533c72977a27ba4fca3cc3a3 + md5: e938f734bcc0cfdccf85e5f7ed573c8e + depends: + - click >=7.0 + - colorama >=0.4 + - ghp-import >=1.0 + - importlib-metadata >=4.4 + - jinja2 >=2.11.1 + - markdown >=3.3.6 + - markupsafe >=2.0.1 + - mergedeep >=1.3.4 + - mkdocs-get-deps >=0.2.0 + - packaging >=20.5 + - pathspec >=0.11.1 + - python >=3.8 + - pyyaml >=5.1 + - pyyaml-env-tag >=0.1 + - watchdog >=2.0 + constrains: + - babel >=2.9.0 + license: BSD-2-Clause + license_family: BSD + size: 3523311 + timestamp: 1714253186566 +- kind: conda + name: mkdocs-autorefs + version: 1.0.1 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocs-autorefs-1.0.1-pyhd8ed1ab_0.conda + sha256: 9b989ec9968e37e677bf3f9cf79e1ce8e5f3d9aa4fc892332c745f5cbc5a44c7 + md5: 285f3149fa14c81786994e402db5443e + depends: + - markdown >=3.3 + - markupsafe >=2.0.1 + - mkdocs >=1.1 + - pymdown-extensions + - python >=3.8,<4.0 + license: ISC + size: 21614 + timestamp: 1709500020733 +- kind: conda + name: mkdocs-get-deps + version: 0.2.0 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocs-get-deps-0.2.0-pyhd8ed1ab_0.conda + sha256: aa6207994b15a15b5f82a442804c279bf78f6c4680f0903fb015294c41e34b30 + md5: 0365c9a6e4e41732bde159112b0aef4d + depends: + - importlib-metadata >=4.3 + - mergedeep >=1.3.4 + - platformdirs >=2.2.0 + - python >=3.8 + - pyyaml >=5.1 + license: MIT + license_family: MIT + size: 14733 + timestamp: 1713710951974 +- kind: conda + name: mkdocs-material + version: 9.5.28 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-9.5.28-pyhd8ed1ab_0.conda + sha256: fffe0522db94f7c080b500932ce6bac340840f2ecca8255e68b8b8f6af06ecdd + md5: 2fc60925d61201bfa8f41f7028ed1dbf + depends: + - babel ~=2.10 + - colorama ~=0.4 + - jinja2 ~=3.0 + - markdown ~=3.2 + - mkdocs ~=1.6 + - mkdocs-material-extensions ~=1.3 + - paginate ~=0.5 + - pygments ~=2.16 + - pymdown-extensions ~=10.2 + - python >=3.8 + - pyyaml + - regex >=2022.4 + - requests ~=2.26 + license: MIT + license_family: MIT + size: 5032136 + timestamp: 1719937850151 +- kind: conda + name: mkdocs-material-extensions + version: 1.3.1 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocs-material-extensions-1.3.1-pyhd8ed1ab_0.conda + sha256: e01a349f4816ba7513f8b230ca2c4f703a7ccc7f7d78535076f9215ca766ec78 + md5: 6e7e399b351756b9d181c64a362bdcb5 + depends: + - python >=3.8 + constrains: + - mkdocs-material >=5.0.0 + license: MIT + license_family: MIT + size: 16011 + timestamp: 1700695213251 +- kind: conda + name: mkdocstrings + version: 0.25.1 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-0.25.1-pyhd8ed1ab_0.conda + sha256: f8265323a2300de8a960cd8160a6ffb603b7969ea0551da63e4cc426fabbdfe6 + md5: 4578b973243ecddbd5d7126d0ad3cc05 + depends: + - click >=7.0 + - importlib-metadata >=4.6 + - jinja2 >=2.11.1 + - markdown >=3.3 + - markupsafe >=1.1 + - mkdocs >=1.4 + - mkdocs-autorefs >=0.3.1 + - platformdirs >=2.2.0 + - pymdown-extensions >=6.3 + - python >=3.8,<4.0 + - typing-extensions >=4.1 + license: MIT + 
license_family: MIT + size: 30261 + timestamp: 1714935440032 +- kind: conda + name: mkdocstrings-python + version: 1.10.5 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/mkdocstrings-python-1.10.5-pyhd8ed1ab_0.conda + sha256: d75702551d877b4acd9b1364552a55267e2791949062ab809b9a80d3e1ca1f6e + md5: 2a851f98b01a639df196bceaf5d1301e + depends: + - griffe >=0.47 + - mkdocstrings >=0.25 + - python >=3.8 + license: MIT + license_family: MIT + size: 46684 + timestamp: 1718826931223 - kind: conda name: mkl version: 2024.1.0 @@ -15191,25 +15395,6 @@ packages: license_family: BSD size: 7755925 timestamp: 1718615443567 -- kind: conda - name: numpydoc - version: 1.7.0 - build: pyhd8ed1ab_1 - build_number: 1 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.7.0-pyhd8ed1ab_1.conda - sha256: 5adeb26861eb2aa8a9c86d945f0817c0c33544d96d209fe6578423959c5988af - md5: 66798cbfdcb003d9fbccd92cd08eb3ac - depends: - - python >=3.8 - - sphinx >=6 - - tabulate >=0.8.10 - - tomli >=1.1.0 - license: BSD-3-Clause - license_family: BSD - size: 57592 - timestamp: 1717502988256 - kind: conda name: openssl version: 3.3.1 @@ -15297,6 +15482,21 @@ packages: license_family: APACHE size: 50290 timestamp: 1718189540074 +- kind: conda + name: paginate + version: 0.5.6 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/paginate-0.5.6-pyhd8ed1ab_0.conda + sha256: 8d9d18c2892d49c33fab3e215cdbc55a2ba30a28c1f52e5e5d61cb435803726b + md5: 5d454974a1b5c6f4d468f91812331d53 + depends: + - python >=3.4 + license: MIT + license_family: MIT + size: 18537 + timestamp: 1693246970487 - kind: conda name: pandas-stubs version: 2.2.2.240603 @@ -15315,21 +15515,20 @@ packages: size: 97949 timestamp: 1717510726829 - kind: conda - name: pbr - version: 6.0.0 + name: pathspec + version: 0.12.1 build: pyhd8ed1ab_0 subdir: noarch noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/pbr-6.0.0-pyhd8ed1ab_0.conda - sha256: 4c83853fc6349de163c2871613e064e5fdab91723db9b50bcda681adc05e4b87 - md5: 8dbab5ba746ed14aa32cb232dc437f8f + url: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_0.conda + sha256: 4e534e66bfe8b1e035d2169d0e5b185450546b17e36764272863e22e0370be4d + md5: 17064acba08d3686f1135b5ec1b32b12 depends: - - pip - - python >=3.6 - license: Apache-2.0 - license_family: Apache - size: 73106 - timestamp: 1699384879677 + - python >=3.7 + license: MPL-2.0 + license_family: MOZILLA + size: 41173 + timestamp: 1702250135032 - kind: conda name: perl version: 5.32.1.1 @@ -16069,6 +16268,23 @@ packages: license_family: MIT size: 24906 timestamp: 1706895211122 +- kind: conda + name: pymdown-extensions + version: 10.8.1 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/pymdown-extensions-10.8.1-pyhd8ed1ab_0.conda + sha256: 72aaeb14c9a0af5a515786dc4b8951a0b75da8ae22a048a86022919f33d46b42 + md5: 027d741bee97d67b689a39df3ef812fb + depends: + - markdown >=3.6 + - python >=3.7 + - pyyaml + license: MIT + license_family: MIT + size: 158717 + timestamp: 1714261991332 - kind: conda name: pyodbc version: 5.1.0 @@ -17187,6 +17403,22 @@ packages: license_family: MIT size: 167932 timestamp: 1695374097139 +- kind: conda + name: pyyaml-env-tag + version: '0.1' + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/pyyaml-env-tag-0.1-pyhd8ed1ab_0.tar.bz2 + sha256: 
900319483135730d9836855a807822f0500b1a239520749103e9ef9b7ba9f246 + md5: 626ed9060ddeb681ddc42bcad89156ab + depends: + - python >=3.6 + - pyyaml + license: MIT + license_family: MIT + size: 7473 + timestamp: 1624389117412 - kind: conda name: re2 version: 2023.09.01 @@ -17263,6 +17495,57 @@ packages: license_family: GPL size: 250351 timestamp: 1679532511311 +- kind: conda + name: regex + version: 2024.5.15 + build: py312h4389bb4_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/regex-2024.5.15-py312h4389bb4_0.conda + sha256: 956b88e8e5913b0b8a9c9c9712ae5614e462e903fdb082ab91961d6786f2478b + md5: c2f6f40dbf193a77b979e7fc814b458c + depends: + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - ucrt >=10.0.20348.0 + - vc >=14.2,<15 + - vc14_runtime >=14.29.30139 + license: Python-2.0 + license_family: PSF + size: 358496 + timestamp: 1715829078813 +- kind: conda + name: regex + version: 2024.5.15 + build: py312h7e5086c_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/regex-2024.5.15-py312h7e5086c_0.conda + sha256: 7cf8fe1c9c70c0fb9c162dba3a9043319710311bff7fd8ab4c1510337ba8fae0 + md5: 32fbee5a1711a9ad21c157f1b9ee6ea3 + depends: + - __osx >=11.0 + - python >=3.12,<3.13.0a0 + - python >=3.12,<3.13.0a0 *_cpython + - python_abi 3.12.* *_cp312 + license: Python-2.0 + license_family: PSF + size: 360656 + timestamp: 1715828723075 +- kind: conda + name: regex + version: 2024.5.15 + build: py312h9a8786e_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.5.15-py312h9a8786e_0.conda + sha256: 4050b3f70bd3ef81ae175acab0dfc2019fde84ab71b6b12903b3eb9bbd35661e + md5: d3c8a64188a7331e3df3be6b06d5309e + depends: + - libgcc-ng >=12 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + license: Python-2.0 + license_family: PSF + size: 398199 + timestamp: 1715828558963 - kind: conda name: requests version: 2.32.3 @@ -17883,21 +18166,6 @@ packages: license_family: MIT size: 14259 timestamp: 1620240338595 -- kind: conda - name: snowballstemmer - version: 2.2.0 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2 - sha256: a0fd916633252d99efb6223b1050202841fa8d2d53dacca564b0ed77249d3228 - md5: 4d22a9315e78c6827f806065957d566e - depends: - - python >=2 - license: BSD-3-Clause - license_family: BSD - size: 58824 - timestamp: 1637143137377 - kind: conda name: snowflake-connector-python version: 3.11.0 @@ -18154,200 +18422,6 @@ packages: license_family: APACHE size: 26314 timestamp: 1621217159824 -- kind: conda - name: sphinx - version: 7.3.7 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinx-7.3.7-pyhd8ed1ab_0.conda - sha256: 41101e2b0b8722087f06bd73251ba95ef89db515982b6a89aeebfa98ebcb65a1 - md5: 7b1465205e28d75d2c0e1a868ee00a67 - depends: - - alabaster >=0.7.14,<0.8.dev0 - - babel >=2.9 - - colorama >=0.4.5 - - docutils >=0.18.1,<0.22 - - imagesize >=1.3 - - importlib-metadata >=4.8 - - jinja2 >=3.0 - - packaging >=21.0 - - pygments >=2.14 - - python >=3.9 - - requests >=2.25.0 - - snowballstemmer >=2.0 - - sphinxcontrib-applehelp - - sphinxcontrib-devhelp - - sphinxcontrib-htmlhelp >=2.0.0 - - sphinxcontrib-jsmath - - sphinxcontrib-qthelp - - sphinxcontrib-serializinghtml >=1.1.9 - - tomli >=2.0 - license: BSD-2-Clause - license_family: BSD - size: 1345378 - timestamp: 1713555005540 -- kind: conda - name: sphinx-autodoc-typehints - version: 2.2.2 - build: 
pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinx-autodoc-typehints-2.2.2-pyhd8ed1ab_0.conda - sha256: 55bbfb986afa52113d1d2d4e6622670ff879a7b6d2d70f7bb758be22756829e1 - md5: 0a5918f92de9e8d885b74f535c5050a9 - depends: - - python >=3.9 - - sphinx >=7.3.5 - license: MIT - license_family: MIT - size: 23631 - timestamp: 1719256047409 -- kind: conda - name: sphinx_rtd_theme - version: 2.0.0 - build: pyha770c72_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinx_rtd_theme-2.0.0-pyha770c72_0.conda - sha256: 8545c806d03092fd0236db6663c88036eab2dc99e34c91cd36c0704db03b148a - md5: baf6d9a33df1a789ca55e3b404c7ea28 - depends: - - docutils <0.21 - - python >=3.6 - - sphinx >=5,<8 - - sphinxcontrib-jquery >=4,<5 - license: MIT - license_family: MIT - size: 2614217 - timestamp: 1701183633165 -- kind: conda - name: sphinxcontrib-apidoc - version: 0.3.0 - build: py_1 - build_number: 1 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-apidoc-0.3.0-py_1.tar.bz2 - sha256: 6dd136a86576c400b0bdbfffbdba4a35015846a0a7eb1129a1401a17d4f60b19 - md5: 855b087883443abb10f5faf6eef40860 - depends: - - pbr - - python - license: BSD-2-Clause - license_family: BSD - size: 10555 - timestamp: 1553967001880 -- kind: conda - name: sphinxcontrib-applehelp - version: 1.0.8 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-1.0.8-pyhd8ed1ab_0.conda - sha256: 710013443a063518d587d2af82299e92ab6d6695edf35a676ac3a0ccc9e3f8e6 - md5: 611a35a27914fac3aa37611a6fe40bb5 - depends: - - python >=3.9 - - sphinx >=5 - license: BSD-2-Clause - license_family: BSD - size: 29539 - timestamp: 1705126465971 -- kind: conda - name: sphinxcontrib-devhelp - version: 1.0.6 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-1.0.6-pyhd8ed1ab_0.conda - sha256: 63a6b60653ef13a6712848f4b3c4b713d4b564da1dae571893f1a3659cde85f3 - md5: d7e4954df0d3aea2eacc7835ad12671d - depends: - - python >=3.9 - - sphinx >=5 - license: BSD-2-Clause - license_family: BSD - size: 24474 - timestamp: 1705126153592 -- kind: conda - name: sphinxcontrib-htmlhelp - version: 2.0.5 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.5-pyhd8ed1ab_0.conda - sha256: 512f393cfe34cb3de96ade7a7ad900d6278e2087a1f0e5732aa60fadee396d99 - md5: 7e1e7437273682ada2ed5e9e9714b140 - depends: - - python >=3.9 - - sphinx >=5 - license: BSD-2-Clause - license_family: BSD - size: 33499 - timestamp: 1705118297318 -- kind: conda - name: sphinxcontrib-jquery - version: '4.1' - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jquery-4.1-pyhd8ed1ab_0.conda - sha256: 2e5f16a2d58f9a31443ffbb8ce3852cfccf533a6349045828cd2e994ef0679ca - md5: 914897066d5873acfb13e75705276ad1 - depends: - - python >=2.7 - - sphinx >=1.8 - license: 0BSD AND MIT - size: 112985 - timestamp: 1678809100921 -- kind: conda - name: sphinxcontrib-jsmath - version: 1.0.1 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_0.conda - sha256: d4337d83b8edba688547766fc80f1ac86d6ec86ceeeda93f376acc04079c5ce2 - md5: da1d979339e2714c30a8e806a33ec087 - depends: - - python >=3.5 - license: 
BSD-2-Clause - license_family: BSD - size: 10431 - timestamp: 1691604844204 -- kind: conda - name: sphinxcontrib-qthelp - version: 1.0.7 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-1.0.7-pyhd8ed1ab_0.conda - sha256: dd35b52f056c39081cd0ae01155174277af579b69e5d83798a33e9056ec78d63 - md5: 26acae54b06f178681bfb551760f5dd1 - depends: - - python >=3.9 - - sphinx >=5 - license: BSD-2-Clause - license_family: BSD - size: 27005 - timestamp: 1705126340442 -- kind: conda - name: sphinxcontrib-serializinghtml - version: 1.1.10 - build: pyhd8ed1ab_0 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_0.conda - sha256: bf80e4c0ff97d5e8e5f6db0831ba60007e820a3a438e8f1afd868aa516d67d6f - md5: e507335cb4ca9cff4c3d0fa9cdab255e - depends: - - python >=3.9 - - sphinx >=5 - license: BSD-2-Clause - license_family: BSD - size: 28776 - timestamp: 1705118378942 - kind: conda name: sqlalchemy version: 1.4.49 @@ -18839,22 +18913,6 @@ packages: license_family: MIT size: 33181 timestamp: 1713451629196 -- kind: conda - name: tabulate - version: 0.9.0 - build: pyhd8ed1ab_1 - build_number: 1 - subdir: noarch - noarch: python - url: https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_1.tar.bz2 - sha256: f6e4a0dd24ba060a4af69ca79d32361a6678e61d78c73eb5e357909b025b4620 - md5: 4759805cce2d914c38472f70bf4d8bcb - depends: - - python >=3.7 - license: MIT - license_family: MIT - size: 35912 - timestamp: 1665138565317 - kind: conda name: taplo version: 0.9.1 @@ -19566,6 +19624,56 @@ packages: license_family: BSD size: 17395 timestamp: 1717709043353 +- kind: conda + name: watchdog + version: 4.0.1 + build: py312h2e8e312_0 + subdir: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/watchdog-4.0.1-py312h2e8e312_0.conda + sha256: 35c657fd70de86e69dd8fcb04697df660da79410b4098a263acab55d363117ef + md5: 29cbd97528b7f7ce91a59186e391c0db + depends: + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - pyyaml >=3.10 + license: Apache-2.0 + license_family: APACHE + size: 162034 + timestamp: 1716562347718 +- kind: conda + name: watchdog + version: 4.0.1 + build: py312h7900ff3_0 + subdir: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/watchdog-4.0.1-py312h7900ff3_0.conda + sha256: c4786da0c938a65cea07e2bb3fe76dbeed6968c322994c66395176307cf78425 + md5: 7cc94a3b5e9698eecc2c39dbf7a173db + depends: + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - pyyaml >=3.10 + license: Apache-2.0 + license_family: APACHE + size: 136444 + timestamp: 1716561872155 +- kind: conda + name: watchdog + version: 4.0.1 + build: py312h7e5086c_0 + subdir: osx-arm64 + url: https://conda.anaconda.org/conda-forge/osx-arm64/watchdog-4.0.1-py312h7e5086c_0.conda + sha256: f018376037c19c38e5b55602d09653106cc4fada7db3f02f871cee7be6befcd6 + md5: ce33cf6a4a69aa2beb93c8f7258bfe55 + depends: + - __osx >=11.0 + - python >=3.12,<3.13.0a0 + - python >=3.12,<3.13.0a0 *_cpython + - python_abi 3.12.* *_cp312 + - pyyaml >=3.10 + license: Apache-2.0 + license_family: APACHE + size: 145420 + timestamp: 1716562106758 - kind: conda name: wheel version: 0.43.0 diff --git a/pixi.toml b/pixi.toml index ea046919..dbb59464 100644 --- a/pixi.toml +++ b/pixi.toml @@ -14,15 +14,12 @@ colorama = "*" pytest = "*" [feature.docs.dependencies] -make = "*" -numpydoc = "*" -sphinx = "*" -sphinx_rtd_theme = "*" -sphinxcontrib-apidoc = "*" -sphinx-autodoc-typehints = "*" 
+mkdocs-material = "*" +mkdocstrings = "*" +mkdocstrings-python = "*" [feature.docs.tasks] -docs = "cd docs && make html " -readthedocs = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/build/html $READTHEDOCS_OUTPUT/html" +docs = "mkdocs serve" +docs-build = "mkdocs build --strict" [feature.py38.dependencies] python = "3.8.*" From 644c5b64beb2c7998b7bc7d6c87787ef5bd4e893 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Sun, 14 Jul 2024 01:32:19 +0200 Subject: [PATCH 2/3] WIP --- docs/development.md | 6 +-- docs/examples/company-data.md | 26 +++++------ docs/examples/dates.md | 51 ++++++++++---------- docs/examples/exploration.md | 69 ++++++++++++++------------- docs/examples/twitch.md | 30 ++++++------ docs/getting-started.md | 70 ++++++++++++++-------------- docs/index.md | 4 +- docs/motivation.md | 10 ++-- docs/testing.md | 66 +++++++++++++------------- pyproject.toml | 15 ++++++ src/datajudge/__init__.py | 3 +- src/datajudge/constraints/column.py | 3 +- src/datajudge/constraints/stats.py | 7 +-- src/datajudge/constraints/uniques.py | 3 +- src/datajudge/db_access.py | 11 ++--- src/datajudge/requirements.py | 54 +++++++-------------- src/datajudge/utils.py | 12 ++--- 17 files changed, 211 insertions(+), 229 deletions(-) diff --git a/docs/development.md b/docs/development.md index 595cb5cf..fa72c0aa 100644 --- a/docs/development.md +++ b/docs/development.md @@ -1,7 +1,7 @@ # Development -``datajudge`` development relies on [pixi](https://pixi.sh/latest/). -In order to work on ``datajudge``, you can create a development environment as follows: +`datajudge` development relies on [pixi](https://pixi.sh/latest/). +In order to work on `datajudge`, you can create a development environment as follows: ```bash git clone https://github.com/Quantco/datajudge @@ -24,7 +24,7 @@ To run integration tests against Postgres, first start a docker container with a ./start_postgres.sh ``` -In your current environment, install the ``psycopg2`` package. +In your current environment, install the `psycopg2` package. After this, you may execute integration tests as follows: ```bash diff --git a/docs/examples/company-data.md b/docs/examples/company-data.md index 1090840d..8c275d11 100644 --- a/docs/examples/company-data.md +++ b/docs/examples/company-data.md @@ -6,22 +6,22 @@ The table "companies_archive" contains three entries: **companies_archive** -| id | name | num_employees | -|----|---------|---------------| -| 1 | QuantCo | 90 | -| 2 | Google | 140,000 | -| 3 | BMW | 110,000 | +| id | name | num_employees | +| --- | ------- | ------------- | +| 1 | QuantCo | 90 | +| 2 | Google | 140,000 | +| 3 | BMW | 110,000 | While "companies" contains an additional entry: **companies** -| id | name | num_employees | -|----|---------|---------------| -| 1 | QuantCo | 100 | -| 2 | Google | 150,000 | -| 3 | BMW | 120,000 | -| 4 | Apple | 145,000 | +| id | name | num_employees | +| --- | ------- | ------------- | +| 1 | QuantCo | 100 | +| 2 | Google | 150,000 | +| 3 | BMW | 120,000 | +| 4 | Apple | 145,000 | ```python import sqlalchemy as sa @@ -108,7 +108,7 @@ requirements = [companies_req, companies_between_req] test_constraint = collect_data_tests(requirements) ``` -Saving this file as ``specification.py`` and running ``$ pytest specification.py`` +Saving this file as `specification.py` and running `$ pytest specification.py` will verify that all constraints are satisfied. 
The output you see in the terminal should be similar to this:
@@ -125,4 +125,4 @@ specification.py::test_constraint[RowSuperset::companies|companies_archive] PASS
==================================== 4 passed in 0.31s ====================================
```

-You can also use a formatted html report using the ``--html=report.html`` flag.
+You can also use a formatted HTML report using the `--html=report.html` flag.
diff --git a/docs/examples/dates.md b/docs/examples/dates.md
index 897c18ad..29fc1215 100644
--- a/docs/examples/dates.md
+++ b/docs/examples/dates.md
@@ -1,40 +1,39 @@
# Dates

-This example concerns itself with expressing ``Constraint``\s against data revolving
-around dates. While date ``Constraint``\s between tables exist, we will only illustrate
-``Constraint``\s on a single table and reference values here. As a consequence, we will
-only use ``WithinRequirement``, as opposed to ``BetweenRequirement``.
+This example concerns itself with expressing `Constraint`\s against data revolving
+around dates. While date `Constraint`\s between tables exist, we will only illustrate
+`Constraint`\s on a single table and reference values here. As a consequence, we will
+only use `WithinRequirement`, as opposed to `BetweenRequirement`.

Concretely, we will assume a table containing prices for a given product of id 1.
Importantly, these prices are valid for a certain date range only. More precisely,
-we assume that the price for a product - identified via the ``preduct_id`` column -
-is indicated in the ``price`` column, the date from which it is valid - the date
-itself included - in ``date_from`` and the the until when it is valid - the date
-itself included - in the ``date_to`` column.
+we assume that the price for a product - identified via the `product_id` column -
+is indicated in the `price` column, the date from which it is valid - the date
+itself included - in `date_from` and the date until when it is valid - the date
+itself included - in the `date_to` column.

Such a table might look as follows:

**prices**

-| product_id | price | date_from | date_to |
-|------------|-------|-----------|---------|
-| 1 | 13.99 | 22/01/01 | 22/01/10|
-| 1 | 14.5 | 22/01/11 | 22/01/17|
-| 1 | 13.37 | 22/01/16 | 22/01/31|
+| product_id | price | date_from | date_to |
+| ---------- | ----- | --------- | -------- |
+| 1 | 13.99 | 22/01/01 | 22/01/10 |
+| 1 | 14.5 | 22/01/11 | 22/01/17 |
+| 1 | 13.37 | 22/01/16 | 22/01/31 |

Given this table, we would like to ensure - for the sake of illustrational purposes -
that 6 constraints are satisfied:

-1. All values from column ``date_from`` should be in January 2022.
-2. All values from column ``date_to`` should be in January 2022.
-3. The minimum value in column ``date_from`` should be the first of January 2022.
-4. The maximum value in column ``date_to`` should be the 31st of January 2022.
-5. There is no gap between ``date_from`` and ``date_to``. In other words, every date
+1. All values from column `date_from` should be in January 2022.
+2. All values from column `date_to` should be in January 2022.
+3. The minimum value in column `date_from` should be the first of January 2022.
+4. The maximum value in column `date_to` should be the 31st of January 2022.
+5. There is no gap between `date_from` and `date_to`. In other words, every date
 of January has to be assigned to at least one row for a given product.
-6. There is no overlap between ``date_from`` and ``date_to``. In other words, every
+6. There is no overlap between `date_from` and `date_to`. In other words, every
date of January has to be assigned to at most one row for a given product.
-
Assuming that such a table exists in the database, we can write a specification against it.

```python
@@ -140,17 +139,17 @@ requirements = [prices_req]

test_constraint = collect_data_tests(requirements)
```

-Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist
-in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``.
+Please note that the `DateNoOverlap` and `DateNoGap` constraints also exist
+in a slightly different form: `DateNoOverlap2d` and `DateNoGap2d`.
As the names suggest, these can operate in 'two date dimensions'.

For example, let's assume a table with four date columns,
representing two ranges in distinct dimensions, respectively:

-* ``date_from``: Date from when a price is valid
-* ``date_to``: Date until when a price is valid
-* ``date_definition_from``: Date when a price definition was inserted
-* ``date_definition_to``: Date until when a price definition was used
+- `date_from`: Date from when a price is valid
+- `date_to`: Date until when a price is valid
+- `date_definition_from`: Date when a price definition was inserted
+- `date_definition_to`: Date until when a price definition was used

Analogously to the unidimensional scenario illustrated here,
one might care for certain constraints in two dimensions.
diff --git a/docs/examples/exploration.md b/docs/examples/exploration.md
index 33375a1a..c8cff67c 100644
--- a/docs/examples/exploration.md
+++ b/docs/examples/exploration.md
@@ -19,16 +19,16 @@ usually doesn't.
In the following we will attempt to illustrate possible usages of datajudge
for exploration by looking at three simple examples.

-These examples rely on some insight about how most datajudge ``Constraint`` s work under
-the hood. Importantly, ``Constraint`` s typically come with
+These examples rely on some insight about how most datajudge `Constraint` s work under
+the hood. Importantly, `Constraint` s typically come with

-* a ``retrieve`` method: this method fetches relevant data from database, given a
-  ``DataReference``
-* a ``get_factual_value`` method: this is typically a wrapper around ``retrieve`` for the
-  first ``DataReference`` of the given ``Requirement`` / ``Constraint``
-* a ``get_target_value`` method: this is either a wrapper around ``retrieve`` for the
-  second ``DataReference`` in the case of a ``BetweenRequirement`` or an echoing of the
-  ``Constraint`` s key reference value in the case of a ``WithinRequirement``
+- a `retrieve` method: this method fetches relevant data from the database, given a
+  `DataReference`
+- a `get_factual_value` method: this is typically a wrapper around `retrieve` for the
+  first `DataReference` of the given `Requirement` / `Constraint`
+- a `get_target_value` method: this is either a wrapper around `retrieve` for the
+  second `DataReference` in the case of a `BetweenRequirement` or an echoing of the
+  `Constraint`'s key reference value in the case of a `WithinRequirement`

Moreover, as is the case when using datajudge for testing purposes, these approaches
rely on a [sqlalchemy engine](https://docs.sqlalchemy.org/en/14/core/connections.html). The
latter is the gateway to the database at hand.

## Example 1: Comparing numbers of rows

-Assume we have two tables in the same database called ``table1`` and ``table2``. Now we
-would like to compare their numbers of rows. Naturally, we would like to retrieve
-the respective numbers of rows before we can compare them. For this purpose we create
-a ``BetweenTableRequirement`` referring to both tables and add a ``NRowsEquality``
-``Constraint`` onto it.
+Assume we have two tables in the same database called `table1` and `table2`. Now we
+would like to compare their numbers of rows. Naturally, we would like to retrieve
+the respective numbers of rows before we can compare them. For this purpose we create
+a `BetweenTableRequirement` referring to both tables and add a `NRowsEquality`
+`Constraint` onto it.

```python
import sqlalchemy as sa
from datajudge import BetweenRequirement

engine = sa.create_engine(your_connection_string)

req = BetweenRequirement.from_tables(
    db_name1,
    schema_name1,
    "table1",
    db_name2,
    schema_name2,
    "table2",
)
req.add_n_rows_equality_constraint()
n_rows1 = req[0].get_factual_value(engine)
n_rows2 = req[0].get_target_value(engine)
```

-Note that here, we access the first (and only) ``Constraint`` that has been added to the
-``BetweenRequirement`` by writing ``req[0]``. ``Requirements`` are are sequences of
-``Constraint`` s, after all.
+Note that here, we access the first (and only) `Constraint` that has been added to the
+`BetweenRequirement` by writing `req[0]`. `Requirements` are sequences of
+`Constraint` s, after all.

Once the numbers of rows are retrieved, we can compare them as we wish. For instance,
we could compute the absolute and relative growth (or loss) of numbers of rows from
-``table1`` to ``table2``:
+`table1` to `table2`:

```python
absolute_change = abs(n_rows2 - n_rows1)
relative_change = (absolute_change) / n_rows1 if n_rows1 != 0 else None
```

-Importantly, many datajudge staples, such as ``Condition`` s can be used, too. We shall see
+Importantly, many datajudge staples, such as `Condition` s, can be used, too. We shall see
 this in our next example.

## Example 2: Investigating unique values

-In this example we will suppose that there is a table called ``table`` consisting of
-several columns. Two of its columns are supposed to be called ``col_int`` and
-``col_varchar``. We are now interested in the unique values in these two columns combined.
+In this example we will suppose that there is a table called `table` consisting of
+several columns. Two of its columns are supposed to be called `col_int` and
+`col_varchar`. We are now interested in the unique values in these two columns combined.
Put differently, we are wondering:

-> Which unique pairs of values in ``col_int`` and ``col_varchar`` have we encountered?
+> Which unique pairs of values in `col_int` and `col_varchar` have we encountered?

-To add to the mix, we will moreover only be interested in tuples in which ``col_int`` has a
+To add to the mix, we will moreover only be interested in tuples in which `col_int` has a
 value larger than 10.

-As before, we will start off by creating a ``Requirement``. Since we are only dealing with
-a single table this time, we will create a ``WithinRequirement``.
+As before, we will start off by creating a `Requirement`. Since we are only dealing with
+a single table this time, we will create a `WithinRequirement`.

```python
import sqlalchemy as sa
from datajudge import Condition, WithinRequirement

engine = sa.create_engine(your_connection_string)

req = WithinRequirement.from_table(
    db_name,
    schema_name,
    "table",
)

condition = Condition(raw_string="col_int > 10")

req.add_uniques_equality_constraint(
    columns=["col_int", "col_varchar"],
@@ -113,20 +113,20 @@ req.add_uniques_equality_constraint(

uniques = req[0].get_factual_value(engine)
```

-If one was to investigate this ``uniques`` variable further, one could, e.g. see the
+If one was to investigate this `uniques` variable further, one could, e.g., see the
 following:

```python
([(10, 'hi10'), (11, 'hi11'), (12, 'hi12'), (13, 'hi13'), (14, 'hi14'), (15, 'hi15'), (16, 'hi16'), (17, 'hi17'), (18, 'hi18'), (19, 'hi19')], [1, 100, 12, 1, 7, 8, 1, 1, 1337, 1])
```

-This becomes easier to parse when inspecting the underlying ``retrieve`` method of the
-``UniquesEquality`` ``Constraint``: the first value of the tuple corresponds to the list
-of unique pairs in columns ``col_int`` and ``col_varchar``. The second value of the tuple
+This becomes easier to parse when inspecting the underlying `retrieve` method of the
+`UniquesEquality` `Constraint`: the first value of the tuple corresponds to the list
+of unique pairs in columns `col_int` and `col_varchar`. The second value of the tuple
 are the respective counts thereof.

Moreover, one could manually customize the underlying SQL query. In order to do so, one
-can use the fact that ``retrieve`` methods typically return an actual result or value
+can use the fact that `retrieve` methods typically return an actual result or value
 as well as the sqlalchemy selections that led to said result or value. We can use these
 selections and compile them to a standard, textual SQL query:

@@ -161,13 +161,13 @@ table. Moreover, for columns present in both tables, we'd like to learn about th
 respective types.

In order to illustrate such an example, we will again assume that there are two tables
-called ``table1`` and ``table2``, irrespective of prior examples.
+called `table1` and `table2`, irrespective of prior examples.

-We can now create a ``BetweenRequirement`` for these two tables and use the
-``ColumnSubset`` ``Constraint``. As before, we will rely on the ``get_factual_value``
-method to retrieve the values of interest for the first table passed to the
-``BetweenRequirement`` and the ``get_target_value`` method for the second table passed
-to the ``BetweenRequirement``.
+We can now create a `BetweenRequirement` for these two tables and use the
+`ColumnSubset` `Constraint`. As before, we will rely on the `get_factual_value`
+method to retrieve the values of interest for the first table passed to the
+`BetweenRequirement` and the `get_target_value` method for the second table passed
+to the `BetweenRequirement`.

```python
import sqlalchemy as sa
@@ -194,7 +194,6 @@ print(f"Columns present in only table1: {set(columns1) - set(columns2)}")
 print(f"Columns present in only table2: {set(columns2) - set(columns1)}")
 ```
-
This could, for instance, result in the following printout:

```
diff --git a/docs/examples/twitch.md b/docs/examples/twitch.md
index 2e7a6534..545c8542 100644
--- a/docs/examples/twitch.md
+++ b/docs/examples/twitch.md
@@ -41,18 +41,18 @@ df_v2.to_sql("twitch_v2", engine, schema="public", if_exists="replace")
 df_v1.to_sql("twitch_v1", engine, schema="public", if_exists="replace")
 ```

-Once the tables are stored in a database, we can actually write a ``datajudge``
+Once the tables are stored in a database, we can actually write a `datajudge`
 specification against them. But first, we'll have a look at what the data roughly
looks like by investigating a random sample of four rows:

**A sample of the data**

| channel | watch_time | stream_time | peak_viewers | average_viewers | followers | followers_gained | views_gained | partnered | mature | language |
| -------- | ---------- | ----------- | ------------ | --------------- | --------- | ---------------- | ------------ | --------- | ------ | ---------- |
| xQcOW | 6196161750 | 215250 | 222720 | 27716 | 3246298 | 1734810 | 93036735 | True | False | English |
| summit1g | 6091677300 | 211845 | 310998 | 25610 | 5310163 | 1374810 | 89705964 | True | False | English |
| Gaules | 5644590915 | 515280 | 387315 | 10976 | 1767635 | 1023779 | 102611607 | True | True | Portuguese |
| ESL_CSGO | 3970318140 | 517740 | 300575 | 7714 | 3944850 | 703986 | 106546942 | True | False | English |

Note that we expect both version 1 and version 2 to follow this structure. Due to
them being assembled at different points in time, merely their rows differ.

@@ -80,7 +80,7 @@ express expectations against them. In this example, we have two tables in the sa
one table per version of the Twitch data.

Yet, let's start with a straightforward example only using version 2. We want to use our
-domain knowledge that constrains the values of the ``language`` column only to contain letters
+domain knowledge that constrains the values of the `language` column to contain only letters
 and have a length strictly larger than 0.

```python
@@ -145,7 +145,7 @@ between_requirement_version.add_uniques_equality_constraint(
Now having compared the 'same kind of data' between version 1 and version 2, we may
as well compare 'different kind of data' within version 2, as a means of a sanity
check. This sanity check consists of checking whether the mean
-``average_viewer`` value of mature channels should deviate at most 10% from
+`average_viewers` value of mature channels deviates at most 10% from
 the overall mean.

```python
between_requirement_columns = BetweenRequirement.from_tables(
@@ -168,7 +168,7 @@ between_requirement_columns.add_numeric_mean_constraint(
 ```

Lastly, we need to collect all of our requirements in a list and make sure
-``pytest`` can find them by calling ``collect_data_tests``.
+`pytest` can find them by calling `collect_data_tests`.

```python
from datajudge.pytest_integration import collect_data_tests
@@ -268,9 +268,9 @@ to investigate what is wrong with the data, what this has been caused by and how
Concretely, what exactly do we learn from the error messages?

-* The column ``language`` now has a row with value ``'Sw3d1zh'``. This break two of our
- constraints. The ``VarCharRegex`` constraint compared the columns' values to a regular
- expression.
The ``UniquesEquality`` constraint expected the unique values of the
-  ``language`` column to not have changed between version 1 and version 2.
-* The mean value of ``average_viewers`` of ``mature`` channels is substantially - more
+- The column `language` now has a row with value `'Sw3d1zh'`. This breaks two of our
+  constraints. The `VarCharRegex` constraint compared the columns' values to a regular
+  expression. The `UniquesEquality` constraint expected the unique values of the
+  `language` column to not have changed between version 1 and version 2.
+- The mean value of `average_viewers` of `mature` channels is substantially - more
   than our 10% tolerance - lower than the global mean.
 
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 81ebd5db..b65cde43 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -2,16 +2,16 @@
 
 ## Glossary
 
-- A ``DataSource`` represents a way to retrieve data from database. Typically, this corresponds to a table in the database. Yet, it could also be a more elaborate object. See the section on 'Alternative ``DataSource`` s' for more detail.
+- A `DataSource` represents a way to retrieve data from a database. Typically, this corresponds to a table in the database. Yet, it could also be a more elaborate object. See the section on 'Alternative `DataSource` s' for more detail.
 
-- A ``Constraint`` captures a concrete expectation between either two ``DataSource`` s or a single ``DataSource`` and a reference value.
+- A `Constraint` captures a concrete expectation between either two `DataSource` s or a single `DataSource` and a reference value.
 
-- A ``Requirement`` captures all ``Constraint`` s between two given ``DataSource`` s or all ``Constraint`` s within a single ``DataSource``. If a ``Requirement`` refers links to two ``DataSource`` s, it is a ``BetweenRequirement``. If a ``Requirement`` merely refers to a single ``DataSource``, it is a ``WithinRequirement``.
+- A `Requirement` captures all `Constraint` s between two given `DataSource` s or all `Constraint` s within a single `DataSource`. If a `Requirement` links two `DataSource` s, it is a `BetweenRequirement`. If a `Requirement` merely refers to a single `DataSource`, it is a `WithinRequirement`.
 
-- Conceptually, a 'specification' captures all ``Requirement`` s against a database. In practice that means it is usually a separate python file which:
+- Conceptually, a 'specification' captures all `Requirement` s against a database. In practice, that means it is usually a separate python file which:
 
-  - gathers all relevant ``Requirement`` s
-  - turns these ``Requirement`` s' ``Constraint`` s into individual tests
+  - gathers all relevant `Requirement` s
+  - turns these `Requirement` s' `Constraint` s into individual tests
   - can be 'tested' by pytest
 
 ## Creating a specification
@@ -43,25 +43,25 @@ In case you haven't worked with sqlalchemy engines before, you might need to ins
 
 ## Specifying Constraints
 
-In order to discover possible ``Constraint`` s, please investigate the ``_add_*_constraint`` methods
+In order to discover possible `Constraint` s, please investigate the `_add_*_constraint` methods
 for [`BetweenRequirement`](datajudge.requirements.BetweenRequirement) and [`WithinRequirement`](datajudge.requirements.WithinRequirement) respectively. TODO: FIX LINKS
 
 These methods are meant to be mostly self-documenting through the usage of expressive parameters.
 
-Note that most ``Constraint`` s will allow for at least one ``Condition``. 
A ``Condition``
+Note that most `Constraint` s will allow for at least one `Condition`. A `Condition`
 can be thought of as a conditional event in probability theory or a filter/clause in a database
-query. Please consult the doc string of ``Condition`` for greater detail. For examples, please
-see ``tests/unit/test_condition.py``.
+query. Please consult the doc string of `Condition` for greater detail. For examples, please
+see `tests/unit/test_condition.py`.
 
-Many ``Constraint`` s have optional ``columns`` parameters. If no argument is given, all
+Many `Constraint` s have optional `columns` parameters. If no argument is given, all
 available columns will be used.
 
 ## Defining limitations of change
 
-``BetweenRequirement`` s allow for ``Constraint`` s expressing the limitation of a loss or gain. For example, the ``NRowsMinGain`` ``Constraint``
-expresses by how much the number of rows must at least grow from the first ``DataSource`` to the second. In the example of ``NRowsMinGain`` ,
-this growth limitation is expressed relative to the number of rows of the first ``DataSource``.
+`BetweenRequirement` s allow for `Constraint` s expressing the limitation of a loss or gain. For example, the `NRowsMinGain` `Constraint`
+expresses by how much the number of rows must at least grow from the first `DataSource` to the second. In the example of `NRowsMinGain`,
+this growth limitation is expressed relative to the number of rows of the first `DataSource`.
 
 Generally, such relative limitations can be defined in two ways:
 
@@ -82,20 +82,20 @@ date_growth := (max_date_table_2 - min_date_table_2) / (max_date_table_1 - min_d
 #rows_table_2 > (1 + date_growth) * #rows_table_1
 ```
 
-In the latter case a date column must be passed during the instantiation of the ``BetweenRequirement``. Moreover, the ``date_range_*`` must be passed
-in the respective ``add_*_constraint`` method. When using date ranges as an indicator of change, the ``constant_max_*`` argument can safely be ignored. Additionally,
-an additional buffer to the date growth can be added with help of the ``date_range_gain_deviation`` parameter:
+In the latter case, a date column must be passed during the instantiation of the `BetweenRequirement`. Moreover, the `date_range_*` must be passed
+in the respective `add_*_constraint` method. When using date ranges as an indicator of change, the `constant_max_*` argument can safely be ignored. Additionally,
+a buffer to the date growth can be added with the help of the `date_range_gain_deviation` parameter:
 
 ```
 date_growth := (max_date_table_2 - min_date_table_2) / (max_date_table_1 - min_date_table_1)
 
 #rows_table_2 > (1 + date_growth + date_range_gain_deviation) * #rows_table_1
 ```
 
-This example revolving around ``NRowsMinGain`` generalizes to many ``Constraint`` s concerned with growth, gain, loss or shrinkage limitations.
+This example revolving around `NRowsMinGain` generalizes to many `Constraint` s concerned with growth, gain, loss or shrinkage limitations.
 
 ## Testing a specification
 
-In order to test whether the ``Constraint`` s expressed in a specification hold true, you can simply run
+In order to test whether the `Constraint` s expressed in a specification hold true, you can simply run
 
 ```bash
 pytest your_specification.py
@@ -112,8 +112,8 @@ As the testing relies on [pytest](https://docs.pytest.org/en/latest), all of `py
 
 ## Test information
 
-When calling a ``Constraint``'s ``test`` method, a ``TestResult`` is returned. The latter comes with a
-``logging_message`` field. 
This field comprises information about the test failure, the constraint at hand
+When calling a `Constraint`'s `test` method, a `TestResult` is returned. The latter comes with a
+`logging_message` field. This field comprises information about the test failure, the constraint at hand,
 as well as the underlying database queries.
 
 Depending on the use case at hand, it might make sense to rely on this information
 for logging or data investigation
@@ -129,17 +129,17 @@ The following table lists all the supported codes, along with their descriptions
 
 ### Supported styling codes
 
-| Code | Description | Example |
-|------|-------------|---------|
+| Code       | Description                                                      | Example                      |
+| ---------- | ---------------------------------------------------------------- | ---------------------------- |
 | `numMatch` | Indicates the part of a number that matches the expected value. | `[numMatch]3.141[/numMatch]` |
-| `numDiff` | Indicates the part of a number that differs. | `[numDiff]6[/numDiff]` |
+| `numDiff`  | Indicates the part of a number that differs.                     | `[numDiff]6[/numDiff]`       |
 
 ## Alternative DataSources
 
-A ``Requirement`` is instantiated with either one or two fixed ``DataSource``s.
+A `Requirement` is instantiated with either one or two fixed `DataSource`s.
 
-While the most typical example of a ``DataSource`` would be a table in a database, ``datajudge`` allows
-for other ``DataSource`` s as well. These are often derived from primitive tables of a database.
+While the most typical example of a `DataSource` would be a table in a database, `datajudge` allows
+for other `DataSource` s as well. These are often derived from primitive tables of a database.
 
 TODO: FIX TABLE
 
@@ -166,21 +166,21 @@ TODO: FIX TABLE
 
 -->
 
-Typically, a user does not need to instantiate a corresponding ``DataSource`` themselves. Rather, this is taken care
-of by using the appropriate constructor for ``WithinRequirement`` or ``BetweenRequirement``.
+Typically, a user does not need to instantiate a corresponding `DataSource` themselves. Rather, this is taken care
+of by using the appropriate constructor for `WithinRequirement` or `BetweenRequirement`.
 
-Note that in principle, several tables can be combined to make up for a single ``DataSource``. Yet, most of
-the time when trying to compare two tables, it is more convenient to create a ``BetweenRequirement`` and use
-the ``from_tables`` constructor.
+Note that in principle, several tables can be combined to make up a single `DataSource`. Yet, most of
+the time when trying to compare two tables, it is more convenient to create a `BetweenRequirement` and use
+the `from_tables` constructor.
 
 ## Column capitalization
 
 Different database management systems handle the capitalization of entities, such as column names, differently.
 For the time being:
 
-- Mssql: ``datajudge`` expects column name capitalization as is seen in database, either lowercase or uppercase.
-- Postgres: ``datajudge`` expects lowercase column names.
-- Snowflake: ``datajudge`` will lowercase independently of the capitalization provided.
+- Mssql: `datajudge` expects column name capitalization as seen in the database, either lowercase or uppercase.
+- Postgres: `datajudge` expects lowercase column names.
+- Snowflake: `datajudge` will lowercase independently of the capitalization provided.
 
 The Snowflake behavior is due to an upstream
 [bug](https://github.com/snowflakedb/snowflake-sqlalchemy/issues/157)
 in snowflake-sqlalchemy. 
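To tie the getting-started pieces above together, the following is a minimal sketch of what a complete specification file could look like. All names (database, schema, table, column) and the connection string are hypothetical placeholders, and the fixture name `datajudge_engine` reflects what `collect_data_tests` looks up by convention; treat this as an illustration to adapt, not a prescribed recipe.

```python
# specification.py -- a minimal sketch; all names and the connection
# string are hypothetical placeholders.
import pytest
import sqlalchemy as sa

from datajudge import WithinRequirement
from datajudge.pytest_integration import collect_data_tests


@pytest.fixture()
def datajudge_engine():
    # Engine fixture picked up by the generated tests.
    return sa.create_engine("postgresql://user:password@localhost:5432/mydb")


companies_req = WithinRequirement.from_table(
    db_name="mydb", schema_name="public", table_name="companies"
)
# Expect the 'name' column to be free of NULL values.
companies_req.add_null_absence_constraint(column="name")

# Expose every Constraint of every Requirement as an individual pytest test.
test_constraint = collect_data_tests([companies_req])
```

Running `pytest specification.py` would then execute one test per `Constraint`.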
diff --git a/docs/index.md b/docs/index.md
index cd1e913b..de826ae6 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,8 +1,8 @@
 # datajudge
 
-``datajudge`` allows for assessing whether data from database complies with reference information.
+`datajudge` allows for assessing whether data from a database complies with reference information.
 
-While meant to be as agnostic to concrete database management systems as possible, ``datajudge`` currently explicitly supports:
+While meant to be as agnostic to concrete database management systems as possible, `datajudge` currently explicitly supports:
 
 - Postgres
 - MSSQL
diff --git a/docs/motivation.md b/docs/motivation.md
index fe6695b4..addee008 100644
--- a/docs/motivation.md
+++ b/docs/motivation.md
@@ -1,10 +1,10 @@
 # Motivation
 
-Ensuring data quality is of great importance for many use cases. ``datajudge`` seeks to make this convenient.
+Ensuring data quality is of great importance for many use cases. `datajudge` seeks to make this convenient.
 
-``datajudge`` allows for the expression of expectations held against data stored in databases. In particular, it allows for comparing different ``DataSource``s. Yet, it also comes with functionalities to compare data from a single ``DataSource`` to fixed reference values derived from explicit domain knowledge.
+`datajudge` allows for the expression of expectations held against data stored in databases. In particular, it allows for comparing different `DataSource`s. Yet, it also comes with functionalities to compare data from a single `DataSource` to fixed reference values derived from explicit domain knowledge.
 
-Not trying to reinvent the wheel, ``datajudge`` relies on ``pytest`` to execute the data expectations.
+Not trying to reinvent the wheel, `datajudge` relies on `pytest` to execute the data expectations.
 
 ## Comparisons between DataSources
 
@@ -17,10 +17,10 @@ In both cases one might want to compare different data -- either from different
 
 ## Why not Great Expectations?
 
-The major selling point is to be able to conveniently express expectations **between** different ``DataSource``s. Great Expectations, in contrast, focuses on expectations against a single ``DataSource``.
+The major selling point is to be able to conveniently express expectations **between** different `DataSource`s. Great Expectations, in contrast, focuses on expectations against a single `DataSource`.
 
 Moreover, some users have pointed out the following advantages:
 
-- lots of 'query writing' is taken care of by having tailored ``Constraint``s
+- lots of 'query writing' is taken care of by having tailored `Constraint`s
 - easier and faster onboarding
 - assertion messages with counterexamples and other context information, speeding up the data debugging process
 
diff --git a/docs/testing.md b/docs/testing.md
index fc17c6bb..49eb8998 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -1,23 +1,23 @@
 # Testing
 
-While ``datajudge`` allows to express expectations via specifications, ``Requirement``s
-and ``Constraint``s, the execution of tests is delegated to pytest. As a
+While `datajudge` allows expressing expectations via specifications, `Requirement`s
+and `Constraint`s, the execution of tests is delegated to pytest. As a
 consequence, one may use any functionalities that pytest has to offer. Here, we
 want to illustrate some of these advanced functionalities that might turn out
 useful. 
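In a nutshell, the manual mechanism that the rest of this article elaborates on can be sketched in a few lines: since every `Requirement` is a collection of `Constraint` s, pytest's `parametrize` can turn each `Constraint` into its own independent test. In the sketch below, the `specification` module providing `requirements` is a hypothetical placeholder, and an `engine` fixture is assumed to be defined in `conftest.py`.

```python
# test_requirements.py -- a rough sketch, not a drop-in recipe.
import pytest

from specification import requirements  # hypothetical module with a list of Requirements


@pytest.mark.parametrize(
    "constraint",
    [constraint for requirement in requirements for constraint in requirement],
    ids=lambda constraint: constraint.get_description(),
)
def test_constraint(constraint, engine):
    # Each Constraint is tested on its own; one failure does not halt the rest.
    test_result = constraint.test(engine)
    assert test_result.outcome, test_result.failure_message
```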

-Yet, it should be noted that for most intents and purposes, using `datajudge` 's helper
 function :func:`~datajudge.pytest_integration.collect_data_tests` is a good starting
 point. It should work out of the box and hide some complexity. For exemplary applications, see the
 [companies example](examples/company-data.md) or the [twitch example](examples/twitch.md).
 
-Throughout this article we will not rely on ``collect_data_tests``. Instead we will more
-explicitly create a mechanism turning a List of ``Requirement`` objects into something
-that can be tested by pytest manually. Importantly, we want every ``Constraint`` of every
-``Requirement`` to be tested independently of each other. For instance, we would not like
+Throughout this article we will not rely on `collect_data_tests`. Instead we will more
+explicitly create a mechanism turning a List of `Requirement` objects into something
+that can be tested by pytest manually. Importantly, we want every `Constraint` of every
+`Requirement` to be tested independently of each other. For instance, we would not like
 one failing test to halt all others.
 
-Many of these approaches rely on adapting pytest's ``conftest.py``. If you are not familiar
+Many of these approaches rely on adapting pytest's `conftest.py`. If you are not familiar
 with this concept, you might want to read up on it
 [in the pytest docs](https://docs.pytest.org/en/latest/writing_plugins.html#conftest-py-plugins).
 
@@ -37,16 +37,16 @@ In this section, we present two approaches to do a subselection of tests.
 
 ### Ex-post: subselecting generated tests
 
-Instead of merely running ``$ pytest specification.py`` one may add pytests's
-``-k`` flag and specify the ``Constraint`` (s) one cares about.
+Instead of merely running `$ pytest specification.py` one may add pytest's
+`-k` flag and specify the `Constraint` (s) one cares about.
 
-Importantly, every ``Constraint`` object can be identified via a name. If one wants
+Importantly, every `Constraint` object can be identified via a name. If one wants
 to figure out how this string is built, please refer to the implementation of
 :meth:`~datajudge.constraints.base.Constraint.get_description`.
 Otherwise, one could also just run all of the tests once and investigate
 the resulting test report to find the relevant names.
 
-When only caring about the ``UniquesEquality`` constraint in our
-:doc:`twitch example `. one might for instance use
-the following prefix the filter for it:
+When only caring about the `UniquesEquality` constraint in our
+:doc:`twitch example `, one might, for instance, use
+the following prefix to filter for it:
 
@@ -58,14 +58,14 @@ pytest twitch_specification.py -k "UniquesEquality::public.twitch_v1"
 
 Another option to subselect a certain set of tests is by use of
 [pytest markers](https://docs.pytest.org/en/latest/example/markers.html).
 
-The following is one way of using markers in conjunction with ``datajudge``.
+The following is one way of using markers in conjunction with `datajudge`.
 In this particular illustration we'll allow for two markers:
 
-* ``basic``: indicating that only truly fundamental tests should be run
-* ``all``: indicating that any available test should be run
+- `basic`: indicating that only truly fundamental tests should be run
+- `all`: indicating that any available test should be run
 
-For that matter we'll add a bit of pytest magic to the respective ``conftest.py``. 
```python title="conftest.py" def pytest_generate_tests(metafunc): @@ -85,7 +85,7 @@ def pytest_generate_tests(metafunc): ) ``` -Moreover, we'll have to register these markers in pytest's ``[tool.pytest.ini_options]`` in `pyproject.toml`. +Moreover, we'll have to register these markers in pytest's `[tool.pytest.ini_options]` in `pyproject.toml`. You can read more about these files [here](https://docs.pytest.org/en/latest/customize.html). ```toml title="pyproject.toml" @@ -139,13 +139,13 @@ Once these changes are taken care of, one may run pytest specification.py -m basic ``` -to only test the basic ``Requirement`` s or +to only test the basic `Requirement` s or ```bash pytest specification.py -m all ``` -to test all ``Requirement``s. +to test all `Requirement`s. ## Using parameters in a specification @@ -160,10 +160,10 @@ those pointers or identifiers as parameters of the specification. For the sake of concreteness, we will assume here that we wish frame two identifiers as parameters: -* ``new_db``: the name of the 'new database' -* ``old_db``: the name of the 'old database' +- `new_db`: the name of the 'new database' +- `old_db`: the name of the 'old database' -In light of that we will again adapt pytest's ``conftest.py``: +In light of that we will again adapt pytest's `conftest.py`: ```python title="conftest.py" def pytest_addoption(parser): @@ -183,7 +183,7 @@ def pytest_generate_tests(metafunc): ) ``` -Now, we can make the creation of our ``Requirement``s and ``Constraint``s +Now, we can make the creation of our `Requirement`s and `Constraint`s dependent on these parameters: ```python title="application.py" @@ -218,17 +218,17 @@ pytest specification.py --new_db=db_v1 --old_db=db_v2 ## Html reports -By default, running ``pytest`` tests will output test results to one's respective shell. +By default, running `pytest` tests will output test results to one's respective shell. Alternatively, one might want to generate an html report summarizing and expanding on all test results. This can be advantageous for -* Sharing test results with colleagues -* Archiving and tracking test results over time -* Make underlying sql queries conveniently accessible +- Sharing test results with colleagues +- Archiving and tracking test results over time +- Make underlying sql queries conveniently accessible Concretely, such an html report can be generated by [pytest-html](https://github.com/pytest-dev/pytest-html). Once installed, using it is as simple -as appending ``--html=myreport.html`` to the pytest call. +as appending `--html=myreport.html` to the pytest call. In our twitch example, this generates [this html report](https://github.com/Quantco/datajudge/tree/main/docs/examples/twitch_report.html). @@ -238,14 +238,14 @@ Usually we not only care about knowing whether there is a problem with the data at hand and what it is. Rather, we would also like to fix it as fast and conveniently as possible. -For that matter, ``datajudge`` makes the queries it uses to assert testing predicates +For that matter, `datajudge` makes the queries it uses to assert testing predicates available via the :class:`datajudge.constraints.base.TestResult` class. Hence, if a test is failing, the user can jumpstart the investigation of the problem by reusing and potentially adapting the underlying queries. -Instead of simply running ``assert constraint.test(engine).outcome``, one may add -the ``TestResult``'s ``logging_message`` to e.g. 
a ``logger`` or add it to pytest -``extra``: +Instead of simply running `assert constraint.test(engine).outcome`, one may add +the `TestResult`'s `logging_message` to e.g. a `logger` or add it to pytest +`extra`: ```python from pytest_html import extras @@ -271,7 +271,7 @@ def test_constraint(constraint: Constraint, engine, extra): assert test_result.outcome ``` -Such a ``logging_message`` - with ready to execute sql queries - can look as follows: +Such a `logging_message` - with ready to execute sql queries - can look as follows: ```sql /* diff --git a/pyproject.toml b/pyproject.toml index a0f3ec48..6c6f6953 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,14 @@ ignore = [ "N803", # https://docs.astral.sh/ruff/rules/invalid-argument-name "N806", # https://docs.astral.sh/ruff/rules/non-lowercase-variable-in-function "E501", # https://docs.astral.sh/ruff/faq/#is-the-ruff-linter-compatible-with-black + "D100", + "D101", + "D102", + "D103", + "D104", + "D105", + "D106", + "D107", ] select = [ # pyflakes @@ -45,8 +53,15 @@ select = [ "N", # pyupgrade "UP", + # pydocstyle + "D", ] +[tool.ruff.lint.pydocstyle] +convention = "numpy" +[tool.ruff.lint.per-file-ignores] +"**/{tests,test_utils}/*" = ["D"] + [tool.ruff.lint.isort] known-first-party = ["datajudge"] diff --git a/src/datajudge/__init__.py b/src/datajudge/__init__.py index d7ac7b33..1ef7eb62 100644 --- a/src/datajudge/__init__.py +++ b/src/datajudge/__init__.py @@ -1,5 +1,6 @@ """datajudge allows to assess whether data from database complies with reference -information.""" +information. +""" from .constraints.base import Constraint from .db_access import Condition diff --git a/src/datajudge/constraints/column.py b/src/datajudge/constraints/column.py index dede0e78..b3c8f1fd 100644 --- a/src/datajudge/constraints/column.py +++ b/src/datajudge/constraints/column.py @@ -72,8 +72,7 @@ def compare( class ColumnType(Constraint): - """ - A class used to represent a ColumnType constraint. + """A class used to represent a ColumnType constraint. This class enables flexible specification of column types either in string format or using SQLAlchemy's type hierarchy. It checks whether a column's type matches the specified type, allowing for checks against backend-specific types, diff --git a/src/datajudge/constraints/stats.py b/src/datajudge/constraints/stats.py index 135ca4aa..7d1479d7 100644 --- a/src/datajudge/constraints/stats.py +++ b/src/datajudge/constraints/stats.py @@ -25,13 +25,11 @@ def __init__( def approximate_p_value( d: float, n_samples: int, m_samples: int ) -> Optional[float]: - """ - Calculates the approximate p-value according to + """Calculates the approximate p-value according to 'A procedure to find exact critical values of Kolmogorov-Smirnov Test', Silvia Fachinetti, 2009 Note: For environments with `scipy` installed, this method will return a quasi-exact p-value. """ - # approximation does not work for small sample sizes samples = min(n_samples, m_samples) if samples < 35: @@ -58,8 +56,7 @@ def approximate_p_value( def check_acceptance( d_statistic: float, n_samples: int, m_samples: int, accepted_level: float ) -> bool: - """ - For a given test statistic, d, and the respective sample sizes `n` and `m`, this function + """For a given test statistic, d, and the respective sample sizes `n` and `m`, this function checks whether the null hypothesis can be rejected for an accepted significance level. For more information, check out the `Wikipedia entry `_. 
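For intuition about what these statistics helpers compute, the classical large-sample approximation to the two-sample Kolmogorov-Smirnov p-value can be sketched in a few lines. The snippet below illustrates the generic textbook formula (the first term of the Kolmogorov series); it is an illustration only, not a verbatim copy of the implementation patched above.

```python
import math


def approximate_ks_p_value(d: float, n_samples: int, m_samples: int) -> float:
    """Large-sample approximation of the two-sample KS p-value.

    For the effective sample size en = n*m/(n+m), the p-value is
    approximately 2 * exp(-2 * d^2 * en).
    """
    en = n_samples * m_samples / (n_samples + m_samples)
    return min(1.0, 2.0 * math.exp(-2.0 * d * d * en))


# A sizable statistic on large samples yields a tiny p-value:
print(approximate_ks_p_value(0.2, 500, 400))  # roughly 2 * exp(-17.8)
```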
diff --git a/src/datajudge/constraints/uniques.py b/src/datajudge/constraints/uniques.py index 5319a823..ee325153 100644 --- a/src/datajudge/constraints/uniques.py +++ b/src/datajudge/constraints/uniques.py @@ -393,8 +393,7 @@ def test(self, engine: sa.engine.Engine) -> TestResult: class CategoricalBoundConstraint(Constraint): - """ - `CategoricalBoundConstraint` is a constraint class that checks if the share of specific values + """`CategoricalBoundConstraint` is a constraint class that checks if the share of specific values in a column falls within predefined bounds. It compares the actual distribution of values in a `DataSource` column with a target distribution, supplied as a dictionary. diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index 2f94ff5a..6250dfd1 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -42,10 +42,7 @@ def get_table_columns(table, column_names): def apply_patches(engine: sa.engine.Engine): - """ - Apply patches to e.g. specific dialect not implemented by sqlalchemy - """ - + """Apply patches to e.g. specific dialect not implemented by sqlalchemy""" if is_bigquery(engine): # Patch for the EXCEPT operator (see BigQuery set operators # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#set_operators) @@ -843,8 +840,7 @@ def get_column( *, aggregate_operator: Callable | None = None, ): - """ - Queries the database for the values of the relevant column (as returned by `get_column(...)`). + """Queries the database for the values of the relevant column (as returned by `get_column(...)`). If an aggregation operation is passed, the results are aggregated accordingly and a single scalar value is returned. """ @@ -1267,8 +1263,7 @@ def get_ks_2sample( ref1: DataReference, ref2: DataReference, ): - """ - Run the query for the two-sample Kolmogorov-Smirnov test and return the test statistic d. + """Run the query for the two-sample Kolmogorov-Smirnov test and return the test statistic d. For a raw-sql version of this query, please see this PR: https://github.com/Quantco/datajudge/pull/28/ diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py index 44482b32..de19d6ac 100644 --- a/src/datajudge/requirements.py +++ b/src/datajudge/requirements.py @@ -194,8 +194,7 @@ def add_column_type_constraint( name: Optional[str] = None, cache_size=None, ): - """ - Check if a column type matches the expected column_type. + """Check if a column type matches the expected column_type. The column_type can be provided as a string (backend-specific type name), a backend-specific SQLAlchemy type, or a SQLAlchemy's generic type. @@ -339,7 +338,6 @@ def add_uniques_equality_constraint( See the ``Uniques`` class for further parameter details on ``map_func`` and ``reduce_func``, and ``output_processors``. """ - ref = DataReference(self.data_source, columns, condition) self._constraints.append( uniques_constraints.UniquesEquality( @@ -399,7 +397,6 @@ def add_uniques_superset_constraint( See ``Uniques`` for further details on ``map_func``, ``reduce_func``, and ``output_processors``. """ - ref = DataReference(self.data_source, columns, condition) self._constraints.append( uniques_constraints.UniquesSuperset( @@ -465,7 +462,6 @@ def add_uniques_subset_constraint( See ``Uniques`` for further details on ``map_func``, ``reduce_func``, and ``output_processors``. 
""" - ref = DataReference(self.data_source, columns, condition) self._constraints.append( uniques_constraints.UniquesSubset( @@ -507,9 +503,7 @@ def add_categorical_bound_constraint( name: Optional[str] = None, cache_size=None, ): - """ - Check if the distribution of unique values in columns falls within the - specified minimum and maximum bounds. + """Check if the distribution of unique values in columns falls within the specified minimum and maximum bounds. The `CategoricalBoundConstraint` is added to ensure the distribution of unique values in the specified columns of a `DataSource` falls within the given minimum and maximum @@ -517,26 +511,28 @@ def add_categorical_bound_constraint( Parameters ---------- - columns : List[str] + columns: A list of column names from the `DataSource` to apply the constraint on. - distribution : Dict[T, Tuple[float, float]] + distribution: A dictionary where keys represent unique values and the corresponding tuple values represent the minimum and maximum allowed proportions of the respective unique value in the columns. - default_bounds : Tuple[float, float], optional, default=(0, 0) + default_bounds: A tuple specifying the minimum and maximum allowed proportions for all elements not mentioned in the distribution. By default, it's set to (0, 0), which means all elements not present in `distribution` will cause a constraint failure. - max_relative_violations : float, optional, default=0 + max_relative_violations: A tolerance threshold (0 to 1) for the proportion of elements in the data that can violate the bound constraints without triggering the constraint violation. - condition : Condition, optional + condition: An optional parameter to specify a `Condition` object to filter the data before applying the constraint. - name : str, optional + name: An optional parameter to provide a custom name for the constraint. + cache_size: + TODO - Example + Example: ------- This method can be used to test for consistency in columns with expected categorical values or ensure that the distribution of values in a column adheres to a certain @@ -554,7 +550,6 @@ def add_categorical_bound_constraint( ) ``` """ - ref = DataReference(self.data_source, columns, condition) self._constraints.append( uniques_constraints.CategoricalBoundConstraint( @@ -817,7 +812,6 @@ def add_date_no_overlap_constraint( For illustrative examples of this constraint, please refer to its test cases. """ - relevant_columns = [start_column, end_column] + ( key_columns if key_columns else [] ) @@ -922,8 +916,7 @@ def add_date_no_gap_constraint( name: Optional[str] = None, cache_size=None, ): - """ - Express that date range rows have no gap in-between them. + """Express that date range rows have no gap in-between them. The table under inspection must consist of at least one but up to many key columns, identifying an entity. Additionally, a ``start_column`` and an ``end_column``, @@ -978,8 +971,7 @@ def add_functional_dependency_constraint( ] = output_processor_limit, cache_size=None, ): - """ - Expresses a functional dependency, a constraint where the `value_columns` are uniquely determined by the `key_columns`. + """Expresses a functional dependency, a constraint where the `value_columns` are uniquely determined by the `key_columns`. This means that for each unique combination of values in the `key_columns`, there is exactly one corresponding combination of values in the `value_columns`. 
The ``add_unique_constraint`` constraint is a special case of this constraint, where the `key_columns` are a primary key, @@ -1017,8 +1009,7 @@ def add_numeric_no_gap_constraint( name: Optional[str] = None, cache_size=None, ): - """ - Express that numeric interval rows have no gaps larger than some max value in-between them. + """Express that numeric interval rows have no gaps larger than some max value in-between them. The table under inspection must consist of at least one but up to many key columns, identifying an entity. Additionally, a ``start_column`` and an ``end_column``, indicating interval start and end values, should be provided. @@ -1100,7 +1091,6 @@ def add_numeric_no_overlap_constraint( For illustrative examples of this constraint, please refer to its test cases. """ - relevant_columns = [start_column, end_column] + ( key_columns if key_columns else [] ) @@ -1130,8 +1120,7 @@ def add_varchar_regex_constraint( n_counterexamples: int = 5, cache_size=None, ): - """ - Assesses whether the values in a column match a given regular expression pattern. + """Assesses whether the values in a column match a given regular expression pattern. The option ``allow_none`` can be used in cases where the column is defined as nullable and contains null values. @@ -1174,8 +1163,7 @@ def add_varchar_regex_constraint_db( n_counterexamples: int = 5, cache_size=None, ): - """ - Assesses whether the values in a column match a given regular expression pattern. + """Assesses whether the values in a column match a given regular expression pattern. How the tolerance factor is calculated can be controlled with the ``aggregated`` flag. When ``True``, the tolerance is calculated using unique values. If not, the @@ -1263,7 +1251,6 @@ def add_groupby_aggregation_constraint( In order to allow for slight deviations from this pattern, ``tolerance`` expresses the fraction of all grouped-by rows, which may be incomplete ranges. """ - ref = DataReference(self.data_source, list(columns), condition) self._constraints.append( groupby_constraints.AggregateNumericRangeEquality( @@ -1664,7 +1651,6 @@ def add_uniques_equality_constraint( See :class:`~datajudge.constraints.uniques.Uniques` for further parameter details on ``map_func``, ``reduce_func``, and ``output_processors``. """ - ref = DataReference(self.data_source, columns1, condition1) ref2 = DataReference(self.data_source2, columns2, condition2) self._constraints.append( @@ -1727,7 +1713,6 @@ def add_uniques_superset_constraint( See :class:`~datajudge.constraints.uniques.Uniques` for further details on ``map_func``, ``reduce_func``, and ``output_processors``. """ - ref = DataReference(self.data_source, columns1, condition1) ref2 = DataReference(self.data_source2, columns2, condition2) self._constraints.append( @@ -1793,7 +1778,6 @@ def add_uniques_subset_constraint( See :class:`~datajudge.constraints.uniques.Uniques` for further details on ``map_func``, ``reduce_func``, and ``output_processors``. """ - ref = DataReference(self.data_source, columns1, condition1) ref2 = DataReference(self.data_source2, columns2, condition2) self._constraints.append( @@ -2020,7 +2004,7 @@ def add_column_type_constraint( name: Optional[str] = None, cache_size=None, ): - "Check that the columns have the same type." 
+ """Check that the columns have the same type.""" ref1 = DataReference(self.data_source, [column1]) ref2 = DataReference(self.data_source2, [column2]) self._constraints.append( @@ -2183,13 +2167,11 @@ def add_ks_2sample_constraint( significance_level: float = 0.05, cache_size=None, ): - """ - Apply the so-called two-sample Kolmogorov-Smirnov test to the distributions of the two given columns. + """Apply the so-called two-sample Kolmogorov-Smirnov test to the distributions of the two given columns. The constraint is fulfilled, when the resulting p-value of the test is higher than the significance level (default is 0.05, i.e., 5%). The signifance_level must be a value between 0.0 and 1.0. """ - if not column1 or not column2: raise ValueError( "Column names have to be given for this test's functionality." diff --git a/src/datajudge/utils.py b/src/datajudge/utils.py index ad47607f..5fc5bee3 100644 --- a/src/datajudge/utils.py +++ b/src/datajudge/utils.py @@ -8,8 +8,7 @@ def _fmt_diff_part(s, d): def format_difference( n1: Union[float, int], n2: Union[float, int], decimal_separator: bool = True ) -> Tuple[str, str]: - """ - Given two numbers, n1 and n2, return a tuple of two strings, + """Given two numbers, n1 and n2, return a tuple of two strings, each representing one of the input numbers with the differing part highlighted. Highlighting is done using BBCode-like tags, which are replaced by the formatter. @@ -55,8 +54,7 @@ def __call__( # noqa: E704 def output_processor_sort( collection: Collection, counts: Optional[Collection] = None ) -> Tuple[Collection, Optional[Collection]]: - """ - Sorts a collection of tuple elements in descending order of their counts, + """Sorts a collection of tuple elements in descending order of their counts, and for ties, makes use of the ascending order of the elements themselves. If the first element is not instanceof tuple, @@ -90,8 +88,7 @@ def output_processor_sort( def output_processor_limit( collection: Collection, counts: Optional[Collection] = None, limit: int = 100 ) -> Tuple[Collection, Optional[Collection]]: - """ - Limits the collection to the first ``limit`` elements. + """Limits the collection to the first ``limit`` elements. If the list was shortened, will add a ``limit+1``-th string element, informing the user of the truncation. @@ -142,8 +139,7 @@ def filternull_element_or_tuple_any(values: List) -> List: def sort_tuple_none_aware( collection: Collection[Tuple], ascending=True ) -> Collection[Tuple]: - """ - Stable sort of a collection of tuples. + """Stable sort of a collection of tuples. Each tuple in the collection must have the same length, since they are treated as rows in a table, with ``elem[0]`` being the first column, From d9c714c81d8ec9a642b947d0ea0f4c559876ca27 Mon Sep 17 00:00:00 2001 From: Thomas Marwitz Date: Wed, 14 Aug 2024 14:57:48 +0200 Subject: [PATCH 3/3] Render all code documentation, exemplary link within docstring. Include a link example to another module function (:meth: in sphinx). Fix some ruff complaints. --- docs/testing.md | 2 +- mkdocs.yml | 1 + src/datajudge/requirements.py | 35 +++++++++++++++++++---------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/docs/testing.md b/docs/testing.md index 49eb8998..c5a72fa1 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -6,7 +6,7 @@ consequence, one may use any functionalities that pytest has to offer. Here, we want to illustrate some of these advanced functionalities that might turn out useful. 
Yet, it should be noted that for most intents and purposes, using `datajudge` 's helper
-function :func:`~datajudge.pytest_integration.collect_data_tests` is a good starting
+function [`collect_data_tests`][datajudge.pytest_integration.collect_data_tests] is a good starting
 point. It should work out of the box and hide some complexity. For exemplary applications, see the
 [companies example](examples/company-data.md) or the [twitch example](examples/twitch.md).
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 2a129df3..88d52847 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -48,6 +48,7 @@ plugins:
       docstring_section_style: spacy
       separate_signature: true
       merge_init_into_class: true
+      show_submodules: true # show *all* code contained in paths
 
 nav:
   - installation.md
diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py
index de19d6ac..13b80201 100644
--- a/src/datajudge/requirements.py
+++ b/src/datajudge/requirements.py
@@ -316,27 +316,27 @@ def add_uniques_equality_constraint(
 ):
     """Check if the data's unique values are equal to a given set of values.
 
-    The ``UniquesEquality`` constraint asserts if the values contained in a column
-    of a ``DataSource`` are strictly the ones of a reference set of expected values,
-    specified via the ``uniques`` parameter.
+    The `UniquesEquality` constraint asserts if the values contained in a column
+    of a `DataSource` are strictly the ones of a reference set of expected values,
+    specified via the `uniques` parameter.
 
-    Null values in the columns ``columns`` are ignored. To assert the non-existence of them use
-    the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint`` helper method
-    for ``WithinRequirement``.
+    Null values in the columns `columns` are ignored. To assert the non-existence of them use
+    the [add_null_absence_constraint][datajudge.requirements.WithinRequirement.add_null_absence_constraint] helper method
+    for `WithinRequirement`.
     By default, the null filtering does not trigger if multiple columns are fetched at once.
-    It can be configured in more detail by supplying a custom ``filter_func`` function.
+    It can be configured in more detail by supplying a custom `filter_func` function.
 
     Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
-    :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+    [filternull_never][datajudge.utils.filternull_never], :func:`~datajudge.utils.filternull_element_or_tuple_all`,
     :func:`~datajudge.utils.filternull_element_or_tuple_any`.
-    Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element` but triggers a warning.
-    The current default of :func:`~datajudge.utils.filternull_element`
-    Cause (possibly often unintended) changes in behavior when the users adds a second column
-    (filtering no longer can trigger at all).
+    Passing `None` as the argument is equivalent to :func:`~datajudge.utils.filternull_element` but triggers a warning.
+    The current default of :func:`~datajudge.utils.filternull_element`
+    can cause (possibly often unintended) changes in behavior when the user adds a second column
+    (filtering can no longer trigger at all).
     The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
-    To silence the warning, set ``filter_func`` explicitly.
+    To silence the warning, set `filter_func` explicitly.
 
-    See the ``Uniques`` class for further parameter details on ``map_func`` and
-    ``reduce_func``, and ``output_processors``.
+    See the `Uniques` class for further parameter details on `map_func` and
+    `reduce_func`, and `output_processors`. 
""" ref = DataReference(self.data_source, columns, condition) self._constraints.append( @@ -972,6 +972,7 @@ def add_functional_dependency_constraint( cache_size=None, ): """Expresses a functional dependency, a constraint where the `value_columns` are uniquely determined by the `key_columns`. + This means that for each unique combination of values in the `key_columns`, there is exactly one corresponding combination of values in the `value_columns`. The ``add_unique_constraint`` constraint is a special case of this constraint, where the `key_columns` are a primary key, @@ -1010,6 +1011,7 @@ def add_numeric_no_gap_constraint( cache_size=None, ): """Express that numeric interval rows have no gaps larger than some max value in-between them. + The table under inspection must consist of at least one but up to many key columns, identifying an entity. Additionally, a ``start_column`` and an ``end_column``, indicating interval start and end values, should be provided. @@ -1583,7 +1585,7 @@ def add_max_null_fraction_constraint( Given that ``column2``\'s underlying data has a fraction ``q`` of ``NULL`` values, the ``max_relative_deviation`` parameter allows ``column1``\'s underlying data to have a fraction ``(1 + max_relative_deviation) * q`` of ``NULL`` values. - """ + """ # noqa: D301 ref = DataReference(self.data_source, [column1], condition1) ref2 = DataReference(self.data_source2, [column2], condition2) self._constraints.append( @@ -2028,7 +2030,7 @@ def add_row_equality_constraint( In other words, :math:`\\frac{|T1 - T2| + |T2 - T1|}{|T1 \\cup T2|} \\leq` ``max_missing_fraction``. Rows from T1 are indexed in ``columns1``, rows from T2 are indexed in ``columns2``. - """ + """ # noqa: D301 ref = DataReference(self.data_source, columns1, condition1) ref2 = DataReference(self.data_source2, columns2, condition2) self._constraints.append( @@ -2062,7 +2064,7 @@ def add_row_subset_constraint( contrast to ``EXCEPT ALL``, this should lead to a set subtraction instead of a multiset subtraction. In other words, duplicates in T1 are treated as single occurrences. - """ + """ # noqa: D301 max_missing_fraction_getter = self.get_deviation_getter( constant_max_missing_fraction, date_range_loss_fraction ) @@ -2095,7 +2097,7 @@ def add_row_superset_constraint( :math:`\\frac{|T2-T1|}{|T2|} \\leq` ``max_missing_fraction``. Rows from T1 are indexed in ``columns1``, rows from T2 are indexed in ``columns2``. - """ + """ # noqa: D301 max_missing_fraction_getter = self.get_deviation_getter( constant_max_missing_fraction, date_range_loss_fraction ) @@ -2168,6 +2170,7 @@ def add_ks_2sample_constraint( cache_size=None, ): """Apply the so-called two-sample Kolmogorov-Smirnov test to the distributions of the two given columns. + The constraint is fulfilled, when the resulting p-value of the test is higher than the significance level (default is 0.05, i.e., 5%). The signifance_level must be a value between 0.0 and 1.0.