diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 9694b984..3429a4f7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -28,7 +28,6 @@ jobs:
       - name: Run pre-commit checks
         uses: pre-commit/action@v3.0.0
 
-
   linux:
     name: "Linux - unit tests - Python ${{ matrix.PYTHON_VERSION }}"
     runs-on: ubuntu-latest
@@ -228,3 +227,58 @@ jobs:
         uses: codecov/codecov-action@v3.1.1
         with:
           file: ./coverage.xml
+
+  linux-integration_tests-impala:
+    name: "Linux - integration tests - Python ${{ matrix.PYTHON_VERSION }} - Impala - ${{ matrix.PART }}"
+    runs-on: ubuntu-20.04
+    env:
+      CI: True
+    strategy:
+      fail-fast: false
+      matrix:
+        PYTHON_VERSION: ['3.8']
+        # The Impala suite is split into parts, one job per PART. Each
+        # `include` entry below attaches the pytest selection of its PART.
+        PART: [pt1, pt2, pt3, pt4, pt5, pt6, pt7]
+        include:
+          - PART: pt1
+            PYTEST_ARGS: "tests/integration/test_column_capitalization.py"
+          - PART: pt2
+            PYTEST_ARGS: "tests/integration/test_data_source.py"
+          - PART: pt3
+            PYTEST_ARGS: "tests/integration/test_integration.py -k row"
+          - PART: pt4
+            PYTEST_ARGS: "tests/integration/test_integration.py -k uniques"
+          - PART: pt5
+            PYTEST_ARGS: "tests/integration/test_integration.py -k date"
+          - PART: pt6
+            PYTEST_ARGS: "tests/integration/test_integration.py -k varchar"
+          - PART: pt7
+            PYTEST_ARGS: "tests/integration/test_integration.py -k numeric"
+
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.head_ref }}
+      - name: Fetch full git history
+        run: git fetch --prune --unshallow
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: ${{ matrix.PYTHON_VERSION }}
+          miniforge-variant: Mambaforge
+          miniforge-version: 4.11.0-2
+          use-mamba: true
+          environment-file: environment.yml
+          activate-environment: datajudge
+      - name: Set up container
+        run: docker compose up --wait impala
+      - name: Run Integration Tests
+        shell: bash -l {0}
+        run: |
+          flit install -s
+          pytest --cov=datajudge --cov-report=xml --cov-append --backend=impala ${{ matrix.PYTEST_ARGS }}
+      - name: Generate code coverage report
+        uses: codecov/codecov-action@v3.1.1
+        with:
+          file: ./coverage.xml
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index b7a6434f..70ea3981 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -15,6 +15,7 @@ Changelog
 - Implement :meth:`~datajudge.BetweenRequirement.add_column_type_constraint`. Previously, only the ``WithinRequirement`` method existed.
 - Implemented an option ``infer_pk`` to automatically retrieve and primary key definition as part of :meth:`datajudge.WithinRequirement.add_uniqueness_constraint`.
 - Added a ``name`` parameter to all ``add_x_constraint`` methods of ``WithinRequirement`` and ``BetweenRequirement``. This will give pytest test a custom name.
+- Added preliminary support for Impala.
 
 
 1.2.0 - 2022.10.21
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 00000000..1efa1490
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,67 @@
+# Taken from
+# https://github.com/ibis-project/ibis/blob/master/docker-compose.yml
+version: "3.4"
+services:
+  impala:
+    depends_on:
+      - impala-postgres
+      - kudu
+    environment:
+      PGPASSWORD: postgres
+    # Healthy once both TCP ports accept connections; `docker compose up --wait`
+    # blocks until this check passes.
+    healthcheck:
+      interval: 30s
+      retries: 20
+      test:
+        - CMD-SHELL
+        - nc -z 127.0.0.1 21050 && nc -z 127.0.0.1 50070
+      timeout: 10s
+    hostname: localhost
+    image: ibisproject/impala:latest
+    # Port mappings are quoted so that YAML always parses them as strings.
+    ports:
+      - "21050:21050"
+    networks:
+      - impala
+  impala-postgres:
+    user: postgres
+    hostname: postgres
+    environment:
+      POSTGRES_PASSWORD: postgres
+    healthcheck:
+      interval: 10s
+      retries: 3
+      test:
+        - CMD
+        - pg_isready
+      timeout: 5s
+    image: postgres:13.9-alpine
+    networks:
+      - impala
+  kudu:
+    cap_add:
+      - SYS_TIME
+    depends_on:
+      - kudu-tserver
+    environment:
+      KUDU_MASTER: "true"
+    image: ibisproject/kudu:latest
+    ports:
+      - "7051:7051"
+      - "8051:8051"
+    networks:
+      - impala
+  kudu-tserver:
+    cap_add:
+      - SYS_TIME
+    environment:
+      KUDU_MASTER: "false"
+    image: ibisproject/kudu:latest
+    ports:
+      - "7050:7050"
+      - "8050:8050"
+    networks:
+      - impala
+networks:
+  impala:
diff --git a/environment.yml b/environment.yml
index 465531aa..8687f3bb 100644
--- a/environment.yml
+++ b/environment.yml
@@ -24,3 +24,4 @@ dependencies:
   - flit-core
   - flit
   - sphinx-autodoc-typehints
+  - impyla
diff --git a/src/datajudge/constraints/miscs.py b/src/datajudge/constraints/miscs.py
index 6918df06..153a2425 100644
--- a/src/datajudge/constraints/miscs.py
+++ b/src/datajudge/constraints/miscs.py
@@ -15,6 +15,10 @@ def __init__(self, ref, primary_keys: List[str], name: str = None):
     def retrieve(
         self, engine: sa.engine.Engine, ref: DataReference
     ) -> Tuple[Set[str], OptionalSelections]:
+        # Fail fast: primary key retrieval is not supported on Impala, so
+        # db_access.get_primary_keys is never called for that backend.
+        if db_access.is_impala(engine):
+            raise NotImplementedError("Primary key retrieval does not work for Impala.")
         values, selections = db_access.get_primary_keys(engine, self.ref)
         return set(values), selections
 
diff --git a/src/datajudge/constraints/row.py b/src/datajudge/constraints/row.py
index 45fc1ff1..0eef0c8c 100644
--- a/src/datajudge/constraints/row.py
+++ b/src/datajudge/constraints/row.py
@@ -21,6 +21,9 @@ def __init__(
         self.max_missing_fraction_getter = max_missing_fraction_getter
 
     def test(self, engine: sa.engine.Engine) -> TestResult:
+        # Fail fast: this constraint is not implemented for Impala yet.
+        if db_access.is_impala(engine):
+            raise NotImplementedError("Currently not implemented for impala.")
         self.max_missing_fraction = self.max_missing_fraction_getter(engine)
         self.ref1_minus_ref2_sample, _ = db_access.get_row_difference_sample(
             engine, self.ref, self.ref2
diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py
index 8260e8fb..7555a77f 100644
--- a/src/datajudge/db_access.py
+++ b/src/datajudge/db_access.py
@@ -29,6 +29,10 @@ def is_bigquery(engine: sa.engine.Engine) -> bool:
     return engine.name == "bigquery"
 
 
+def is_impala(engine: sa.engine.Engine) -> bool:
+    return engine.name == "impala"
+
+
 def get_table_columns(table, column_names):
     return [table.c[column_name] for column_name in column_names]
 
@@ -408,6 +412,15 @@ def get_date_span(engine, ref, date_column_name):
                 )
             ]
         )
+    elif is_impala(engine):
+        selection = sa.select(
+            [
+                sa.func.datediff(
+                    sa.func.to_date(sa.func.max(column)),
+                    sa.func.to_date(sa.func.min(column)),
+                )
+            ]
+        )
     else:
         raise NotImplementedError(
             "Date spans not yet implemented for this sql dialect."
@@ -620,6 +633,14 @@ def get_date_gaps(
             )
             > legitimate_gap_size
         )
+    elif is_impala(engine):
+        gap_condition = (
+            sa.func.datediff(
+                sa.func.to_date(start_table.c[start_column]),
+                sa.func.to_date(end_table.c[end_column]),
+            )
+            > legitimate_gap_size
+        )
     elif is_bigquery(engine):
         # see https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff
         # Note that to have a gap (positive date_diff), the first date (start table)
@@ -728,6 +749,8 @@ def get_max(engine, ref):
 
 def get_mean(engine, ref):
     def column_operator(column):
+        if is_impala(engine):
+            return sa.func.avg(column)
         return sa.func.avg(sa.cast(column, sa.DECIMAL))
 
     return get_column(engine, ref, aggregate_operator=column_operator)
@@ -1124,9 +1147,13 @@ def get_regex_violations(engine, ref, aggregated, regex, n_counterexamples):
     if aggregated:
         subquery = subquery.distinct()
     subquery = subquery.subquery()
-    violation_selection = sa.select(subquery.c[column]).where(
-        sa.not_(subquery.c[column].regexp_match(regex))
-    )
+    # Only the match expression is backend-specific: Impala goes through its
+    # regexp_like function, every other backend through regexp_match.
+    if is_impala(engine):
+        is_match = sa.func.regexp_like(subquery.c[column], regex)
+    else:
+        is_match = subquery.c[column].regexp_match(regex)
+    violation_selection = sa.select(subquery.c[column]).where(sa.not_(is_match))
     n_violations_selection = sa.select([sa.func.count()]).select_from(
         violation_selection.subquery()
     )
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index c0375056..9012412f 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -6,8 +6,8 @@
 
 import pytest
 import sqlalchemy as sa
 
-from datajudge.db_access import apply_patches, is_bigquery, is_mssql
+from datajudge.db_access import apply_patches, is_bigquery, is_impala, is_mssql
 
 TEST_DB_NAME = "tempdb"
 SCHEMA = "dbo"  # 'dbo' is the standard schema in mssql
@@ -15,6 +15,19 @@
 
 def get_engine(backend) -> sa.engine.Engine:
     address = os.environ.get("DB_ADDR", "localhost")
+
+    if backend == "impala":
+        from impala.dbapi import connect  # lazy: only this backend needs impyla
+
+        def conn_creator():
+            return connect(
+                host=address,
+                port=21050,
+                database="default",
+            )
+
+        return sa.create_engine("impala://", creator=conn_creator)
+
     if backend == "postgres":
         connection_string = f"postgresql://datajudge:datajudge@{address}:5432/datajudge"
     elif "mssql" in backend:
@@ -47,7 +60,7 @@ def get_engine(backend) -> sa.engine.Engine:
 def engine(backend):
     engine = get_engine(backend)
     with engine.connect() as conn:
-        if engine.name in ("postgresql", "bigquery"):
+        if engine.name in ("postgresql", "bigquery", "impala"):
             conn.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
     return engine
 
@@ -693,7 +706,7 @@ def random_normal_table(engine, metadata):
     Table with normally distributed values of varying means and sd 1.
     """
 
-    if is_bigquery(engine):
+    if is_bigquery(engine) or is_impala(engine):
         # It takes too long to insert the table into BigQuery,
         # test using this fixture must be disabled for BigQuery
         return None, None, None
@@ -737,6 +750,10 @@ def capitalization_table(engine, metadata):
     elif is_bigquery(engine):
         str_datatype = "STRING"
         primary_key = ""  # there is no primary key in BigQuery
+    elif is_impala(engine):
+        str_datatype = "STRING"
+        # Impala supports primary keys but uses a different grammar.
+        primary_key = ""
     else:
         str_datatype = "TEXT"
     with engine.connect() as connection:
@@ -778,7 +795,14 @@ def cross_cdf_table2(engine, metadata):
 def pytest_addoption(parser):
     parser.addoption(
         "--backend",
-        choices=(("mssql", "mssql-freetds", "postgres", "snowflake", "bigquery")),
+        choices=(
+            "mssql",
+            "mssql-freetds",
+            "postgres",
+            "snowflake",
+            "bigquery",
+            "impala",
+        ),
         help="which database backend to use to run the integration tests",
     )
 
diff --git a/tests/integration/test_column_capitalization.py b/tests/integration/test_column_capitalization.py
index 80d03b11..688474d0 100644
--- a/tests/integration/test_column_capitalization.py
+++ b/tests/integration/test_column_capitalization.py
@@ -1,7 +1,7 @@
 import pytest
 
 from datajudge import Condition, WithinRequirement
-from datajudge.db_access import is_bigquery, is_mssql, is_postgresql
+from datajudge.db_access import is_bigquery, is_impala, is_mssql, is_postgresql
 
 
 # These tests
@@ -15,6 +15,10 @@ def test_column_existence(
         pytest.skip("Mssql interface expects exact capitalization.")
     if is_bigquery(engine) and use_uppercase_column != use_uppercase_query:
         pytest.skip("BigQuery interface expects exact capitalization.")
+    if is_impala(engine) and use_uppercase_query:
+        pytest.skip(
+            "Impala interface transforms writes to lower-case, expects lower-case reads."
+        )
     if is_postgresql(engine):
         pytest.skip("Postgres interface always expects lower-cased columns.")
     (
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
index 7e14eeda..ea3aa1f9 100644
--- a/tests/integration/test_integration.py
+++ b/tests/integration/test_integration.py
@@ -6,6 +6,7 @@
 from datajudge.db_access import (
     Condition,
     is_bigquery,
+    is_impala,
     is_mssql,
     is_postgresql,
     is_snowflake,
@@ -256,7 +257,8 @@ def test_n_uniques_equality_within(engine, unique_table1, data):
     (operation, columns, n_uniques, condition) = data
     req = requirements.WithinRequirement.from_table(*unique_table1)
     req.add_n_uniques_equality_constraint(columns, n_uniques, condition=condition)
-    assert operation(req[0].test(engine).outcome)
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -849,7 +851,8 @@ def test_numeric_mean_between(engine, int_table1, int_table2, data):
         condition1=condition1,
         condition2=condition2,
     )
-    assert operation(req[0].test(engine).outcome)
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -1438,7 +1441,12 @@ def test_varchar_regex_counterexample_invalid(
 def test_backend_dependent_condition(engine, mix_table1):
     if is_mssql(engine):
         condition = Condition(raw_string="DATALENGTH(col_varchar) = 3")
-    elif is_postgresql(engine) or is_snowflake(engine) or is_bigquery(engine):
+    elif (
+        is_postgresql(engine)
+        or is_snowflake(engine)
+        or is_bigquery(engine)
+        or is_impala(engine)
+    ):
         condition = Condition(raw_string="LENGTH(col_varchar) = 3")
     else:
         raise NotImplementedError(f"Unexpected backend: {engine.name}")
@@ -1614,11 +1622,14 @@ def test_column_type_between(engine, get_fixture, data):
 def test_primary_key_definition_within(engine, pk_table, data):
     if is_bigquery(engine):
         pytest.skip("No primary key concept in BigQuery")
+    if is_impala(engine):
+        pytest.skip("Currently not implemented for impala.")
 
     (operation, columns) = data
     req = requirements.WithinRequirement.from_table(*pk_table)
     req.add_primary_key_definition_constraint(columns)
-    assert operation(req[0].test(engine).outcome)
+    test_result = req[0].test(engine)
+    assert operation(test_result.outcome), test_result.failure_message
 
 
 @pytest.mark.parametrize(
@@ -1647,7 +1658,7 @@ def test_uniqueness_within(engine, mix_table2, data):
     # is not correctly compiled when dealing with snowflake or bigquery.
     # Use the mod function instead
     if (
-        (is_snowflake(engine) or is_bigquery(engine))
+        (is_snowflake(engine) or is_bigquery(engine) or is_impala(engine))
         and condition is not None
         and condition.raw_string is not None
         and "% 2 = 0" in condition.raw_string
@@ -1670,6 +1681,8 @@ def test_uniqueness_within(engine, mix_table2, data):
     ],
 )
 def test_uniqueness_within_infer_pk(engine, data, mix_table2_pk):
+    if is_impala(engine):
+        pytest.skip("Primary key retrieval currently not implemented for impala.")
     if is_bigquery(engine):
         pytest.skip("No primary key concept in BigQuery")
     # We purposefully select a non-unique column ["col_date"] to validate
@@ -1707,6 +1720,8 @@ def test_null_absence_within(engine, get_fixture, data):
 )
 def test_column_type_within(engine, mix_table1, data):
     (operation, col_name, type_name) = data
+    if is_impala(engine):
+        type_name = {"VARCHAR": "string", "INTEGER": "int"}[type_name]
     req = requirements.WithinRequirement.from_table(*mix_table1)
     req.add_column_type_constraint(col_name, type_name)
     test_result = req[0].test(engine)
@@ -1747,6 +1762,8 @@ def test_column_type_within(engine, mix_table1, data):
     ],
 )
 def test_row_equality_between(engine, mix_table1, mix_table2, data):
+    if is_impala(engine):
+        pytest.skip("Currently not implemented for Impala. EXCEPT throws syntax error.")
     (operation, columns, max_missing_fraction, condition1, condition2) = data
     req = requirements.BetweenRequirement.from_tables(*mix_table1, *mix_table2)
     req.add_row_equality_constraint(
@@ -1788,6 +1805,8 @@ def test_row_equality_between(engine, mix_table1, mix_table2, data):
     ],
 )
 def test_row_subset_between(engine, mix_table1, mix_table2, data):
+    if is_impala(engine):
+        pytest.skip("Currently not implemented for Impala. EXCEPT throws syntax error.")
     (
         operation,
         columns,
@@ -1840,6 +1859,8 @@ def test_row_subset_between(engine, mix_table1, mix_table2, data):
     ],
 )
 def test_row_superset_between(engine, mix_table2, mix_table1, data):
+    if is_impala(engine):
+        pytest.skip("Currently not implemented for Impala. EXCEPT throws syntax error.")
     (
         operation,
         columns,
@@ -1883,6 +1904,8 @@ def test_row_superset_between(engine, mix_table2, mix_table1, data):
     ],
 )
 def test_row_matching_equality(engine, row_match_table1, row_match_table2, data):
+    if is_impala(engine):
+        pytest.skip("Currently not implemented for Impala. EXCEPT throws syntax error.")
     (
         operation,
         matching_columns,
@@ -1909,6 +1932,8 @@ def test_row_matching_equality(engine, row_match_table1, row_match_table2, data)
 @pytest.mark.parametrize("key", [("some_id",), ("some_id", "extra_id")])
 def test_groupby_aggregation_within(engine, groupby_aggregation_table_correct, key):
     skip_if_mssql(engine)
+    if is_impala(engine):
+        pytest.skip("array_agg does not exist for Impala.")
     req = requirements.WithinRequirement.from_table(*groupby_aggregation_table_correct)
     req.add_groupby_aggregation_constraint(key, "value", 1)
     test_result = req[0].test(engine)
@@ -1921,6 +1946,8 @@ def test_groupby_aggregation_within_with_failures(
     engine, groupby_aggregation_table_incorrect, tolerance, operation, key
 ):
     skip_if_mssql(engine)
+    if is_impala(engine):
+        pytest.skip("array_agg does not exist for Impala.")
     req = requirements.WithinRequirement.from_table(
         *groupby_aggregation_table_incorrect
     )
@@ -2036,8 +2063,8 @@ def test_ks_2sample_constraint_wrong_between(
 )
 def test_ks_2sample_random(engine, random_normal_table, configuration):
 
-    if is_bigquery(engine):
-        pytest.skip("It takes too long to insert the table into BigQuery")
+    if is_bigquery(engine) or is_impala(engine):
+        pytest.skip("It takes too long to insert the table.")
 
     (operation, col_1, col_2, min_p_value) = configuration
     req = requirements.BetweenRequirement.from_tables(
diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py
index 5b62d3d5..9a4fdfb2 100644
--- a/tests/integration/test_stats.py
+++ b/tests/integration/test_stats.py
@@ -1,7 +1,7 @@
 import pytest
 
 import datajudge
-from datajudge.db_access import DataReference, TableDataSource, is_bigquery
+from datajudge.db_access import DataReference, TableDataSource, is_bigquery, is_impala
 
 
 def test_cross_cdf_selection(engine, cross_cdf_table1, cross_cdf_table2):
@@ -39,7 +39,7 @@ def test_cross_cdf_selection(engine, cross_cdf_table1, cross_cdf_table2):
 )
 def test_ks_2sample_calculate_statistic(engine, random_normal_table, configuration):
 
-    if is_bigquery(engine):
-        pytest.skip("It takes too long to insert the table into BigQuery")
+    if is_bigquery(engine) or is_impala(engine):
+        pytest.skip("It takes too long to insert the table.")
 
     col_1, col_2, expected_d, expected_p = configuration