Skip to content

Commit

Permalink
feat: google big query database selection much faster [TCTC-6113] (#1126)
Browse files Browse the repository at this point in the history

* poc: google big query database selection much faster

* feat: clean and update

* feat: update database fetch

* feat: update on the db selection to check some stuffs

* feat: update the field to db_schema

* oupsi, we shouldn't remove the database field

* chore: clean && update

* feat:keep the default database but return also the list of schema

* chore: remove and clean print

* feat: refactor to add schema to get_model

* feat: we don't need project_tree in this connector

* feat: more update with the new form of the schema_name

* test: update tests

* chore: update on changelog

* fix: more tests

* test: coverage

* oupsi
  • Loading branch information
Sanix-Darker authored Jul 7, 2023
1 parent c20d55e commit eca4914
Show file tree
Hide file tree
Showing 3 changed files with 289 additions and 75 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Unreleased

## Changed

- Feat[Google Big Query]: We can now get the database model (list of tables) based on a given schema name to speed up the project tree structure.
- Fix: on mysql, avoid duplicated columns when retrieving table information

### [4.6.0] 2023-06-02
Expand Down
249 changes: 206 additions & 43 deletions tests/google_big_query/test_google_big_query.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import Any, Generator
from unittest.mock import patch

import pandas
Expand Down Expand Up @@ -180,8 +181,8 @@ class FakeResponse:
def __init__(self) -> None:
...

def to_dataframe(self) -> pd.DataFrame:
return pd.DataFrame(
def to_dataframe(self) -> Generator[Any, Any, Any]:
yield pd.DataFrame(
[
{
'name': 'coucou',
Expand Down Expand Up @@ -271,6 +272,11 @@ def to_dataframe(self) -> pd.DataFrame:
return_value=Client,
)

mocker.patch(
'toucan_connectors.google_big_query.google_big_query_connector.GoogleBigQueryConnector._fetch_query_results',
return_value=FakeResponse().to_dataframe(),
)

mocker.patch(
'toucan_connectors.google_big_query.google_big_query_connector.GoogleBigQueryConnector._get_google_credentials',
return_value=Credentials,
Expand Down Expand Up @@ -321,51 +327,156 @@ def to_dataframe(self) -> pd.DataFrame:
assert (
mocked_query.call_args_list[0][0][0]
== """
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
UNION ALL
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
UNION ALL
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM taar.INFORMATION_SCHEMA.COLUMNS C
JOIN taar.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
taar.INFORMATION_SCHEMA.COLUMNS C
JOIN taar.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
"""
)
mocked_query.reset_mock()

mocker.patch(
'toucan_connectors.google_big_query.google_big_query_connector.GoogleBigQueryConnector._fetch_query_results',
return_value=FakeResponse().to_dataframe(),
)

connector.get_model('some-db')
assert (
mocked_query.call_args_list[0][0][0]
== """
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
AND T.table_catalog = 'some-db'
UNION ALL
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
AND T.table_catalog = 'some-db'
UNION ALL
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM taar.INFORMATION_SCHEMA.COLUMNS C
JOIN taar.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
taar.INFORMATION_SCHEMA.COLUMNS C
JOIN taar.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
AND T.table_catalog = 'some-db'
"""
)

mocked_query.reset_mock()

mocker.patch(
'toucan_connectors.google_big_query.google_big_query_connector.GoogleBigQueryConnector._fetch_query_results',
return_value=FakeResponse().to_dataframe(),
)

connector.get_model('some-db', 'foooo')

# since we specified only the foooo schema we should only get the query for
# it
assert (
mocked_query.call_args_list[0][0][0]
== """
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
AND T.table_catalog = 'some-db'
"""
)
Expand Down Expand Up @@ -468,46 +579,98 @@ def test_get_model_multi_location(mocker: MockFixture, _fixture_credentials) ->
assert (
mocked_query.call_args_list[0][0][0]
== """
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
UNION ALL
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
"""
)
# No location should be specified in the happy path
assert mocked_query.call_args_list[0][1] == {}
assert (
mocked_query.call_args_list[1][0][0]
== """
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
foooo.INFORMATION_SCHEMA.COLUMNS C
JOIN foooo.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
"""
)
# Next calls should specify the location
assert mocked_query.call_args_list[1][1] == {'location': 'Paris'}
assert (
mocked_query.call_args_list[2][0][0]
== """
SELECT C.table_name AS name, C.table_schema AS schema, T.table_catalog AS database,
T.table_type AS type, C.column_name, C.data_type FROM baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T ON C.table_name = T.table_name
WHERE IS_SYSTEM_DEFINED='NO' AND IS_PARTITIONING_COLUMN='NO' AND IS_HIDDEN='NO'
SELECT
C.table_name AS name,
C.table_schema AS schema,
T.table_catalog AS database,
T.table_type AS type,
C.column_name,
C.data_type
FROM
baarrrr.INFORMATION_SCHEMA.COLUMNS C
JOIN baarrrr.INFORMATION_SCHEMA.TABLES T
ON C.table_name = T.table_name
WHERE
IS_SYSTEM_DEFINED = 'NO'
AND IS_PARTITIONING_COLUMN = 'NO'
AND IS_HIDDEN = 'NO'
"""
)
# Next calls should specify the location
assert mocked_query.call_args_list[2][1] == {'location': 'Toulouse'}


def test_get_form(_fixture_credentials: MockFixture) -> None:
def test_get_form(mocker: MockFixture, _fixture_credentials: MockFixture) -> None:
def mock_available_schs():
return ['ok', 'test']

mocker.patch(
'toucan_connectors.google_big_query.google_big_query_connector.GoogleBigQueryConnector._available_schs',
return_value=mock_available_schs,
)

assert (
GoogleBigQueryDataSource(query=',', name='MyGBQ', domain='foo').get_form(
GoogleBigQueryConnector(
Expand Down
Loading

0 comments on commit eca4914

Please sign in to comment.