Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Redshift profiler missing tables [sc-29514] #1017

Merged
merged 2 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions metaphor/postgresql/profile/extractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio
import traceback
from typing import Collection, Iterable, List
from typing import Collection, List

try:
import asyncpg
Expand Down Expand Up @@ -61,26 +61,40 @@ async def extract(self) -> Collection[ENTITY_TYPES]:

await asyncio.gather(*coroutines)

return self._datasets.values()
return [
dataset
for dataset in self._datasets.values()
if self._trim_fields_and_check_empty_dataset(dataset)
]

async def _profile_database(self, database: str) -> None:
pool = await self._create_connection_pool()

async with pool.acquire() as conn:
await self._fetch_tables(conn, database)
datasets = await self._fetch_columns(conn, database)
logger.info(f"Include {len(datasets)} tables from {database}")
logger.info(f"Include {len(datasets)} datasets from {database}")

tasks = [
self._profile_dataset(pool, dataset)
for dataset in datasets
if dataset.schema.sql_schema.materialization != MaterializationType.VIEW
or not self._include_views
if self._filter_dataset_type(dataset)
]
await asyncio.gather(*tasks)
await pool.close()

self._trim_fields(datasets)
def _filter_dataset_type(self, dataset: Dataset) -> bool:
    """
    Decide whether a dataset should be profiled, based on its materialization.

    Plain tables always qualify. Views and materialized views qualify only
    when ``self._include_views`` is set. Every other materialization type
    (e.g. "External", "Stream" and "Snapshot") is never profiled.

    Returns True when the dataset should be profiled, False otherwise.
    """
    profiled_types = {MaterializationType.TABLE}
    if self._include_views:
        # Views are opt-in through the config.
        profiled_types |= {
            MaterializationType.VIEW,
            MaterializationType.MATERIALIZED_VIEW,
        }
    return dataset.schema.sql_schema.materialization in profiled_types

async def _profile_dataset(self, pool: asyncpg.Pool, dataset: Dataset) -> None:
async with pool.acquire() as conn:
Expand Down Expand Up @@ -224,8 +238,14 @@ def _init_dataset(
)

@staticmethod
def _trim_fields(datasets: Iterable[Dataset]) -> None:
"""Drop temporary fields"""
for dataset in datasets:
dataset.schema = None
dataset.statistics = None
def _trim_fields_and_check_empty_dataset(dataset: Dataset) -> bool:
    """
    Report whether the dataset has field statistics, trimming it if so.

    A dataset with no ``field_statistics`` object, or with an empty
    ``field_statistics.field_statistics`` collection, is left untouched and
    False is returned. Otherwise the temporary ``schema`` and ``statistics``
    fields are cleared in place and True is returned.

    Note: this mutates ``dataset`` as a side effect when it returns True.
    """
    profile = dataset.field_statistics
    has_field_statistics = bool(profile) and bool(profile.field_statistics)

    if has_field_statistics:
        # Drop temporary fields that were only needed while profiling.
        dataset.schema = None
        dataset.statistics = None

    return has_field_statistics
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "metaphor-connectors"
version = "0.14.134"
version = "0.14.135"
license = "Apache-2.0"
description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app."
authors = ["Metaphor <[email protected]>"]
Expand Down
34 changes: 34 additions & 0 deletions tests/postgresql/profile/test_extractor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from metaphor.common.base_config import OutputConfig
from metaphor.common.column_statistics import ColumnStatistics
from metaphor.common.sampling import SamplingConfig
from metaphor.models.metadata_change_event import (
Expand All @@ -8,9 +9,12 @@
DatasetSchema,
DatasetStatistics,
FieldStatistics,
MaterializationType,
SchemaField,
SQLSchema,
)
from metaphor.postgresql.profile.extractor import PostgreSQLProfileExtractor
from metaphor.redshift.profile.config import RedshiftProfileRunConfig

column_statistics = ColumnStatistics(unique_count=True, avg_value=True)

Expand All @@ -23,12 +27,42 @@ def init_dataset(name: str, row_count) -> Dataset:

dataset.schema = DatasetSchema()
dataset.schema.fields = []
dataset.schema.sql_schema = SQLSchema()

dataset.statistics = DatasetStatistics()
dataset.statistics.record_count = float(row_count)
return dataset


def test_filter_dataset_type():
    """
    Check which materialization types `_filter_dataset_type` lets through.

    Without `include_views` only plain tables are expected to pass; with it,
    views and materialized views pass as well. External datasets are rejected
    in both configurations.
    """
    table = init_dataset(name="1", row_count=1000)
    table.schema.sql_schema.materialization = MaterializationType.TABLE

    view = init_dataset(name="2", row_count=1000)
    view.schema.sql_schema.materialization = MaterializationType.VIEW

    external = init_dataset(name="3", row_count=1000)
    external.schema.sql_schema.materialization = MaterializationType.EXTERNAL

    # Materialized views are handled explicitly by the filter, so cover them too.
    materialized_view = init_dataset(name="4", row_count=1000)
    materialized_view.schema.sql_schema.materialization = (
        MaterializationType.MATERIALIZED_VIEW
    )

    config = RedshiftProfileRunConfig(
        host="",
        database="",
        user="",
        password="",
        output=OutputConfig(),
    )

    # Config as constructed above, before include_views is switched on:
    # only plain tables are expected to be profiled.
    extractor_filter_view = PostgreSQLProfileExtractor(config)
    assert extractor_filter_view._filter_dataset_type(table)
    assert not extractor_filter_view._filter_dataset_type(view)
    assert not extractor_filter_view._filter_dataset_type(materialized_view)
    assert not extractor_filter_view._filter_dataset_type(external)

    # With views enabled: tables, views and materialized views are profiled.
    # NOTE(review): assumes the extractor reads include_views at construction
    # time, hence the fresh extractor below.
    config.include_views = True
    extractor_include_view = PostgreSQLProfileExtractor(config)
    assert extractor_include_view._filter_dataset_type(table)
    assert extractor_include_view._filter_dataset_type(view)
    assert extractor_include_view._filter_dataset_type(materialized_view)
    assert not extractor_include_view._filter_dataset_type(external)


def test_build_profiling_query():
dataset = init_dataset(name="foo", row_count=1000)
dataset.schema.fields = [
Expand Down
Loading