Skip to content

Commit

Permalink
QuickSight crawler (#985)
Browse files Browse the repository at this point in the history
* Initial implementation

* Remove unused code

* Update poetry

* Bump version

* Update readme

* Update readme

* Update readme

* Improve readability

* Add test

* Update metaphor/quick_sight/extractor.py

Co-authored-by: Tsung-Ju Lii <[email protected]>

* Update metaphor/quick_sight/extractor.py

Co-authored-by: Tsung-Ju Lii <[email protected]>

* Update metaphor/quick_sight/cll.py

Co-authored-by: Tsung-Ju Lii <[email protected]>

* Refine

---------

Co-authored-by: Tsung-Ju Lii <[email protected]>
  • Loading branch information
elic-eon and usefulalgorithm authored Sep 23, 2024
1 parent a036ba5 commit 167ccb7
Show file tree
Hide file tree
Showing 21 changed files with 3,127 additions and 28 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Each connector is placed under its own directory under [metaphor](./metaphor) an
| [postgresql.profile](metaphor/postgresql/profile/) | Data profile |
| [postgresql.usage](metaphor/postgresql/usage/) | Usage |
| [power_bi](metaphor/power_bi/) | Dashboard, lineage |
| [quick_sight](metaphor/quick_sight/) | Dashboard, lineage |
| [redshift](metaphor/redshift/) | Schema, description, statistics, queries |
| [redshift.profile](metaphor/redshift/profile/) | Data profile |
| [sharepoint](metaphor/sharepoint/) | Document embeddings |
Expand Down
7 changes: 3 additions & 4 deletions metaphor/azure_data_factory/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from metaphor.common.entity_id import dataset_normalized_name
from metaphor.common.logger import get_logger
from metaphor.common.snowflake import normalize_snowflake_account
from metaphor.common.utils import removesuffix
from metaphor.models.metadata_change_event import (
DataPlatform,
Expand Down Expand Up @@ -240,11 +241,9 @@ def process_snowflake_linked_service(
query_db = parse_qs(url.query or "").get("db")
database = query_db[0] if query_db else None

# extract snowflake account name from jdbc format, 'snowflake://<account>.snowflakecomputing.com/'
# extract snowflake account name from jdbc format, 'snowflake://<snowflake_host>/'
hostname = urlparse(url.path).hostname
snowflake_account = (
removesuffix(hostname, ".snowflakecomputing.com") if hostname else None
)
snowflake_account = normalize_snowflake_account(hostname) if hostname else None

return LinkedService(database=database, account=snowflake_account)

Expand Down
9 changes: 9 additions & 0 deletions metaphor/common/entity_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ def to_dashboard_entity_id_from_logical_id(logical_id: DashboardLogicalID) -> En
return EntityId(EntityType.DASHBOARD, logical_id)


def to_entity_id_from_virtual_view_logical_id(
    logical_id: VirtualViewLogicalID,
) -> EntityId:
    """Convert a VirtualView logical ID into a VIRTUAL_VIEW entity ID."""
    entity_id = EntityId(EntityType.VIRTUAL_VIEW, logical_id)
    return entity_id


def normalize_full_dataset_name(name: str) -> str:
"""
Normalizes a fully qualified dataset name
Expand Down
14 changes: 9 additions & 5 deletions metaphor/common/snowflake.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
PRIVATE_LINK_SUFFIX = ".privatelink"
SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"


def normalize_snowflake_account(host: str) -> str:
    """
    Normalize different variations of Snowflake account.
    See https://docs.snowflake.com/en/user-guide/admin-account-identifier
    """

    # Account identifiers are case insensitive
    normalized = host.lower()

    # Strip the full host suffix first, then the PrivateLink suffix, so a
    # PrivateLink host such as "acct.privatelink.snowflakecomputing.com"
    # reduces all the way down to "acct".
    for suffix in (SNOWFLAKE_HOST_SUFFIX, PRIVATE_LINK_SUFFIX):
        if normalized.endswith(suffix):
            normalized = normalized[: -len(suffix)]

    return normalized
4 changes: 1 addition & 3 deletions metaphor/fivetran/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,9 +400,7 @@ def get_snowflake_account_from_config(config: dict) -> Optional[str]:
if host is None:
return None

# remove snowflakecomputing.com parts
account = ".".join(host.split(".")[:-2])
return normalize_snowflake_account(account)
return normalize_snowflake_account(host)

@staticmethod
def get_source_account_from_config(
Expand Down
74 changes: 74 additions & 0 deletions metaphor/quick_sight/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# QuickSight Connector

This connector extracts technical metadata from AWS QuickSight using the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) library.

## Setup

We recommend creating a dedicated AWS IAM user for the crawler with limited permissions based on the following IAM policy:

``` json
{
"Version": "2012-10-17",
"Statement":
[
{
"Effect": "Allow",
"Action":
[
"quicksight:DescribeAnalysis",
"quicksight:DescribeDashboard",
"quicksight:DescribeDataSource",
"quicksight:DescribeDataSet",
"quicksight:DescribeDataSetRefreshProperties",
"quicksight:DescribeFolder",
"quicksight:DescribeUser",
"quicksight:ListAnalyses",
"quicksight:ListDashboards",
"quicksight:ListDataSources",
"quicksight:ListDataSets",
"quicksight:ListFolders",
"quicksight:ListUsers"
],
"Resource":
[
"*"
]
}
]
}
```

## Config File

Create a YAML config file based on the following template.

### Required Configurations

You must specify an AWS user credential to access QuickSight API. You can also specify a role ARN and let the connector assume the role before accessing AWS APIs.

```yaml
aws:
access_key_id: <aws_access_key_id>
secret_access_key: <aws_secret_access_key>
region_name: <aws_region_name>
assume_role_arn: <aws_role_arn> # If using IAM role
aws_account_id: <quick_aws_account_id>
```
### Optional Configurations
#### Output Destination
See [Output Config](../common/docs/output.md) for more information.
## Testing
Follow the [Installation](../../README.md) instructions to install `metaphor-connectors` in your environment (or virtualenv).

Run the following command to test the connector locally:

```shell
metaphor quick_sight <config_file>
```

Manually verify the output after the run finishes.
6 changes: 6 additions & 0 deletions metaphor/quick_sight/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from metaphor.common.cli import cli_main
from metaphor.quick_sight.extractor import QuickSightExtractor


def main(config_file: str):
    """Entry point: run the QuickSight connector with the given YAML config file."""
    cli_main(QuickSightExtractor, config_file)
111 changes: 111 additions & 0 deletions metaphor/quick_sight/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import enum
from typing import Dict, List

import boto3
from pydantic.dataclasses import dataclass

from metaphor.common.aws import AwsCredentials
from metaphor.common.logger import json_dump_to_debug_file
from metaphor.quick_sight.models import Dashboard, DataSet, DataSource, ResourceType


def create_quick_sight_client(aws: AwsCredentials) -> boto3.client:
    """Create a boto3 QuickSight client from the given AWS credentials."""
    return aws.get_session().client("quicksight")


class Endpoint(enum.Enum):
    # Names of the boto3 paginators for the QuickSight "list" APIs; the enum
    # value is passed directly to client.get_paginator()
    list_dashboards = "list_dashboards"
    list_data_sets = "list_data_sets"
    list_data_sources = "list_data_sources"


@dataclass
class EndpointDictKeys:
    """Response keys used to pull item IDs out of a paginated list response."""

    # Key of the list of summaries in each response page
    list_key: str
    # Key of the resource ID within each summary entry
    item_key: str


# Per-endpoint response keys: which field of each page holds the summary list,
# and which field of each summary holds the resource ID
ENDPOINT_SETTING = {
    Endpoint.list_data_sets: EndpointDictKeys("DataSetSummaries", "DataSetId"),
    Endpoint.list_dashboards: EndpointDictKeys("DashboardSummaryList", "DashboardId"),
    Endpoint.list_data_sources: EndpointDictKeys("DataSources", "DataSourceId"),
}


class Client:
    """
    Fetches QuickSight resources (data sets, dashboards, and data sources)
    through the QuickSight API and stores the parsed models, keyed by ARN,
    into the shared `resources` dictionary passed to the constructor.
    """

    def __init__(
        self,
        aws: AwsCredentials,
        aws_account_id: str,
        resources: Dict[str, ResourceType],
    ):
        self._client = create_quick_sight_client(aws)
        self._aws_account_id = aws_account_id
        # Shared output map (ARN -> parsed resource), mutated in place
        self._resources = resources

    def get_resources(self):
        """Fetch details for all data sets, dashboards, and data sources."""
        self._get_dataset_detail()
        self._get_dashboard_detail()
        self._get_data_source_detail()

    def _get_resource_ids(self, endpoint: Endpoint) -> List[str]:
        """Return all resource IDs from the given paginated list endpoint."""
        paginator = self._client.get_paginator(endpoint.value)
        paginator_response = paginator.paginate(AwsAccountId=self._aws_account_id)

        ids = []
        settings = ENDPOINT_SETTING[endpoint]
        for page in paginator_response:
            for item in page[settings.list_key]:
                ids.append(item[settings.item_key])
        return ids

    def _fetch_details(
        self,
        endpoint: Endpoint,
        describe_method: str,
        id_key: str,
        response_key: str,
        model,
        debug_file: str,
    ) -> None:
        """
        Shared fetch loop: list the IDs for `endpoint`, call `describe_method`
        for each one, parse the `response_key` payload with `model`, and store
        the result in `self._resources` keyed by ARN.

        Raw API responses are dumped to `debug_file` for troubleshooting.
        """
        describe = getattr(self._client, describe_method)

        results = []
        for resource_id in self._get_resource_ids(endpoint):
            result = describe(
                AwsAccountId=self._aws_account_id, **{id_key: resource_id}
            )
            results.append(result)

            resource = model(**(result[response_key]))

            # A resource without an ARN cannot be keyed; skip it
            if resource.Arn is None:
                continue

            self._resources[resource.Arn] = resource

        json_dump_to_debug_file(results, debug_file)

    def _get_dataset_detail(self) -> None:
        self._fetch_details(
            Endpoint.list_data_sets,
            "describe_data_set",
            "DataSetId",
            "DataSet",
            DataSet,
            "datasets.json",
        )

    def _get_dashboard_detail(self):
        self._fetch_details(
            Endpoint.list_dashboards,
            "describe_dashboard",
            "DashboardId",
            "Dashboard",
            Dashboard,
            "dashboards.json",
        )

    def _get_data_source_detail(self):
        self._fetch_details(
            Endpoint.list_data_sources,
            "describe_data_source",
            "DataSourceId",
            "DataSource",
            DataSource,
            "data_sources.json",
        )
12 changes: 12 additions & 0 deletions metaphor/quick_sight/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pydantic.dataclasses import dataclass

from metaphor.common.aws import AwsCredentials
from metaphor.common.base_config import BaseConfig
from metaphor.common.dataclass import ConnectorConfig


@dataclass(config=ConnectorConfig)
class QuickSightRunConfig(BaseConfig):
    # AWS credentials used to access the QuickSight API
    aws: AwsCredentials

    # ID of the AWS account that owns the QuickSight resources
    aws_account_id: str
105 changes: 105 additions & 0 deletions metaphor/quick_sight/data_source_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from typing import Optional

from metaphor.common.snowflake import normalize_snowflake_account
from metaphor.models.metadata_change_event import DataPlatform
from metaphor.quick_sight.models import (
DataSource,
DataSourceType,
TypeDataSourceParameters,
)

# Maps a QuickSight data source type to the corresponding DataPlatform.
# Types not listed here (e.g. file-based sources) have no platform mapping.
DATA_SOURCE_PLATFORM_MAP = {
    DataSourceType.AURORA: DataPlatform.MYSQL,
    DataSourceType.AURORA_POSTGRESQL: DataPlatform.POSTGRESQL,
    DataSourceType.BIGQUERY: DataPlatform.BIGQUERY,
    DataSourceType.DATABRICKS: DataPlatform.UNITY_CATALOG,
    DataSourceType.MARIADB: DataPlatform.MYSQL,
    DataSourceType.MYSQL: DataPlatform.MYSQL,
    DataSourceType.POSTGRESQL: DataPlatform.POSTGRESQL,
    DataSourceType.REDSHIFT: DataPlatform.REDSHIFT,
    DataSourceType.ORACLE: DataPlatform.ORACLE,
    DataSourceType.SNOWFLAKE: DataPlatform.SNOWFLAKE,
    DataSourceType.SQLSERVER: DataPlatform.MSSQL,
}


def _get_database_from_parameters(parameters: TypeDataSourceParameters):
    """
    Return the database name (or BigQuery project ID) from whichever
    parameter set is populated, or None if none of them are.
    """

    # (field on TypeDataSourceParameters, attribute holding the database),
    # checked in a fixed order
    fields = (
        ("AuroraParameters", "Database"),
        ("AuroraPostgreSqlParameters", "Database"),
        ("BigQueryParameters", "ProjectId"),
        ("MariaDbParameters", "Database"),
        ("MySqlParameters", "Database"),
        ("OracleParameters", "Database"),
        ("PostgreSqlParameters", "Database"),
        ("RdsParameters", "Database"),
        ("RedshiftParameters", "Database"),
        ("SnowflakeParameters", "Database"),
        ("SqlServerParameters", "Database"),
    )

    for field_name, attr_name in fields:
        params = getattr(parameters, field_name)
        if params:
            return getattr(params, attr_name)

    return None


def get_database(data_source: DataSource) -> Optional[str]:
    """
    Extract the database name from the DataSource parameters, if any.
    """
    parameters = data_source.DataSourceParameters
    return None if parameters is None else _get_database_from_parameters(parameters)


def get_account(data_source: DataSource) -> Optional[str]:
    """
    Extract the account/host from the DataSource parameters, if any.
    """
    parameters = data_source.DataSourceParameters
    if parameters is None:
        return None

    # Platforms whose parameter object exposes the account directly via Host
    for field_name in (
        "AuroraParameters",
        "MariaDbParameters",
        "MySqlParameters",
        "OracleParameters",
    ):
        params = getattr(parameters, field_name)
        if params:
            return params.Host

    # Snowflake hosts are normalized into account identifiers
    snowflake = parameters.SnowflakeParameters
    if snowflake:
        if snowflake.Host:
            return normalize_snowflake_account(snowflake.Host)
        return None

    sql_server = parameters.SqlServerParameters
    if sql_server:
        return sql_server.Host

    return None
Loading

0 comments on commit 167ccb7

Please sign in to comment.