Skip to content

Commit

Permalink
[sc-25545] Exclude personal space dashboards for tableau (#813)
Browse files Browse the repository at this point in the history
* [sc-25545] Exclude personal space dashboards for tableau

* bump version

* bump version
  • Loading branch information
usefulalgorithm authored Apr 2, 2024
1 parent 2cc4fff commit 1c75071
Show file tree
Hide file tree
Showing 14 changed files with 373 additions and 179 deletions.
22 changes: 16 additions & 6 deletions metaphor/tableau/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,20 +95,30 @@ disable_preview_image: true

#### Excluding Projects

You can specify the project to be ignored by the connector. By default the project `Personal Space` is ignored. To specify more ignored projects, add the following configuration:
You can specify which projects are included or excluded by the connector. By default, the `Personal Space` project is ignored.

To override the default behavior and include `Personal Space` in the connector, use the following configuration:

```yaml
exclude_extra_projects:
- <excluded_project_name_1>
- <excluded_project_name_2>
include_personal_space: True
```

To override the default behavior and include `Personal Space` in the connector, use the following configuration:
To specify the projects to include / exclude, use the following field:

```yaml
include_personal_space: True
projects_filter:
includes:
- project_id_1
- project_id_2
...
excludes:
- project_id_1
- project_id_2
...
```

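To include only specific projects, use the `includes` field. To exclude certain projects, use the `excludes` field. When both are set, a project is extracted only if it is listed in `includes` and not listed in `excludes`.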

## Testing

Follow the [Installation](../../README.md) instructions to install `metaphor-connectors` in your environment (or virtualenv). Make sure to include either `all` or `tableau` extra.
Expand Down
25 changes: 16 additions & 9 deletions metaphor/tableau/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import dataclasses
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional

from pydantic import model_validator
from pydantic.dataclasses import dataclass
Expand Down Expand Up @@ -27,6 +27,19 @@ class TableauPasswordAuthConfig:
password: str


@dataclass(config=ConnectorConfig)
class TableauProjectConfig:
    """Include / exclude lists used to decide which Tableau projects to process."""

    # Project IDs to keep; an empty list places no restriction.
    includes: List[str] = dataclasses.field(default_factory=list)
    # Project IDs to drop; an empty list drops nothing.
    excludes: List[str] = dataclasses.field(default_factory=list)

    def include_project(self, project_id: str):
        """Return True when ``project_id`` passes both the include and exclude lists.

        A project passes when ``includes`` is empty or contains it, and
        ``excludes`` does not contain it.
        """
        permitted = not self.includes or project_id in self.includes
        # Short-circuits so the exclude list is only consulted for permitted projects.
        return permitted and project_id not in self.excludes


@dataclass(config=ConnectorConfig)
class TableauRunConfig(BaseConfig):
server_url: str
Expand All @@ -44,20 +57,14 @@ class TableauRunConfig(BaseConfig):
default_factory=dict
)

exclude_extra_projects: List[str] = dataclasses.field(default_factory=list)
include_personal_space: bool = False

projects_filter: TableauProjectConfig = TableauProjectConfig()

# whether to disable Chart preview image
disable_preview_image: bool = False

@model_validator(mode="after")
def have_access_token_or_user_password(self):
must_set_exactly_one(self.__dict__, ["access_token", "user_password"])
return self

@property
def excluded_projects(self) -> Set[str]:
return set(
self.exclude_extra_projects
+ ([] if self.include_personal_space else [PERSONAL_SPACE_PROJECT_NAME])
)
119 changes: 57 additions & 62 deletions metaphor/tableau/extractor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import base64
import re
import traceback
from dataclasses import dataclass
from typing import Collection, Dict, List, Optional, Set, Tuple, Union

from metaphor.tableau.workbook import Workbook, get_all_workbooks

try:
import tableauserverclient as tableau
except ImportError:
Expand Down Expand Up @@ -45,15 +46,13 @@
VirtualViewLogicalID,
VirtualViewType,
)
from metaphor.tableau.config import TableauRunConfig
from metaphor.tableau.graphql_utils import paginate_connection
from metaphor.tableau.config import PERSONAL_SPACE_PROJECT_NAME, TableauRunConfig
from metaphor.tableau.graphql_utils import fetch_custom_sql_tables
from metaphor.tableau.query import (
CustomSqlTable,
DatabaseTable,
WorkbookQueryResponse,
connection_type_map,
custom_sql_graphql_query,
workbooks_graphql_query,
)

logger = get_logger()
Expand Down Expand Up @@ -94,7 +93,8 @@ def __init__(self, config: TableauRunConfig):
self._snowflake_account = config.snowflake_account
self._bigquery_project_name_to_id_map = config.bigquery_project_name_to_id_map
self._disable_preview_image = config.disable_preview_image
self._excluded_projects = config.excluded_projects
self._include_personal_space = config.include_personal_space
self._projects_filter = config.projects_filter

self._views: Dict[str, tableau.ViewItem] = {}
self._projects: Dict[str, List[str]] = {} # project id -> project hierarchy
Expand Down Expand Up @@ -136,16 +136,19 @@ async def extract(self) -> Collection[ENTITY_TYPES]:

server = tableau.Server(self._server_url, use_server_version=True)
with server.auth.sign_in(tableau_auth):
self._extract_dashboards(server)
self._extract_datasources(server)
workbooks = get_all_workbooks(server)
self._extract_dashboards(server, workbooks)
self._extract_datasources(server, workbooks)

return [
*self._dashboards.values(),
*self._virtual_views.values(),
*self._datasets.values(),
]

def _extract_dashboards(self, server: tableau.Server) -> None:
def _extract_dashboards(
self, server: tableau.Server, workbooks: List[Workbook]
) -> None:
# fetch all projects
projects: List[tableau.ProjectItem] = list(tableau.Pager(server.projects))
json_dump_to_debug_file([w.__dict__ for w in projects], "projects.json")
Expand All @@ -168,64 +171,47 @@ def _extract_dashboards(self, server: tableau.Server) -> None:
continue
self._views[view.id] = view

# fetch all workbooks
workbooks: List[tableau.WorkbookItem] = list(tableau.Pager(server.workbooks))
json_dump_to_debug_file([w.__dict__ for w in workbooks], "workbooks.json")
logger.info(
f"\nThere are {len(workbooks)} workbooks on site: {[workbook.name for workbook in workbooks]}"
)
for workbook in workbooks:
if workbook.id is not None and workbook.project_id is not None:
self._workbook_project[workbook.id] = workbook.project_id
self._workbook_project[workbook.id] = str(workbook.project_id)

server.workbooks.populate_views(workbook, usage=True)
server.workbooks.populate_views(workbook.rest_item, usage=True)

try:
self._parse_dashboard(
workbook, self._get_system_contacts(server, workbook.owner_id)
workbook,
self._get_system_contacts(server, workbook.rest_item.owner_id),
)
except Exception as error:
traceback.print_exc()
logger.error(f"failed to parse workbook {workbook.name}, error {error}")

def _extract_datasources(self, server: tableau.Server) -> None:
# fetch custom SQL tables from Metadata GraphQL API
custom_sql_tables = paginate_connection(
server, custom_sql_graphql_query, "customSQLTablesConnection"
)
except Exception:
logger.exception(f"failed to parse workbook {workbook.rest_item.name}")

json_dump_to_debug_file(custom_sql_tables, "graphql_custom_sql_tables.json")
logger.info(f"Found {len(custom_sql_tables)} custom SQL tables.")
def _extract_datasources(
self, server: tableau.Server, workbooks: List[Workbook]
) -> None:
custom_sql_tables = fetch_custom_sql_tables(server)

# mapping of datasource to (query, list of upstream dataset IDs)
datasource_upstream_datasets = {}
for item in custom_sql_tables:
custom_sql_table = CustomSqlTable.model_validate(item)
datasource_upstream_datasets.update(
self._parse_custom_sql_table(custom_sql_table)
)

# fetch workbook related info from Metadata GraphQL API
workbooks = paginate_connection(
server, workbooks_graphql_query, "workbooksConnection"
)
json_dump_to_debug_file(workbooks, "graphql_workbooks.json")
logger.info(f"Found {len(workbooks)} workbooks.")
datasource_upstream_datasets = {
datasource_id: custom_sql_source
for table in custom_sql_tables
for datasource_id, custom_sql_source in self._parse_custom_sql_table(
table
).items()
}

for item in workbooks:
for workbook in workbooks:
try:
workbook = WorkbookQueryResponse.model_validate(item)
if workbook.projectName in self._excluded_projects:
if not self._should_include_workbook(workbook):
logger.info(
f"Ignoring datasources from workbook in excluded project: {workbook.projectName}"
f"Ignoring datasources from workbook in excluded project: {workbook.project_name}, workbook id = {workbook.id}"
)
continue
self._parse_workbook_query_response(
server, workbook, datasource_upstream_datasets
server, workbook.graphql_item, datasource_upstream_datasets
)
except Exception as error:
except Exception:
logger.exception(
f"failed to parse workbook {item['vizportalUrlId']}, error {error}"
f"failed to parse workbook {workbook.graphql_item.vizportalUrlId}"
)

def _get_system_contacts(
Expand Down Expand Up @@ -286,31 +272,32 @@ def _build_asset_full_name_and_structure(
return full_name, structure

def _parse_dashboard(
self, workbook: tableau.WorkbookItem, system_contacts: Optional[SystemContacts]
self, workbook: Workbook, system_contacts: Optional[SystemContacts]
) -> None:
if not workbook.webpage_url:
logger.exception(f"workbook {workbook.name} missing webpage_url")
return

if workbook.project_name in self._excluded_projects:
if not self._should_include_workbook(workbook):
logger.info(
f"Ignoring dashboard from workbook in excluded project: {workbook.project_name}"
f"Ignoring dashboard from workbook in excluded project: {workbook.project_name}, workbook id = {workbook.id}"
)
return

workbook_id = TableauExtractor._extract_workbook_id(workbook.webpage_url)
rest_workbook = workbook.rest_item
if not rest_workbook.webpage_url:
logger.exception(f"workbook {rest_workbook.name} missing webpage_url")
return

workbook_id = TableauExtractor._extract_workbook_id(rest_workbook.webpage_url)

views: List[tableau.ViewItem] = workbook.views
views: List[tableau.ViewItem] = rest_workbook.views
charts = [self._parse_chart(self._views[view.id]) for view in views if view.id]
total_views = sum([view.total_views for view in views])

full_name, structure = self._build_asset_full_name_and_structure(
workbook.name, workbook.project_id, workbook.project_name
rest_workbook.name, rest_workbook.project_id, rest_workbook.project_name
)

dashboard_info = DashboardInfo(
title=full_name,
description=workbook.description,
description=rest_workbook.description,
charts=charts,
view_count=float(total_views),
)
Expand All @@ -331,11 +318,11 @@ def _parse_dashboard(

self._dashboards[workbook_id] = dashboard

if workbook.tags:
if rest_workbook.tags:
dashboard.system_tags = SystemTags(
tags=[
SystemTag(value=tag, system_tag_source=SystemTagSource.TABLEAU)
for tag in sorted(workbook.tags)
for tag in sorted(rest_workbook.tags)
]
)

Expand Down Expand Up @@ -681,3 +668,11 @@ def _build_view_url(self, content_url: Optional[str]) -> Optional[str]:
@staticmethod
def _build_preview_data_url(preview: bytes) -> str:
return f"data:image/png;base64,{base64.b64encode(preview).decode('ascii')}"

def _should_include_workbook(self, workbook: Workbook) -> bool:
    """Decide whether a workbook's project is eligible for extraction.

    Workbooks in the personal space project are rejected unless
    ``include_personal_space`` was enabled; every other workbook is passed
    to the configured projects filter, keyed by its project id.
    """
    skip_personal_space = not self._include_personal_space
    if skip_personal_space and workbook.project_name == PERSONAL_SPACE_PROJECT_NAME:
        return False

    project_allowed = self._projects_filter.include_project(workbook.project_id)
    return project_allowed
31 changes: 29 additions & 2 deletions metaphor/tableau/graphql_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

import tableauserverclient as tableau

from metaphor.common.logger import get_logger
from metaphor.common.logger import get_logger, json_dump_to_debug_file
from metaphor.tableau.query import (
CustomSqlTable,
WorkbookQueryResponse,
custom_sql_graphql_query,
workbooks_graphql_query,
)

logger = get_logger()


def paginate_connection(
def _paginate_connection(
server: tableau.Server, query: str, connection_name: str, batch_size=50
) -> List[Dict]:
"""Return all the nodes from GraphQL connection through pagination"""
Expand All @@ -28,3 +34,24 @@ def paginate_connection(
return result

offset += batch_size


def fetch_workbooks(server: tableau.Server, batch_size: int = 50):
    """Fetch all workbooks from the Metadata GraphQL API as validated models.

    The raw nodes are dumped to ``graphql_workbooks.json`` for debugging
    before each one is validated into a ``WorkbookQueryResponse``.
    """
    nodes = _paginate_connection(
        server,
        workbooks_graphql_query,
        "workbooksConnection",
        batch_size,
    )
    json_dump_to_debug_file(nodes, "graphql_workbooks.json")
    logger.info(f"Found {len(nodes)} workbooks.")

    parsed = []
    for node in nodes:
        parsed.append(WorkbookQueryResponse.model_validate(node))
    return parsed


def fetch_custom_sql_tables(server: tableau.Server, batch_size: int = 50):
    """Fetch all custom SQL tables from the Metadata GraphQL API as validated models.

    The raw nodes are dumped to ``graphql_custom_sql_tables.json`` for
    debugging before each one is validated into a ``CustomSqlTable``.
    """
    nodes = _paginate_connection(
        server,
        custom_sql_graphql_query,
        "customSQLTablesConnection",
        batch_size,
    )
    json_dump_to_debug_file(nodes, "graphql_custom_sql_tables.json")
    logger.info(f"Found {len(nodes)} custom SQL tables.")

    parsed = []
    for node in nodes:
        parsed.append(CustomSqlTable.model_validate(node))
    return parsed
2 changes: 2 additions & 0 deletions metaphor/tableau/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
luid
name
projectName
projectVizportalUrlId
vizportalUrlId
upstreamDatasources {
id
Expand Down Expand Up @@ -124,6 +125,7 @@ class WorkbookQueryResponse(BaseModel):
luid: str
name: str
projectName: str
projectVizportalUrlId: str
vizportalUrlId: str
upstreamDatasources: List[PublishedDatasource]
embeddedDatasources: List[EmbeddedDatasource]
Expand Down
Loading

0 comments on commit 1c75071

Please sign in to comment.