
Merge branch 'master' into feat(ingestion/neo4j)
# Conflicts:
#	datahub-web-react/src/app/ingest/source/builder/sources.json
keith-fullsight committed Nov 29, 2024
2 parents 01dfc26 + 94f104d commit bf888c0
Showing 56 changed files with 1,875 additions and 1,243 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-test.yml
@@ -83,6 +83,7 @@ jobs:
- uses: gradle/actions/setup-gradle@v3
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
# datahub-schematron:cli excluded due to dependency on metadata-ingestion
run: |
./gradlew build \
-x :metadata-ingestion:build \
@@ -100,6 +101,7 @@ jobs:
-x :metadata-ingestion-modules:gx-plugin:check \
-x :datahub-frontend:build \
-x :datahub-web-react:build \
-x :metadata-integration:java:datahub-schematron:cli:test \
--parallel
- name: Gradle build (and test) for frontend
if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
1 change: 1 addition & 0 deletions .github/workflows/check-datahub-jars.yml
@@ -40,4 +40,5 @@ jobs:
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar
./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc
10 changes: 9 additions & 1 deletion build.gradle
@@ -48,6 +48,7 @@ buildscript {
// see also datahub-frontend/play.gradle
ext.playVersion = '2.8.22'
ext.playScalaVersion = '2.13'
ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
ext.log4jVersion = '2.23.1'
ext.slf4jVersion = '1.7.36'
ext.logbackClassic = '1.4.14'
@@ -105,7 +106,14 @@ project.ext.spec = [
]

project.ext.externalDependency = [
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10",
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
7 changes: 7 additions & 0 deletions datahub-frontend/play.gradle
@@ -55,6 +55,13 @@ dependencies {
implementation externalDependency.antlr4Runtime
implementation externalDependency.antlr4
implementation externalDependency.akkaHttp
implementation externalDependency.akkaActor
implementation externalDependency.akkaStream
implementation externalDependency.akkaActorTyped
implementation externalDependency.akkaSlf4j
implementation externalDependency.akkaJackson
implementation externalDependency.akkaParsing
implementation externalDependency.akkaProtobuf

implementation externalDependency.jerseyCore
implementation externalDependency.jerseyGuava
8 changes: 8 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -318,6 +318,14 @@
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
},
{
"urn": "urn:li:dataPlatform:iceberg",
"name": "iceberg",
"displayName": "Iceberg",
"description": "Ingest databases and tables from any Iceberg catalog implementation",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg",
"recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n"
},
{
"urn": "urn:li:dataPlatform:neo4j",
"name": "neo4j",
37 changes: 37 additions & 0 deletions docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';

<FeatureAvailability saasOnly />

> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative.

## Introduction

Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH
- Automatically Add DataHub Tags to Snowflake Tables and Columns
- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub

## Prerequisites

### Permissions Required for Tag Management

- `CREATE TAG`: Required to create new tags in Snowflake.
Ensure the user or role has this privilege on the specific schema or database where tags will be created.
- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
This permission must be granted at the database, schema, or object level depending on the scope.


### Permissions Required for Object Access

- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.

### Example Permission Grant Statements

To grant the necessary permissions for a specific role (DATAHUB_AUTOMATION_ROLE), you can use the following SQL commands:

```sql
-- Tag management permissions
GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Object access for metadata operations
GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Future privileges for tagging
GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
```


## Enabling Snowflake Tag Sync

1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
24 changes: 22 additions & 2 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date

Recommended CLI/SDK
---
- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12

If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.

@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
## Release Changelog
---

### v0.3.7.4

- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
- [Automations] Filter out self-nodes in glossary term propagation
- [Remote Executor] Allow dashes in executor ids.
- [Search] Fix Nested Filter Counts in Primary Search
- [Search] Fix white screen of death on empty search result
- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
- [Logo] Fix fit of custom logo for V2 UI nav bar.
- [Structured Properties] Better handling for special characters in structured properties
- [Lineage] Improvements to handling lineage cycles
- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
- [Columns Tab] Properly display nullable status in the column sidebar (bug fix)
- [Columns Tab] Fixing merging of sibling schemas between V2 and V1 field paths.
- [Documentation] Support group authors for institutional memory aspect


### v0.3.7

- All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
- Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141

@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- Improved UX for setting up and managing SSO

- Ingestion changes
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
- PowerBI: Support for PowerBI Apps and cross-workspace lineage
- Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
- Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
16 changes: 11 additions & 5 deletions docs/what/urn.md
@@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating a URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``
The following characters are not allowed anywhere in a URN:

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `` (U+241F)

The following characters are not allowed within a URN tuple:

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid URN, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating URNs. One approach is to use URL encoding for the reserved characters.
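
For illustration, here is a minimal Python sketch of the URL-encoding approach; the helper name and example values are hypothetical, not part of DataHub's API:

```python
from urllib.parse import quote


def encode_urn_part(value: str) -> str:
    """Percent-encode a value before embedding it in a URN tuple.

    Reserved characters such as ',', '(' and ')' are encoded so they no
    longer act as URN delimiters.
    """
    return quote(value, safe="")


# A dashboard id containing a comma would otherwise break the URN tuple.
dashboard_id = "dashboards.the,look"
urn = f"urn:li:dashboard:(looker,{encode_urn_part(dashboard_id)})"
print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
```
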
6 changes: 4 additions & 2 deletions metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

## Troubleshooting

### [Common Issue]
### Exceptions while increasing `processing_threads`

[Provide description of common issues with this integration and steps to resolve]
Each processing thread opens several files/sockets to download manifest files from blob storage. If you see
exceptions after increasing the `processing_threads` configuration parameter, try raising the limit on open
files (e.g. using `ulimit` on Linux).
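
As a rough illustration (not part of this source's code; the approach and values here are assumptions), the open-file limit can also be inspected and raised from Python before launching an ingestion run:

```python
import resource

# Check the current soft/hard limits on open file descriptors (Linux/macOS only).
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")

# Raise the soft limit toward the hard limit before running an ingestion with a
# high `processing_threads` value; this mirrors `ulimit -n` in a shell session.
if hard != resource.RLIM_INFINITY and soft < hard:
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
```
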
9 changes: 5 additions & 4 deletions metadata-ingestion/setup.py
@@ -14,8 +14,8 @@
)

base_requirements = {
# Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict.
"typing_extensions>=3.7.4.3",
# Our min version of typing_extensions is somewhat constrained by Airflow.
"typing_extensions>=3.10.0.2",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
@@ -249,7 +249,8 @@

iceberg_common = {
# Iceberg Python SDK
"pyiceberg>=0.4,<0.7",
# Kept at 0.4.0 due to higher versions requiring pydantic>2; as soon as we are fine with that, bump this dependency
"pyiceberg>=0.4.0",
}

mssql_common = {
Expand Down Expand Up @@ -775,7 +776,7 @@
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
"starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
"nifi = datahub.ingestion.source.nifi:NifiSource",
"powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource",
"powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource",
"powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource",
"iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource",
"vertica = datahub.ingestion.source.sql.vertica:VerticaSource",
2 changes: 2 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -67,6 +67,7 @@
SystemMetadataClass,
TelemetryClientIdClass,
)
from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.str_enum import StrEnum
from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
graph_config = config_utils.load_client_config()
graph = DataHubGraph(graph_config)
graph.test_connection()
telemetry_instance.set_context(server=graph)
return graph
9 changes: 5 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -44,7 +44,8 @@
)
from datahub.ingestion.transformer.transform_registry import transform_registry
from datahub.metadata.schema_classes import MetadataChangeProposalClass
from datahub.telemetry import stats, telemetry
from datahub.telemetry import stats
from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities._custom_package_loader import model_version_name
from datahub.utilities.global_warning_util import (
clear_global_warnings,
@@ -273,8 +274,9 @@ def __init__(
if self.graph is None and isinstance(self.sink, DatahubRestSink):
with _add_init_error_context("setup default datahub client"):
self.graph = self.sink.emitter.to_graph()
self.graph.test_connection()
self.ctx.graph = self.graph
telemetry.telemetry_instance.update_capture_exception_context(server=self.graph)
telemetry_instance.set_context(server=self.graph)

with set_graph_context(self.graph):
with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ def log_ingestion_stats(self) -> None:
sink_warnings = len(self.sink.get_report().warnings)
global_warnings = len(get_global_warnings())

telemetry.telemetry_instance.ping(
telemetry_instance.ping(
"ingest_stats",
{
"source_type": self.source_type,
Expand All @@ -637,7 +639,6 @@ def log_ingestion_stats(self) -> None:
),
"has_pipeline_name": bool(self.config.pipeline_name),
},
self.ctx.graph,
)

def _approx_all_vals(self, d: LossyList[Any]) -> int:
68 changes: 7 additions & 61 deletions metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -53,19 +53,7 @@
make_assertion_from_test,
make_assertion_result_from_test,
)
from datahub.ingestion.source.sql.sql_types import (
ATHENA_SQL_TYPES_MAP,
BIGQUERY_TYPES_MAP,
POSTGRES_TYPES_MAP,
SNOWFLAKE_TYPES_MAP,
SPARK_SQL_TYPES_MAP,
TRINO_SQL_TYPES_MAP,
VERTICA_SQL_TYPES_MAP,
resolve_athena_modified_type,
resolve_postgres_modified_type,
resolve_trino_modified_type,
resolve_vertica_modified_type,
)
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
StaleEntityRemovalSourceReport,
@@ -89,17 +77,11 @@
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
BooleanTypeClass,
DateTypeClass,
MySqlDDL,
NullTypeClass,
NumberTypeClass,
RecordType,
SchemaField,
SchemaFieldDataType,
SchemaMetadata,
StringTypeClass,
TimeTypeClass,
)
from datahub.metadata.schema_classes import (
DataPlatformInstanceClass,
@@ -804,28 +786,6 @@ def make_mapping_upstream_lineage(
)


# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py
_field_type_mapping = {
"boolean": BooleanTypeClass,
"date": DateTypeClass,
"time": TimeTypeClass,
"numeric": NumberTypeClass,
"text": StringTypeClass,
"timestamp with time zone": DateTypeClass,
"timestamp without time zone": DateTypeClass,
"integer": NumberTypeClass,
"float8": NumberTypeClass,
"struct": RecordType,
**POSTGRES_TYPES_MAP,
**SNOWFLAKE_TYPES_MAP,
**BIGQUERY_TYPES_MAP,
**SPARK_SQL_TYPES_MAP,
**TRINO_SQL_TYPES_MAP,
**ATHENA_SQL_TYPES_MAP,
**VERTICA_SQL_TYPES_MAP,
}


def get_column_type(
report: DBTSourceReport,
dataset_name: str,
@@ -835,24 +795,10 @@ def get_column_type(
"""
Maps known DBT types to datahub types
"""
TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None

if TypeClass is None and column_type:
# resolve a modified type
if dbt_adapter == "trino":
TypeClass = resolve_trino_modified_type(column_type)
elif dbt_adapter == "athena":
TypeClass = resolve_athena_modified_type(column_type)
elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
# Redshift uses a variant of Postgres, so we can use the same logic.
TypeClass = resolve_postgres_modified_type(column_type)
elif dbt_adapter == "vertica":
TypeClass = resolve_vertica_modified_type(column_type)
elif dbt_adapter == "snowflake":
# Snowflake types are uppercase, so we check that.
TypeClass = _field_type_mapping.get(column_type.upper())

# if still not found, report the warning

TypeClass = resolve_sql_type(column_type, dbt_adapter)

# if still not found, report a warning
if TypeClass is None:
if column_type:
report.info(
Expand All @@ -861,9 +807,9 @@ def get_column_type(
context=f"{dataset_name} - {column_type}",
log=False,
)
TypeClass = NullTypeClass
TypeClass = NullTypeClass()

return SchemaFieldDataType(type=TypeClass())
return SchemaFieldDataType(type=TypeClass)


@platform_name("dbt")