From 8d9bc1d87d6616d187b0bb5ab1bda453be477cac Mon Sep 17 00:00:00 2001 From: Xuan <65048031+xuan616@users.noreply.github.com> Date: Thu, 5 Jan 2023 21:42:28 -0800 Subject: [PATCH] feat: add table/column lineage models (#21) add table/column lineage models --- README.md | 7 +-- .../aa30b4276b9b_add_table_column_lineage.py | 44 +++++++++++++++++++ amundsen_rds/models/__init__.py | 12 ++--- amundsen_rds/models/column.py | 16 +++++++ amundsen_rds/models/table.py | 16 +++++++ setup.py | 2 +- 6 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 amundsen_rds/migrations/versions/aa30b4276b9b_add_table_column_lineage.py diff --git a/README.md b/README.md index 731d161..2cacd0b 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,10 @@ The schema in ORM models follows the logic of [databuilder models](https://githu Amundsenrds will be used in [databuilder](https://github.com/amundsen-io/amundsendatabuilder) and [metadatalibrary](https://github.com/amundsen-io/amundsenmetadatalibrary) for metadata storage and retrieval with relational databases. ## Requirements -- Python >= 3.6.x -- MySQL >= 5.7 - +- Python: >= 3.6 +- MySQL: 5.7, 8 + +**Note**: amundsen-rds(version >= 0.0.8) comes with SQLAlchemy ORM features supported only in MySQL 8 in the correlated amundsen [metadata-service](https://github.com/amundsen-io/amundsen/tree/main/metadata). ## Instructions to configure venv - In the terminal window, change directory to [amundsen-rds](https://github.com/amundsen-io/amundsenrds]). ``` diff --git a/amundsen_rds/migrations/versions/aa30b4276b9b_add_table_column_lineage.py b/amundsen_rds/migrations/versions/aa30b4276b9b_add_table_column_lineage.py new file mode 100644 index 0000000..ea64ff2 --- /dev/null +++ b/amundsen_rds/migrations/versions/aa30b4276b9b_add_table_column_lineage.py @@ -0,0 +1,44 @@ +"""add table/column lineage + +Revision ID: aa30b4276b9b +Revises: a539c998cc1e +Create Date: 2023-01-03 14:48:38.280768 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + +# revision identifiers, used by Alembic. +revision = 'aa30b4276b9b' +down_revision = 'a539c998cc1e' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # add table/column lineage tables + op.create_table('table_lineage', + sa.Column('table_source_rk', sa.String(length=1024, collation='latin1_general_cs'), nullable=False), + sa.Column('table_target_rk', sa.String(length=1024, collation='latin1_general_cs'), nullable=False), + sa.Column('published_tag', sa.String(length=128), nullable=False), + sa.Column('publisher_last_updated_epoch_ms', sa.BigInteger(), nullable=False), + sa.ForeignKeyConstraint(['table_source_rk'], ['table_metadata.rk'], ondelete='cascade'), + sa.ForeignKeyConstraint(['table_target_rk'], ['table_metadata.rk'], ondelete='cascade'), + sa.PrimaryKeyConstraint('table_source_rk', 'table_target_rk') + ) + op.create_table('column_lineage', + sa.Column('column_source_rk', sa.String(length=1024, collation='latin1_general_cs'), nullable=False), + sa.Column('column_target_rk', sa.String(length=1024, collation='latin1_general_cs'), nullable=False), + sa.Column('published_tag', sa.String(length=128), nullable=False), + sa.Column('publisher_last_updated_epoch_ms', sa.BigInteger(), nullable=False), + sa.ForeignKeyConstraint(['column_source_rk'], ['column_metadata.rk'], ondelete='cascade'), + sa.ForeignKeyConstraint(['column_target_rk'], ['column_metadata.rk'], ondelete='cascade'), + sa.PrimaryKeyConstraint('column_source_rk', 'column_target_rk') + ) + + +def downgrade() -> None: + # drop table/lineage tables + op.drop_table('column_lineage') + op.drop_table('table_lineage') diff --git a/amundsen_rds/models/__init__.py b/amundsen_rds/models/__init__.py index ba8647c..440afe0 100644 --- a/amundsen_rds/models/__init__.py +++ b/amundsen_rds/models/__init__.py @@ -7,7 +7,7 @@ from amundsen_rds.models.badge import Badge from amundsen_rds.models.cluster import Cluster from amundsen_rds.models.column import ( - ColumnBadge, ColumnDescription, ColumnStat, TableColumn + ColumnBadge, ColumnDescription, ColumnLineage, ColumnStat, TableColumn ) from amundsen_rds.models.dashboard import ( Dashboard, DashboardBadge, DashboardChart, DashboardCluster, @@ -20,22 +20,22 @@ Schema, SchemaDescription, SchemaProgrammaticDescription ) from amundsen_rds.models.table import ( - Table, TableBadge, TableDescription, TableFollower, TableOwner, - TableProgrammaticDescription, TableSource, TableTag, TableTimestamp, - TableUsage, TableWatermark + Table, TableBadge, TableDescription, TableFollower, TableLineage, + TableOwner, TableProgrammaticDescription, TableSource, TableTag, + TableTimestamp, TableUsage, TableWatermark ) from amundsen_rds.models.tag import Tag from amundsen_rds.models.updated_timestamp import UpdatedTimestamp from amundsen_rds.models.user import User RDSModel = Union[Application, ApplicationTable, Badge, Cluster, - TableColumn, ColumnBadge, ColumnDescription, ColumnStat, + TableColumn, ColumnBadge, ColumnDescription, ColumnLineage, ColumnStat, Dashboard, DashboardBadge, DashboardChart, DashboardCluster, DashboardDescription, DashboardExecution, DashboardFollower, DashboardGroup, DashboardGroupDescription, DashboardOwner, DashboardQuery, DashboardTable, DashboardTag, DashboardTimestamp, DashboardUsage, Database, Schema, SchemaDescription, SchemaProgrammaticDescription, Table, - TableBadge, TableDescription, TableFollower, TableOwner, + TableBadge, TableDescription, TableFollower, TableLineage, TableOwner, TableProgrammaticDescription, TableSource, TableTag, TableTimestamp, TableUsage, TableWatermark, Tag, UpdatedTimestamp, User] diff --git a/amundsen_rds/models/column.py b/amundsen_rds/models/column.py index a145d87..b37675b 100644 --- a/amundsen_rds/models/column.py +++ b/amundsen_rds/models/column.py @@ -78,3 +78,19 @@ class ColumnStat(Base): nullable=False) published_tag = Column(String(PUBLISHED_TAG_LEN), nullable=False) publisher_last_updated_epoch_ms = Column(BigInteger, nullable=False) + + +class ColumnLineage(Base): + """ + Column lineage model. + """ + __tablename__ = 'column_lineage' + + column_source_rk = Column(String(KEY_LEN, **INDEX_KEY_COLLATION_ARGS), + ForeignKey('column_metadata.rk', ondelete='cascade'), + primary_key=True) + column_target_rk = Column(String(KEY_LEN, **INDEX_KEY_COLLATION_ARGS), + ForeignKey('column_metadata.rk', ondelete='cascade'), + primary_key=True) + published_tag = Column(String(PUBLISHED_TAG_LEN), nullable=False) + publisher_last_updated_epoch_ms = Column(BigInteger, nullable=False) diff --git a/amundsen_rds/models/table.py b/amundsen_rds/models/table.py index eb2f725..652f03b 100644 --- a/amundsen_rds/models/table.py +++ b/amundsen_rds/models/table.py @@ -204,3 +204,19 @@ class TableWatermark(Base): nullable=False) published_tag = Column(String(PUBLISHED_TAG_LEN), nullable=False) publisher_last_updated_epoch_ms = Column(BigInteger, nullable=False) + + +class TableLineage(Base): + """ + Table lineage model. + """ + __tablename__ = 'table_lineage' + + table_source_rk = Column(String(KEY_LEN, **INDEX_KEY_COLLATION_ARGS), + ForeignKey('table_metadata.rk', ondelete='cascade'), + primary_key=True) + table_target_rk = Column(String(KEY_LEN, **INDEX_KEY_COLLATION_ARGS), + ForeignKey('table_metadata.rk', ondelete='cascade'), + primary_key=True) + published_tag = Column(String(PUBLISHED_TAG_LEN), nullable=False) + publisher_last_updated_epoch_ms = Column(BigInteger, nullable=False) diff --git a/setup.py b/setup.py index 8bff11e..0508573 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -__version__ = '0.0.7' +__version__ = '0.0.8' requirements = [