From 047644b888b121fa3feb10a5f33bdef60b1072ce Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 24 Dec 2024 10:06:35 +0900 Subject: [PATCH 1/2] feat: update mlflow-related metadata models (#12174) Co-authored-by: John Joyce Co-authored-by: John Joyce --- .../src/main/resources/entity.graphql | 196 +++++++++++++++++- .../dataprocess/DataProcessInstanceOutput.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../ml/metadata/MLModelGroupProperties.pdl | 35 ++++ .../ml/metadata/MLModelProperties.pdl | 28 ++- .../ml/metadata/MLTrainingRunProperties.pdl | 36 ++++ .../src/main/resources/entity-registry.yml | 4 + .../com.linkedin.entity.aspects.snapshot.json | 54 +++-- ...com.linkedin.entity.entities.snapshot.json | 99 +++++++-- .../com.linkedin.entity.runs.snapshot.json | 54 +++-- ...nkedin.operations.operations.snapshot.json | 54 +++-- ...m.linkedin.platform.platform.snapshot.json | 99 +++++++-- 12 files changed, 568 insertions(+), 95 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index e086273068ee53..9abf4e16f12dd7 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -262,8 +262,16 @@ type Query { Fetch all Business Attributes """ listBusinessAttributes(input: ListBusinessAttributesInput!): ListBusinessAttributesResult + + """ + Fetch a Data Process Instance by primary key (urn) + """ + dataProcessInstance(urn: String!): DataProcessInstance + + } + """ An ERModelRelationship is a high-level abstraction that dictates what datasets fields are erModelRelationshiped. 
""" @@ -9832,15 +9840,45 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity privileges: EntityPrivileges } +""" +Properties describing a group of related ML models +""" type MLModelGroupProperties { + """ + Display name of the model group + """ + name: String + """ + Detailed description of the model group's purpose and contents + """ description: String - createdAt: Long + """ + When this model group was created + """ + created: AuditStamp + """ + When this model group was last modified + """ + lastModified: AuditStamp + + """ + Version identifier for this model group + """ version: VersionTag + """ + Custom key-value properties for the model group + """ customProperties: [CustomPropertiesEntry!] + + """ + Deprecated creation timestamp + @deprecated Use the 'created' field instead + """ + createdAt: Long @deprecated(reason: "Use `created` instead") } """ @@ -9990,40 +10028,103 @@ description: String } type MLMetric { + """ + Name of the metric (e.g. accuracy, precision, recall) + """ name: String + """ + Description of what this metric measures + """ description: String + """ + The computed value of the metric + """ value: String + """ + Timestamp when this metric was recorded + """ createdAt: Long } type MLModelProperties { + """ + The display name of the model used in the UI + """ + name: String! + """ + Detailed description of the model's purpose and characteristics + """ description: String - date: Long + """ + When the model was last modified + """ + lastModified: AuditStamp + """ + Version identifier for this model + """ version: String + """ + The type/category of ML model (e.g. 
classification, regression) + """ type: String + """ + Mapping of hyperparameter configurations + """ hyperParameters: HyperParameterMap - hyperParams: [MLHyperParam] + """ + List of hyperparameter settings used to train this model + """ + hyperParams: [MLHyperParam] + """ + Performance metrics from model training + """ trainingMetrics: [MLMetric] + """ + Names of ML features used by this model + """ mlFeatures: [String!] + """ + Tags for categorizing and searching models + """ tags: [String!] + """ + Model groups this model belongs to + """ groups: [MLModelGroup] + """ + Additional custom properties specific to this model + """ customProperties: [CustomPropertiesEntry!] + """ + URL to view this model in external system + """ externalUrl: String + + """ + When this model was created + """ + created: AuditStamp + + """ + Deprecated timestamp for model creation + @deprecated Use 'created' field instead + """ + date: Long @deprecated(reason: "Use `created` instead") } type MLFeatureProperties { @@ -12804,3 +12905,92 @@ type CronSchedule { """ timezone: String! } + + +""" +Properties describing a data process instance's execution metadata +""" +type DataProcessInstanceProperties { + """ + The display name of this process instance + """ + name: String! + + """ + URL to view this process instance in the external system + """ + externalUrl: String + + """ + When this process instance was created + """ + created: AuditStamp + + """ + Additional custom properties specific to this process instance + """ + customProperties: [CustomPropertiesEntry!] +} + +""" +Properties specific to an ML model training run instance +""" +type MLTrainingRunProperties { + """ + Unique identifier for this training run + """ + id: String + + """ + List of URLs to access training run outputs (e.g. 
model artifacts, logs) + """ + outputUrls: [String] + + """ + Hyperparameters used in this training run + """ + hyperParams: [MLHyperParam] + + """ + Performance metrics recorded during this training run + """ + trainingMetrics: [MLMetric] +} + +extend type DataProcessInstance { + + """ + Additional read only properties associated with the Data Process Instance + """ + properties: DataProcessInstanceProperties + + """ + The specific instance of the data platform that this entity belongs to + """ + dataPlatformInstance: DataPlatformInstance + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes + + """ + The parent container in which the entity resides + """ + container: Container + + """ + Standardized platform urn where the data process instance is defined + """ + platform: DataPlatform! + + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + + """ + Additional properties when subtype is Training Run + """ + mlTrainingRunProperties: MLTrainingRunProperties +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl index f33c41e63efed6..fe782dbe01ca9b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl @@ -15,7 +15,7 @@ record DataProcessInstanceOutput { @Relationship = { "/*": { "name": "Produces", - "entityTypes": [ "dataset" ] + "entityTypes": [ "dataset", "mlModel" ] } } @Searchable = { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index c63cb1a97c017d..5c6bfaecf1ef4d 100644 ---
a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -52,4 +52,4 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc } created: AuditStamp -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index b54e430038082d..81c5e7a240f618 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -4,6 +4,7 @@ import com.linkedin.common.Urn import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties +import com.linkedin.common.TimeStamp /** * Properties associated with an ML Model Group @@ -13,6 +14,17 @@ import com.linkedin.common.CustomProperties } record MLModelGroupProperties includes CustomProperties { + /** + * Display name of the MLModelGroup + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + /** * Documentation of the MLModelGroup */ @@ -25,8 +37,31 @@ record MLModelGroupProperties includes CustomProperties { /** * Date when the MLModelGroup was developed */ + @deprecated createdAt: optional Time + /** + * Time and Actor who created the MLModelGroup + */ + created: optional TimeStamp + + /** + * Date when the MLModelGroup was last modified + */ + lastModified: optional TimeStamp + + /** + * List of jobs (if any) used to train the model group. Visible in Lineage. 
+ */ + @Relationship = { + "/*": { + "name": "TrainedBy", + "entityTypes": [ "dataJob" ], + "isLineage": true + } + } + trainingJobs: optional array[Urn] + /** * Version of the MLModelGroup */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 621a3e1747b504..d89d07384bba1d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.TimeStamp /** * Properties associated with a ML Model @@ -15,6 +16,18 @@ import com.linkedin.common.ExternalReference } record MLModelProperties includes CustomProperties, ExternalReference { + /** + * Display name of the MLModel + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + + /** * Documentation of the MLModel */ @@ -27,8 +40,19 @@ record MLModelProperties includes CustomProperties, ExternalReference { /** * Date when the MLModel was developed */ + @deprecated date: optional Time + /** + * Audit stamp containing who created this and when + */ + created: optional TimeStamp + + /** + * Date when the MLModel was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModel */ @@ -93,12 +117,12 @@ record MLModelProperties includes CustomProperties, ExternalReference { deployments: optional array[Urn] /** - * List of jobs (if any) used to train the model + * List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. 
*/ @Relationship = { "/*": { "name": "TrainedBy", - "entityTypes": [ "dataJob" ], + "entityTypes": [ "dataJob", "dataProcessInstance" ], "isLineage": true } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl new file mode 100644 index 00000000000000..f8b8eeafe908b7 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl @@ -0,0 +1,36 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.AuditStamp +import com.linkedin.common.CustomProperties +import com.linkedin.common.ExternalReference +import com.linkedin.common.Urn +import com.linkedin.common.JobFlowUrn +import com.linkedin.common.DataJobUrn +/** + * Properties specific to an ML model training run + */ +@Aspect = { + "name": "mlTrainingRunProperties", +} +record MLTrainingRunProperties includes CustomProperties, ExternalReference { + + /** + * Run Id of the ML Training Run + */ + id: optional string + + /** + * List of URLs for the Outputs of the ML Training Run + */ + outputUrls: optional array[string] + + /** + * Hyperparameters of the ML Training Run + */ + hyperParams: optional array[MLHyperParam] + + /** + * Metrics of the ML Training Run + */ + trainingMetrics: optional array[MLMetric] +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 1c3eb5b574e204..4fe170ced69f33 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -116,6 +116,10 @@ entities: - dataProcessInstanceRunEvent - status - testResults + - dataPlatformInstance + - subTypes + - container + - mlTrainingRunProperties - name: chart category: core keyAspect: chartKey diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json
b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 827789130d8bbb..1c713fd33884b5 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3826,12 +3826,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3840,17 +3851,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether 
it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3866,7 +3888,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3901,7 +3923,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3936,7 +3958,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3944,7 +3966,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3952,7 +3974,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3967,7 +3989,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3975,7 +3997,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3989,11 +4011,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4004,7 +4026,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4020,7 +4042,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index b549cef0af84b2..77d4644f3c121a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3984,12 +3984,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3998,17 +4009,28 @@ }, { "name" : 
"date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4024,7 +4046,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4059,7 +4081,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4094,7 +4116,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4102,7 +4124,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4110,7 +4132,7 @@ "type" : "array", "items" : 
"com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4125,7 +4147,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4133,7 +4155,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4147,11 +4169,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4162,7 +4184,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4178,7 +4200,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4981,12 +5003,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + 
"Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4995,12 +5028,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index c8be9d063eaea9..8b6def75f7a665 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3550,12 +3550,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3564,17 +3575,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date 
when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3590,7 +3612,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3625,7 +3647,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3660,7 +3682,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3668,7 +3690,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3676,7 +3698,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3691,7 +3713,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3699,7 +3721,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" 
: "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3713,11 +3735,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3728,7 +3750,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3744,7 +3766,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 8c7595c5e505d8..e4cc5c42303ee2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3544,12 +3544,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, 
+ "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3558,17 +3569,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3584,7 +3606,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3619,7 +3641,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3654,7 +3676,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel 
used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3662,7 +3684,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3670,7 +3692,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3685,7 +3707,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3693,7 +3715,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3707,11 +3729,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3722,7 +3744,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3738,7 +3760,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 75e5c9a559076b..e375ac698ab516 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3978,12 +3978,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3992,17 +4003,28 @@ }, { 
"name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4018,7 +4040,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4053,7 +4075,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4088,7 +4110,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4096,7 +4118,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4104,7 +4126,7 @@ "type" : "array", "items" : 
"com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4119,7 +4141,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4127,7 +4149,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4141,11 +4163,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4156,7 +4178,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4172,7 +4194,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4975,12 +4997,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + 
"Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4989,12 +5022,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { From 09a9b6eef912d8f855a2cc6fdc03032f5ec7a652 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 23 Dec 2024 22:39:57 -0800 Subject: [PATCH 2/2] feat(ingest/looker): Do not emit usage for non-ingested dashboards and charts (#11647) --- .../ingestion/source/looker/looker_common.py | 9 + .../ingestion/source/looker/looker_source.py | 22 +- .../ingestion/source/looker/looker_usage.py | 40 +- .../looker/looker_mces_usage_history.json | 364 +++++++++++++++++- .../tests/integration/looker/test_looker.py | 87 ++++- 5 files changed, 482 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index a66962f962255f..1183916e9b3fef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1408,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): dashboards_with_activity: LossySet[str] = dataclasses_field( default_factory=LossySet ) + + # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data + dashboards_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + charts_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + stage_latency: List[StageLatency] = dataclasses_field(default_factory=list) _looker_explore_registry: Optional[LookerExploreRegistry] = None total_explores: int = 0 diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 815c5dfb1c0147..8487d5113bc1d3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -68,6 +68,7 @@ ViewField, ViewFieldType, gen_model_key, + get_urn_looker_element_id, ) from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI @@ -165,6 +166,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): # Required, as we do not ingest all folders but only those that have dashboards/looks self.processed_folders: List[str] = [] + # Keep track of ingested chart urns, to omit usage for non-ingested entities + self.chart_urns: Set[str] = set() + @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() @@ -642,6 +646,7 @@ def _make_chart_metadata_events( chart_urn = self._make_chart_urn( element_id=dashboard_element.get_urn_element_id() ) + self.chart_urns.add(chart_urn) chart_snapshot = ChartSnapshot( urn=chart_urn, aspects=[Status(removed=False)], @@ -1380,7 +1385,9 @@ def _get_folder_and_ancestors_workunits( yield from self._emit_folder_as_container(folder) def extract_usage_stat( - self, looker_dashboards: List[looker_usage.LookerDashboardForUsage] + self, + looker_dashboards: List[looker_usage.LookerDashboardForUsage], + ingested_chart_urns: Set[str], ) -> List[MetadataChangeProposalWrapper]: looks: List[looker_usage.LookerChartForUsage] = [] # filter out look from all dashboard @@ -1391,6 +1398,15 @@ def extract_usage_stat( # dedup looks looks = list({str(look.id): look for look in looks}.values()) + filtered_looks = [] + for look in looks: + if not look.id: + continue + chart_urn = 
self._make_chart_urn(get_urn_looker_element_id(look.id)) + if chart_urn in ingested_chart_urns: + filtered_looks.append(look) + else: + self.reporter.charts_skipped_for_usage.add(look.id) # Keep stat generators to generate entity stat aspect later stat_generator_config: looker_usage.StatGeneratorConfig = ( @@ -1414,7 +1430,7 @@ def extract_usage_stat( stat_generator_config, self.reporter, self._make_chart_urn, - looks, + filtered_looks, ) mcps: List[MetadataChangeProposalWrapper] = [] @@ -1669,7 +1685,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.source_config.extract_usage_history: self.reporter.report_stage_start("usage_extraction") usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat( - looker_dashboards_for_usage + looker_dashboards_for_usage, self.chart_urns ) for usage_mcp in usage_mcps: yield usage_mcp.as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index ef7d64e4f42d43..098d7d73a3da84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -42,6 +42,7 @@ TimeWindowSizeClass, _Aspect as AspectAbstract, ) +from datahub.utilities.lossy_collections import LossySet logger = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def __init__( self.config = config self.looker_models = looker_models # Later it will help to find out for what are the looker entities from query result - self.id_vs_model: Dict[str, ModelForUsage] = { + self.id_to_model: Dict[str, ModelForUsage] = { self.get_id(looker_object): looker_object for looker_object in looker_models } self.post_filter = len(self.looker_models) > 100 @@ -225,6 +226,10 @@ def get_id(self, looker_object: ModelForUsage) -> str: def get_id_from_row(self, row: dict) -> str: pass + @abstractmethod + def report_skip_set(self) -> LossySet[str]: + 
pass + def create_mcp( self, model: ModelForUsage, aspect: Aspect ) -> MetadataChangeProposalWrapper: @@ -258,20 +263,11 @@ def _process_entity_timeseries_rows( return entity_stat_aspect - def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]: - aspects: List[Tuple[ModelForUsage, AspectAbstract]] = [] - for looker_object in self.looker_models: - aspects.append( - (looker_object, self.to_entity_absolute_stat_aspect(looker_object)) - ) - - return aspects - def _fill_user_stat_aspect( self, entity_usage_stat: Dict[Tuple[str, str], Aspect], user_wise_rows: List[Dict], - ) -> Iterable[Tuple[ModelForUsage, Aspect]]: + ) -> Iterable[Tuple[str, Aspect]]: logger.debug("Entering fill user stat aspect") # We first resolve all the users using a threadpool to warm up the cache @@ -300,7 +296,7 @@ def _fill_user_stat_aspect( for row in user_wise_rows: # Confirm looker object was given for stat generation - looker_object = self.id_vs_model.get(self.get_id_from_row(row)) + looker_object = self.id_to_model.get(self.get_id_from_row(row)) if looker_object is None: logger.warning( "Looker object with id({}) was not register with stat generator".format( @@ -338,7 +334,7 @@ def _fill_user_stat_aspect( logger.debug("Starting to yield answers for user-wise counts") for (id, _), aspect in entity_usage_stat.items(): - yield self.id_vs_model[id], aspect + yield id, aspect def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: rows = [] @@ -357,7 +353,7 @@ def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: ) if self.post_filter: logger.debug("post filtering") - rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model] + rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model] logger.debug("Filtered down to %d rows", len(rows)) except Exception as e: logger.warning(f"Failed to execute {query_name} query: {e}") @@ -378,7 +374,8 @@ def generate_usage_stat_mcps(self) -> 
Iterable[MetadataChangeProposalWrapper]: return # yield absolute stat for looker entities - for looker_object, aspect in self._process_absolute_aspect(): # type: ignore + for looker_object in self.looker_models: + aspect = self.to_entity_absolute_stat_aspect(looker_object) yield self.create_mcp(looker_object, aspect) # Execute query and process the raw json which contains stat information @@ -399,10 +396,13 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: ) user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query") # yield absolute stat for entity - for looker_object, aspect in self._fill_user_stat_aspect( + for object_id, aspect in self._fill_user_stat_aspect( entity_usage_stat, user_wise_rows ): - yield self.create_mcp(looker_object, aspect) + if object_id in self.id_to_model: + yield self.create_mcp(self.id_to_model[object_id], aspect) + else: + self.report_skip_set().add(object_id) class DashboardStatGenerator(BaseStatGenerator): @@ -425,6 +425,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "DashboardStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.dashboards_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { HistoryViewField.HISTORY_DASHBOARD_ID: ",".join( @@ -541,6 +544,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "ChartStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.charts_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { LookViewField.LOOK_ID: ",".join( diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 594983c8fb0f2a..ed0c5401c9029f 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -1,4 +1,66 @@ [ +{ + "proposedSnapshot": 
{ + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,dashboard_elements.3)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "upstream_fields": "" + }, + "title": "", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "chartUrl": "https://looker.company.com/x/", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Look" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -9,7 +71,9 @@ "customProperties": {}, "title": "foo", "description": "lorem ipsum", - "charts": [], + "charts": [ + "urn:li:chart:(looker,dashboard_elements.3)" + ], "datasets": [], "dashboards": [], "lastModified": { @@ -89,6 +153,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", @@ -215,6 +295,98 @@ "lastRunId": 
"no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "model_name": "look_data" + }, + "name": "look_data", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Model" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -389,6 
+561,180 @@ "lastRunId": "no-run-id-provided" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Explore/look_data" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "project": "lkml_samples", + "model": "look_data", + "looker.explore.label": "My Explore View", + "looker.explore.name": "look_view", + "looker.explore.file": "test_source_file.lkml" + }, + "externalUrl": "https://looker.company.com/explore/look_data/look_view", + "name": "My Explore View", + "description": "lorem ipsum", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", + "type": "VIEW" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "look_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + } + ] 
+ } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Explore" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/explore/look_data/look_view" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + }, + { + "id": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "urn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { @@ -747,22 +1093,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - 
"entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index a39de8384efb23..c96bcc729a95da 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -31,7 +31,10 @@ from datahub.ingestion.api.source import SourceReport from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.source.looker import looker_common, looker_usage -from datahub.ingestion.source.looker.looker_common import LookerExplore +from datahub.ingestion.source.looker.looker_common import ( + LookerDashboardSourceReport, + LookerExplore, +) from datahub.ingestion.source.looker.looker_config import LookerCommonConfig from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, @@ -414,7 +417,9 @@ def setup_mock_dashboard_multiple_charts(mocked_client): ) -def setup_mock_dashboard_with_usage(mocked_client): +def setup_mock_dashboard_with_usage( + mocked_client: mock.MagicMock, skip_look: bool = False +) -> None: mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", @@ -437,7 +442,13 @@ def setup_mock_dashboard_with_usage(mocked_client): ), ), DashboardElement( - id="3", type="", look=LookWithQuery(id="3", view_count=30) + id="3", + type="" if skip_look else "vis", # Looks only ingested if type == `vis` + look=LookWithQuery( + id="3", + view_count=30, + query=Query(model="look_data", view="look_view"), + ), ), ], ) @@ -611,6 +622,12 @@ def 
side_effect_query_inline( HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, + { + HistoryViewField.HISTORY_DASHBOARD_ID: "5", + HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", + HistoryViewField.HISTORY_DASHBOARD_USER: 1, + HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, + }, ] ), looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps( @@ -790,6 +807,70 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): ) +@freeze_time(FROZEN_TIME) +def test_looker_filter_usage_history(pytestconfig, tmp_path, mock_time): + mocked_client = mock.MagicMock() + with mock.patch("looker_sdk.init40") as mock_sdk: + mock_sdk.return_value = mocked_client + setup_mock_dashboard_with_usage(mocked_client, skip_look=True) + mocked_client.run_inline_query.side_effect = side_effect_query_inline + setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + + temp_output_file = f"{tmp_path}/looker_mces.json" + pipeline = Pipeline.create( + { + "run_id": "looker-test", + "source": { + "type": "looker", + "config": { + "base_url": "https://looker.company.com", + "client_id": "foo", + "client_secret": "bar", + "extract_usage_history": True, + "max_threads": 1, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": temp_output_file, + }, + }, + } + ) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + # There should be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries) + dashboard_usage_aspect_count = 0 + # There should be 0 chartUsageStatistics -- filtered by set of ingested charts + chart_usage_aspect_count = 0 + with open(temp_output_file) as f: + temp_output_dict = json.load(f) + for element in temp_output_dict: + if ( + element.get("entityType") == "dashboard" + and element.get("aspectName") == "dashboardUsageStatistics" + ): + dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1 + if ( + element.get("entityType") == "chart" 
+ and element.get("aspectName") == "chartUsageStatistics" + ): + chart_usage_aspect_count = chart_usage_aspect_count + 1 + + assert dashboard_usage_aspect_count == 4 + assert chart_usage_aspect_count == 0 + + source_report = cast(LookerDashboardSourceReport, pipeline.source.get_report()) + # From timeseries query + assert str(source_report.dashboards_skipped_for_usage) == str(["5"]) + # From dashboard element + assert str(source_report.charts_skipped_for_usage) == str(["3"]) + + @freeze_time(FROZEN_TIME) def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): output_file_name: str = "looker_mces.json"