diff --git a/modules/data_warehouse/README.md b/modules/data_warehouse/README.md index c697bd28..ab535532 100644 --- a/modules/data_warehouse/README.md +++ b/modules/data_warehouse/README.md @@ -12,7 +12,7 @@ The resources/services/activations/deletions that this module will create/trigge - Creates a BigQuery Dataset - Creates a BigQuery Table - Creates a Google Cloud Storage bucket -- Loads the Google Cloud Storage bucket with data from https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips +- Loads the Google Cloud Storage bucket with data from [TheLook eCommerce Public Dataset](https://console.cloud.google.com/marketplace/product/bigquery-public-data/thelook-ecommerce) - Provides SQL examples - Creates and inferences with a BigQuery ML model - Creates a Looker Studio report @@ -47,7 +47,7 @@ Functional examples are included in the |------|-------------| | bigquery\_editor\_url | The URL to launch the BigQuery editor with the sample query procedure opened | | ds\_friendly\_name | Dataset name | -| lookerstudio\_report\_url | The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis | +| lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for the e-commerce data analysis | | neos\_tutorial\_url | The URL to launch the in-console tutorial for the EDW solution | | raw\_bucket | Raw bucket name | diff --git a/modules/data_warehouse/assets/data-warehouse-architecture.svg b/modules/data_warehouse/assets/data-warehouse-architecture.svg index 8fb9c7cc..e2c804cb 100644 --- a/modules/data_warehouse/assets/data-warehouse-architecture.svg +++ b/modules/data_warehouse/assets/data-warehouse-architecture.svg @@ -1,6 +1,6 @@ (SVG markup omitted: the architecture diagram (data sources landing in a Cloud Storage data lake, loaded into the BigQuery data warehouse, visualized in Looker Studio, and orchestrated by Workflows) is unchanged except that a trailing newline is added at end of file.) diff --git a/modules/data_warehouse/bigquery.tf b/modules/data_warehouse/bigquery.tf index 33175b87..5c5dc324 100644 --- a/modules/data_warehouse/bigquery.tf +++ b/modules/data_warehouse/bigquery.tf @@ -18,12 +18,14 @@ # # Create the BigQuery dataset resource "google_bigquery_dataset" "ds_edw" { project = module.project-services.project_id - dataset_id = "ds_edw" + dataset_id = "thelook" friendly_name = "My EDW Dataset" description = "My EDW Dataset with tables" location = var.region labels = var.labels delete_contents_on_destroy = var.force_destroy + + depends_on = [time_sleep.wait_after_apis] } # # Create a BigQuery connection @@ -33,6 +35,7 @@ resource "google_bigquery_connection" "ds_connection" { location = var.region friendly_name = "Storage Bucket Connection" cloud_resource {} + depends_on = [time_sleep.wait_after_apis] } # # Grant IAM access to the BigQuery Connection account for Cloud Storage @@ -42,64 +45,146 @@ resource "google_storage_bucket_iam_binding" "bq_connection_iam_object_viewer" { members = [ "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}", ] +} - depends_on = [ - google_bigquery_connection.ds_connection, - ] +# # Create a BigLake table for events +resource "google_bigquery_table" "tbl_edw_events" { + dataset_id = google_bigquery_dataset.ds_edw.dataset_id + table_id = "events" + project = module.project-services.project_id + 
deletion_protection = var.deletion_protection + + schema = file("${path.module}/src/schema/events_schema.json") + + external_data_configuration { + autodetect = true + connection_id = google_bigquery_connection.ds_connection.name + source_format = "PARQUET" + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"] + } + + labels = var.labels } -# # Create a BigQuery external table -resource "google_bigquery_table" "tbl_edw_taxi" { +# # Create a BigLake table for inventory_items +resource "google_bigquery_table" "tbl_edw_inventory_items" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id - table_id = "taxi_trips" + table_id = "inventory_items" project = module.project-services.project_id deletion_protection = var.deletion_protection + schema = file("${path.module}/src/schema/inventory_items_schema.json") + external_data_configuration { autodetect = true - connection_id = "${module.project-services.project_id}.${var.region}.ds_connection" + connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" - source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/new-york-taxi-trips/tlc-yellow-trips-2022/taxi-*.Parquet"] + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/inventory_items.parquet"] + } + + labels = var.labels +} + +# # Create a BigLake table for order_items +resource "google_bigquery_table" "tbl_edw_order_items" { + dataset_id = google_bigquery_dataset.ds_edw.dataset_id + table_id = "order_items" + project = module.project-services.project_id + deletion_protection = var.deletion_protection + schema = file("${path.module}/src/schema/order_items_schema.json") + + external_data_configuration { + autodetect = true + connection_id = google_bigquery_connection.ds_connection.name + source_format = "PARQUET" + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/order_items.parquet"] } - schema = file("${path.module}/src/taxi_trips_schema.json") + labels = var.labels +} + +# # Create a BigLake table for orders +resource "google_bigquery_table" "tbl_edw_orders" { + dataset_id = google_bigquery_dataset.ds_edw.dataset_id + table_id = "orders" + project = module.project-services.project_id + deletion_protection = var.deletion_protection + + schema = file("${path.module}/src/schema/orders_schema.json") + + external_data_configuration { + autodetect = true + connection_id = google_bigquery_connection.ds_connection.name + source_format = "PARQUET" + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/orders.parquet"] + } labels = var.labels +} - depends_on = [ - google_bigquery_connection.ds_connection, - google_storage_bucket.raw_bucket, - ] +# # Create a BigLake table for products +resource "google_bigquery_table" "tbl_edw_products" { + dataset_id = google_bigquery_dataset.ds_edw.dataset_id + table_id = "products" + project = module.project-services.project_id + deletion_protection = var.deletion_protection + + schema = file("${path.module}/src/schema/products_schema.json") + + external_data_configuration { + autodetect = true + connection_id = google_bigquery_connection.ds_connection.name + source_format = "PARQUET" + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/products.parquet"] + } + + labels = var.labels +} + +# # Create a BigLake table for users +resource "google_bigquery_table" "tbl_edw_users" { + dataset_id = google_bigquery_dataset.ds_edw.dataset_id + table_id = 
"users" + project = module.project-services.project_id + deletion_protection = var.deletion_protection + + schema = file("${path.module}/src/schema/users_schema.json") + + external_data_configuration { + autodetect = true + connection_id = google_bigquery_connection.ds_connection.name + source_format = "PARQUET" + source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/users.parquet"] + } + + labels = var.labels } # Load Queries for Stored Procedure Execution -# # Load Lookup Data Tables +# # Load Distribution Center Lookup Data Tables resource "google_bigquery_routine" "sp_provision_lookup_tables" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_provision_lookup_tables" routine_type = "PROCEDURE" language = "SQL" - definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id }) - - depends_on = [ - google_bigquery_dataset.ds_edw, - ] + definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) } - -# # Add Looker Studio Data Report Procedure -resource "google_bigquery_routine" "sproc_sp_demo_datastudio_report" { +# Add Looker Studio Data Report Procedure +resource "google_bigquery_routine" "sproc_sp_demo_lookerstudio_report" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_lookerstudio_report" routine_type = "PROCEDURE" language = "SQL" - definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id }) + definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) depends_on = [ - google_bigquery_table.tbl_edw_taxi, + google_bigquery_table.tbl_edw_inventory_items, + google_bigquery_table.tbl_edw_order_items, + google_bigquery_routine.sp_provision_lookup_tables, ] } @@ -110,24 +195,26 @@ resource "google_bigquery_routine" "sp_sample_queries" { routine_id = "sp_sample_queries" routine_type = "PROCEDURE" language = "SQL" - definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id }) + definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) depends_on = [ - google_bigquery_table.tbl_edw_taxi, + google_bigquery_table.tbl_edw_inventory_items, + google_bigquery_table.tbl_edw_order_items, ] } -# # Add Bigquery ML Model + +# Add Bigquery ML Model resource "google_bigquery_routine" "sp_bigqueryml_model" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_bigqueryml_model" routine_type = "PROCEDURE" language = "SQL" - definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id }) + definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) depends_on = [ - google_bigquery_table.tbl_edw_taxi, + google_bigquery_table.tbl_edw_order_items, ] } @@ -138,10 +225,10 @@ 
resource "google_bigquery_routine" "sp_sample_translation_queries" { routine_id = "sp_sample_translation_queries" routine_type = "PROCEDURE" language = "SQL" - definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id }) + definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) depends_on = [ - google_bigquery_table.tbl_edw_taxi, + google_bigquery_table.tbl_edw_inventory_items, ] } @@ -151,6 +238,8 @@ resource "google_project_service_identity" "bigquery_data_transfer_sa" { provider = google-beta project = module.project-services.project_id service = "bigquerydatatransfer.googleapis.com" + + depends_on = [time_sleep.wait_after_apis] } # # Grant the DTS service account access @@ -162,6 +251,8 @@ resource "google_project_iam_member" "dts_service_account_roles" { project = module.project-services.project_id role = each.key member = "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}" + + depends_on = [time_sleep.wait_after_apis] } # Create specific service account for DTS Run @@ -206,7 +297,7 @@ resource "google_bigquery_data_transfer_config" "dts_config" { data_source_id = "scheduled_query" schedule = "every day 00:00" params = { - query = "CALL `${module.project-services.project_id}.ds_edw.sp_bigqueryml_model`()" + query = "CALL `${module.project-services.project_id}.${google_bigquery_dataset.ds_edw.dataset_id}.sp_bigqueryml_model`()" } service_account_name = google_service_account.dts.email diff --git a/modules/data_warehouse/main.tf b/modules/data_warehouse/main.tf index 6d452f99..c60fb5de 100644 --- a/modules/data_warehouse/main.tf +++ b/modules/data_warehouse/main.tf @@ -1,4 +1,4 @@ -/** +/* * Copyright 2023 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -60,6 +60,11 @@ module "project-services" { ] } +resource "time_sleep" "wait_after_apis" { + create_duration = "120s" + depends_on = [module.project-services] +} + // Create random ID to be used for deployment uniqueness resource "random_id" "id" { byte_length = 4 @@ -77,10 +82,12 @@ resource "google_storage_bucket" "raw_bucket" { public_access_prevention = "enforced" + depends_on = [time_sleep.wait_after_apis] + labels = var.labels } -# # Set up the provisioning bucketstorage bucket +# # Set up the provisioning storage bucket resource "google_storage_bucket" "provisioning_bucket" { name = "ds-edw-provisioner-${random_id.id.hex}" project = module.project-services.project_id @@ -90,6 +97,8 @@ resource "google_storage_bucket" "provisioning_bucket" { public_access_prevention = "enforced" + depends_on = [time_sleep.wait_after_apis] + labels = var.labels } @@ -99,6 +108,8 @@ resource "google_pubsub_topic" "topic" { name = "provisioning-topic" project = module.project-services.project_id + depends_on = [time_sleep.wait_after_apis] + labels = var.labels } @@ -112,6 +123,8 @@ resource "google_pubsub_topic_iam_binding" "binding" { # # Get the GCS service account to trigger the pub/sub notification data "google_storage_project_service_account" "gcs_account" { project = module.project-services.project_id + + depends_on = [time_sleep.wait_after_apis] } # # Create the Storage trigger @@ -120,7 +133,9 @@ resource "google_storage_notification" "notification" { bucket = google_storage_bucket.provisioning_bucket.name payload_format = "JSON_API_V1" topic = 
google_pubsub_topic.topic.id - depends_on = [google_pubsub_topic_iam_binding.binding] + depends_on = [ + google_pubsub_topic_iam_binding.binding, + ] } # # Create the Eventarc trigger @@ -132,7 +147,6 @@ resource "google_eventarc_trigger" "trigger_pubsub_tf" { attribute = "type" value = "google.cloud.pubsub.topic.v1.messagePublished" - } destination { workflow = google_workflows_workflow.workflow.id @@ -148,7 +162,6 @@ resource "google_eventarc_trigger" "trigger_pubsub_tf" { labels = var.labels depends_on = [ - google_workflows_workflow.workflow, google_project_iam_member.eventarc_service_account_invoke_role, ] } @@ -159,6 +172,8 @@ resource "google_service_account" "eventarc_service_account" { project = module.project-services.project_id account_id = "eventarc-sa-${random_id.id.hex}" display_name = "Service Account for Cloud Eventarc" + + depends_on = [time_sleep.wait_after_apis] } # # Grant the Eventarc service account Workflow Invoker Access @@ -166,13 +181,9 @@ resource "google_project_iam_member" "eventarc_service_account_invoke_role" { project = module.project-services.project_id role = "roles/workflows.invoker" member = "serviceAccount:${google_service_account.eventarc_service_account.email}" - - depends_on = [ - google_service_account.eventarc_service_account - ] } -// Sleep for 60 seconds to drop start file +// Sleep for 120 seconds to drop start file resource "time_sleep" "wait_to_startfile" { depends_on = [ google_storage_notification.notification, @@ -180,7 +191,7 @@ resource "time_sleep" "wait_to_startfile" { google_workflows_workflow.workflow ] - create_duration = "60s" + create_duration = "120s" } // Drop start file for workflow to execute diff --git a/modules/data_warehouse/outputs.tf b/modules/data_warehouse/outputs.tf index 207794a9..646e4ada 100644 --- a/modules/data_warehouse/outputs.tf +++ b/modules/data_warehouse/outputs.tf @@ -24,13 +24,14 @@ output "raw_bucket" { description = "Raw bucket name" } +# TODO: Create a new Looker Studio template output "lookerstudio_report_url" { - value = "https://lookerstudio.google.com/reporting/create?c.reportId=402d64d6-2a14-45a1-b159-0dcc88c62cd5&ds.ds0.datasourceName=vw_taxi&ds.ds0.projectId=${var.project_id}&ds.ds0.type=TABLE&ds.ds0.datasetId=ds_edw&ds.ds0.tableId=vw_lookerstudio_report" - description = "The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis" + value = "https://lookerstudio.google.com/reporting/create?c.reportId=8a6517b8-8fcd-47a2-a953-9d4fb9ae4794&ds.ds_profit.datasourceName=lookerstudio_report_profit&ds.ds_profit.projectId=${module.project-services.project_id}&ds.ds_profit.type=TABLE&ds.ds_profit.datasetId=${google_bigquery_dataset.ds_edw.dataset_id}&ds.ds_profit.tableId=lookerstudio_report_profit&ds.ds_dc.datasourceName=lookerstudio_report_distribution_centers&ds.ds_dc.projectId=${module.project-services.project_id}&ds.ds_dc.type=TABLE&ds.ds_dc.datasetId=${google_bigquery_dataset.ds_edw.dataset_id}&ds.ds_dc.tableId=lookerstudio_report_distribution_centers" + description = "The URL to create a new Looker Studio report that displays a sample dashboard for the e-commerce data analysis" } output "bigquery_editor_url" { - value = "https://console.cloud.google.com/bigquery?project=${var.project_id}&ws=!1m5!1m4!6m3!1s${var.project_id}!2sds_edw!3ssp_sample_queries" + value = 
"https://console.cloud.google.com/bigquery?project=${module.project-services.project_id}&ws=!1m5!1m4!6m3!1s${module.project-services.project_id}!2s${google_bigquery_dataset.ds_edw.dataset_id}!3ssp_sample_queries" description = "The URL to launch the BigQuery editor with the sample query procedure opened" } diff --git a/modules/data_warehouse/src/schema/distribution_centers_schema.json b/modules/data_warehouse/src/schema/distribution_centers_schema.json new file mode 100644 index 00000000..c50e6779 --- /dev/null +++ b/modules/data_warehouse/src/schema/distribution_centers_schema.json @@ -0,0 +1,22 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "latitude", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "longitude", + "type": "FLOAT" + } +] diff --git a/modules/data_warehouse/src/schema/events_schema.json b/modules/data_warehouse/src/schema/events_schema.json new file mode 100644 index 00000000..4a5a1f3f --- /dev/null +++ b/modules/data_warehouse/src/schema/events_schema.json @@ -0,0 +1,67 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "user_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "sequence_number", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "session_id", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "created_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "ip_address", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "city", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "state", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "postal_code", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "browser", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "traffic_source", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "uri", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "event_type", + "type": "STRING" + } +] diff --git a/modules/data_warehouse/src/schema/inventory_items_schema.json b/modules/data_warehouse/src/schema/inventory_items_schema.json new file mode 100644 index 00000000..4b064798 --- /dev/null +++ b/modules/data_warehouse/src/schema/inventory_items_schema.json @@ -0,0 +1,62 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "product_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "created_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "sold_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "cost", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "product_category", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "product_name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "product_brand", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "product_retail_price", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "product_department", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "product_sku", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "product_distribution_center_id", + "type": "INTEGER" + } +] diff --git a/modules/data_warehouse/src/schema/order_items_schema.json b/modules/data_warehouse/src/schema/order_items_schema.json new file mode 100644 index 00000000..9b0d6829 --- /dev/null +++ 
b/modules/data_warehouse/src/schema/order_items_schema.json @@ -0,0 +1,57 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "order_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "user_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "product_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "inventory_item_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "status", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "created_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "shipped_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "delivered_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "returned_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "sale_price", + "type": "FLOAT" + } +] diff --git a/modules/data_warehouse/src/schema/orders_schema.json b/modules/data_warehouse/src/schema/orders_schema.json new file mode 100644 index 00000000..bb872ca5 --- /dev/null +++ b/modules/data_warehouse/src/schema/orders_schema.json @@ -0,0 +1,47 @@ +[ + { + "mode": "NULLABLE", + "name": "order_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "user_id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "status", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "gender", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "created_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "returned_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "shipped_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "delivered_at", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "num_of_item", + "type": "INTEGER" + } +] diff --git a/modules/data_warehouse/src/schema/products_schema.json b/modules/data_warehouse/src/schema/products_schema.json new file mode 100644 index 00000000..da918220 --- /dev/null +++ b/modules/data_warehouse/src/schema/products_schema.json @@ -0,0 +1,47 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "cost", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "category", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "brand", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "retail_price", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "department", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "sku", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "distribution_center_id", + "type": "INTEGER" + } +] diff --git a/modules/data_warehouse/src/schema/users_schema.json b/modules/data_warehouse/src/schema/users_schema.json new file mode 100644 index 00000000..ae067f0d --- /dev/null +++ b/modules/data_warehouse/src/schema/users_schema.json @@ -0,0 +1,77 @@ +[ + { + "mode": "NULLABLE", + "name": "id", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "first_name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "last_name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "email", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "age", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "gender", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "state", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "street_address", 
+ "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "postal_code", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "city", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "country", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "latitude", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "longitude", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "traffic_source", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "created_at", + "type": "TIMESTAMP" + } +] diff --git a/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql b/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql index 152ee298..83efd546 100644 --- a/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql +++ b/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql @@ -12,29 +12,60 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -/* Run a query to see the prediction results of the model +/* +Run a query to see the results of the model -- -select * from ML.PREDICT(MODEL ds_edw.model_taxi_estimate, - TABLE ds_edw.taxi_trips) - limit 1000; */ +SELECT + CONCAT('cluster ', CAST(centroid_id as STRING)) as cluster, + avg_spend as average_spend, + count_orders as count_of_orders, + days_since_order +FROM ( + SELECT + centroid_id, + feature, + ROUND(numerical_value, 2) as value + FROM + ML.CENTROIDS(MODEL `${dataset_id}.customer_segment_clustering`) +) +PIVOT ( + SUM(value) + FOR feature IN ('avg_spend', 'count_orders', 'days_since_order') +) +ORDER BY centroid_id +*/ --Model Example CREATE OR REPLACE MODEL - `${project_id}.ds_edw.model_taxi_estimate` -OPTIONS ( MODEL_TYPE='LINEAR_REG', - LS_INIT_LEARN_RATE=0.15, - L1_REG=1, - MAX_ITERATIONS=5 ) AS -SELECT - pickup_datetime, - dropoff_datetime, - IFNULL(passenger_count,0) passenger_count, - IFNULL(trip_distance,0) trip_distance, - IFNULL(rate_code,'') rate_code, - IFNULL(payment_type,'') payment_type, - IFNULL(fare_amount,0) label, - IFNULL(pickup_location_id,'') pickup_location_id -FROM - `${project_id}.ds_edw.taxi_trips` -WHERE - fare_amount > 0; + `${project_id}.${dataset_id}.customer_segment_clustering` + OPTIONS( + MODEL_TYPE = 'KMEANS', -- model name + NUM_CLUSTERS = 5, -- how many clusters to create + KMEANS_INIT_METHOD = 'KMEANS++', + STANDARDIZE_FEATURES = TRUE -- note: normalization taking place to scale the range of independent variables (each feature contributes proportionately to the final distance) + ) + AS ( + SELECT + * EXCEPT (user_id) + FROM ( + SELECT + user_id, + DATE_DIFF(CURRENT_DATE(), CAST(MAX(order_created_date) as DATE), day) as days_since_order, ---RECENCY + COUNT(DISTINCT order_id) as count_orders, --FREQUENCY + AVG(sale_price) as avg_spend --MONETARY + FROM ( + SELECT + user_id, + order_id, + sale_price, + created_at as order_created_date + FROM + `${project_id}.${dataset_id}.order_items` + WHERE + created_at BETWEEN TIMESTAMP('2020-07-31 00:00:00') + AND TIMESTAMP('2023-07-31 00:00:00') + ) + GROUP BY user_id + ) + ) +; diff --git a/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql b/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql index 423ed5a5..88d643ca 100644 --- a/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql +++ b/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql @@ -12,99 +12,213 @@ -- See the License for the specific language governing permissions and -- limitations under the License. 
-CREATE OR REPLACE TABLE `${project_id}.ds_edw.lookerstudio_report` +CREATE OR REPLACE VIEW `${project_id}.${dataset_id}.lookerstudio_report_distribution_centers` OPTIONS( labels=[("data-warehouse","true")] ) AS -WITH TaxiData AS +WITH OrdersData AS ( -SELECT VENDOR_ID as TaxiCompany, - EXTRACT(YEAR FROM Pickup_DateTime) AS Year, - EXTRACT(WEEK FROM Pickup_DateTime) AS WeekNumber, - CONCAT('Week ',FORMAT("%02d", - EXTRACT(WEEK FROM Pickup_DateTime))) AS WeekName, - CONCAT(VENDOR_ID,':',EXTRACT(YEAR FROM Pickup_DateTime),':',FORMAT("%02d",EXTRACT(WEEK FROM Pickup_DateTime))) AS GroupPartition, - COUNT(1) AS NumberOfRides, - AVG(Trip_Distance) AS AvgDistance, - SUM(Fare_Amount) AS Total_Fare_Amount, - SUM(Extra) AS Total_Surcharge, - SUM(MTA_Tax) AS Total_MTA_Tax, - SUM(Tolls_Amount) AS Total_Tolls_Amount, - SUM(imp_Surcharge) AS Total_Improvement_Surcharge, - SUM(Tip_Amount) AS Total_Tip_Amount, - SUM(Total_Amount) AS Total_Total_Amount - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - WHERE Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-02' --'2015-01-01' AND '2021-12-31' -- There is odd data in some of the source files from NYC - GROUP BY 1, 2, 3, 4, 5 + SELECT dc.name AS distribution_center_name, + EXTRACT(YEAR FROM order_items.created_at) AS year, + EXTRACT(WEEK FROM order_items.created_at) AS week_number, + CONCAT('Week ',FORMAT("%02d", + EXTRACT(WEEK FROM order_items.created_at))) AS week_name, + EXTRACT(DATE FROM TIMESTAMP_TRUNC(order_items.created_at, WEEK)) AS week_start_date, + CONCAT(product_distribution_center_id,':',EXTRACT(YEAR FROM order_items.created_at), + ':',FORMAT("%02d",EXTRACT(WEEK FROM order_items.created_at))) AS GroupPartition, + COUNT(order_items.product_id) AS products_ordered_count, + COUNT(DISTINCT order_items.order_id) AS orders_count, + SUM(cost) AS inventory_sold_cost_total, + AVG(cost) AS inventory_sold_cost_avg, + SUM(order_items.sale_price - cost) AS profit_total, + AVG(order_items.sale_price - cost) AS profit_avg, + AVG(TIMESTAMP_DIFF(delivered_at, shipped_at, HOUR)) AS shipping_hours, + AVG(TIMESTAMP_DIFF(shipped_at, order_items.created_at, HOUR)) AS processing_hours, + AVG(TIMESTAMP_DIFF(delivered_at, order_items.created_at, HOUR)) AS order_to_delivery_hours + FROM + `${project_id}.${dataset_id}.order_items` AS order_items + JOIN + `${project_id}.${dataset_id}.inventory_items` AS inventory_items ON order_items.product_id = inventory_items.product_id AND order_items.inventory_item_id = inventory_items.id + JOIN + `${project_id}.${dataset_id}.distribution_centers` AS dc ON inventory_items.product_distribution_center_id = dc.id + WHERE + order_items.created_at IS NOT NULL + AND order_items.created_at <= CURRENT_TIMESTAMP() + GROUP BY 1, 2, 3, 4, 5, 6 ) , LagPercents AS ( -SELECT TaxiCompany, - Year, - WeekNumber, - WeekName, - NumberOfRides, - GroupPartition, - AvgDistance, - Total_Fare_Amount, - Total_Surcharge, - Total_MTA_Tax, - Total_Tolls_Amount, - Total_Improvement_Surcharge, - Total_Tip_Amount, - Total_Total_Amount, - LAG(NumberOfRides) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_NumberOfRides, - LAG(AvgDistance) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_AvgDistance, - LAG(Total_Fare_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Fare_Amount, - LAG(Total_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Surcharge, - LAG(Total_MTA_Tax) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber 
ASC) AS Prior_Week_Total_MTA_Tax, - LAG(Total_Tolls_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tolls_Amount, - LAG(Total_Improvement_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Improvement_Surcharge, - LAG(Total_Tip_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tip_Amount, - LAG(Total_Total_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Total_Amount - FROM TaxiData + SELECT distribution_center_name, + year, + week_number, + week_name, + week_start_date, + GroupPartition, + products_ordered_count, + orders_count, + profit_total, + profit_avg, + inventory_sold_cost_total, + inventory_sold_cost_avg, + shipping_hours, + processing_hours, + order_to_delivery_hours, + LAG(products_ordered_count) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_products_ordered_count, + LAG(orders_count) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_orders_count, + LAG(profit_total) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_profit_total, + LAG(profit_avg) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_profit_avg, + LAG(inventory_sold_cost_total) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_inventory_sold_cost_total, + LAG(inventory_sold_cost_avg) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_inventory_sold_cost_avg, + LAG(shipping_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_shipping_hours, + LAG(processing_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_processing_hours, + LAG(order_to_delivery_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_order_to_delivery_hours + FROM OrdersData ) , PercentChange AS ( -SELECT TaxiCompany, - Year, - WeekNumber, - WeekName, - GroupPartition, - NumberOfRides, - AvgDistance, - Total_Fare_Amount, - Total_Surcharge, - Total_MTA_Tax, - Total_Tolls_Amount, - Total_Improvement_Surcharge, - Total_Tip_Amount, - Total_Total_Amount, - Prior_Week_NumberOfRides, - Prior_Week_AvgDistance, - Prior_Week_Total_Fare_Amount, - Prior_Week_Total_Surcharge, - Prior_Week_Total_MTA_Tax, - Prior_Week_Total_Tolls_Amount, - Prior_Week_Total_Improvement_Surcharge, - Prior_Week_Total_Tip_Amount, - Prior_Week_Total_Total_Amount, - SAFE_DIVIDE(CAST(NumberOfRides - Prior_Week_NumberOfRides AS NUMERIC) , CAST(Prior_Week_NumberOfRides AS NUMERIC)) AS PercentChange_NumberOfRides, - SAFE_DIVIDE(CAST(AvgDistance - Prior_Week_AvgDistance AS NUMERIC) , CAST(Prior_Week_AvgDistance AS NUMERIC)) AS PercentChange_AvgDistance, - SAFE_DIVIDE((Total_Fare_Amount - Prior_Week_Total_Fare_Amount) , Prior_Week_Total_Fare_Amount) AS PercentChange_Total_Fare_Amount, - SAFE_DIVIDE((Total_Surcharge - Prior_Week_Total_Surcharge) , Prior_Week_Total_Surcharge) AS PercentChange_Total_Surcharge, - SAFE_DIVIDE((Total_MTA_Tax - Prior_Week_Total_MTA_Tax) , Prior_Week_Total_MTA_Tax) AS PercentChange_Total_MTA_Tax, - SAFE_DIVIDE((Total_Tolls_Amount - Prior_Week_Total_Tolls_Amount) , Prior_Week_Total_Tolls_Amount) AS PercentChange_Total_Tolls_Amount, - SAFE_DIVIDE((Total_Improvement_Surcharge - 
Prior_Week_Total_Improvement_Surcharge) , Prior_Week_Total_Improvement_Surcharge) AS PercentChange_Total_Improvement_Surcharge, - SAFE_DIVIDE((Total_Tip_Amount - Prior_Week_Total_Tip_Amount) , Prior_Week_Total_Tip_Amount) AS PercentChange_Total_Tip_Amount, - SAFE_DIVIDE((Total_Total_Amount - Prior_Week_Total_Total_Amount) , Prior_Week_Total_Total_Amount) AS PercentChange_Total_Total_Amount + SELECT distribution_center_name, + year, + week_number, + week_name, + week_start_date, + GroupPartition, + products_ordered_count, + orders_count, + profit_total, + profit_avg, + inventory_sold_cost_total, + inventory_sold_cost_avg, + shipping_hours, + processing_hours, + order_to_delivery_hours, + prior_week_products_ordered_count, + prior_week_orders_count, + prior_week_profit_total, + prior_week_profit_avg, + prior_week_inventory_sold_cost_total, + prior_week_inventory_sold_cost_avg, + prior_week_shipping_hours, + prior_week_processing_hours, + prior_week_order_to_delivery_hours, + SAFE_DIVIDE(CAST(products_ordered_count - prior_week_products_ordered_count AS NUMERIC) , CAST(prior_week_products_ordered_count AS NUMERIC)) AS percent_change_products_ordered_count, + SAFE_DIVIDE(CAST(orders_count - prior_week_orders_count AS NUMERIC) , CAST(prior_week_orders_count AS NUMERIC)) AS percent_change_orders_count, + SAFE_DIVIDE((profit_total - prior_week_profit_total) , prior_week_profit_total) AS percent_change_profit_total, + SAFE_DIVIDE((profit_avg - prior_week_profit_avg) , prior_week_profit_avg) AS percent_change_profit_avg, + SAFE_DIVIDE((inventory_sold_cost_total - prior_week_inventory_sold_cost_total) , prior_week_inventory_sold_cost_total) AS percent_change_inventory_sold_cost_total, + SAFE_DIVIDE((inventory_sold_cost_avg - prior_week_inventory_sold_cost_avg) , prior_week_inventory_sold_cost_avg) AS percent_change_inventory_sold_cost_avg, + SAFE_DIVIDE((shipping_hours - prior_week_shipping_hours) , prior_week_shipping_hours) AS percent_change_shipping_hours, + SAFE_DIVIDE((processing_hours - prior_week_processing_hours) , prior_week_processing_hours) AS percent_change_processing_hours, + SAFE_DIVIDE((order_to_delivery_hours - prior_week_order_to_delivery_hours) , prior_week_order_to_delivery_hours) AS percent_change_order_to_delivery_hours FROM LagPercents ) SELECT * - FROM PercentChange +FROM PercentChange ORDER BY GroupPartition; -CREATE OR REPLACE VIEW `${project_id}.ds_edw.vw_lookerstudio_report` as -SELECT * FROM `${project_id}.ds_edw.lookerstudio_report` -WHERE Year in (2022); +CREATE OR REPLACE VIEW `${project_id}.${dataset_id}.lookerstudio_report_profit` +OPTIONS( + labels=[("data-warehouse","true")] +) +AS +with SubsetInventory AS( + SELECT + SUM(ROUND(product_retail_price,2)) AS revenue_total, + SUM(ROUND(cost,2)) AS cost_total, + SUM(ROUND(product_retail_price-cost, 2)) AS profit_total, + CONCAT(product_department, " - ", product_category) AS product_dept_cat, + EXTRACT(DATE from sold_at) AS sold_at_day + FROM + `${project_id}.${dataset_id}.inventory_items` + WHERE + sold_at <= CURRENT_TIMESTAMP() + GROUP BY + product_dept_cat, sold_at_day +), + +Inventory7d AS ( + SELECT + product_dept_cat, + sold_at_day AS day, + revenue_total, + cost_total, + profit_total, + SUM(ROUND(revenue_total,2)) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(sold_at_day) ASC RANGE BETWEEN 6 PRECEDING and CURRENT ROW) AS revenue_last_7d, + SUM(ROUND(cost_total,2)) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(sold_at_day) ASC RANGE BETWEEN 6 PRECEDING and CURRENT ROW) AS cost_last_7d + FROM + 
SubsetInventory +), + +Lags AS ( + SELECT + product_dept_cat, + day, + revenue_total, + cost_total, + profit_total, + revenue_last_7d, + cost_last_7d, + ROUND(SAFE_SUBTRACT(revenue_last_7d, cost_last_7d),2) AS profit_last_7d, + LAG(revenue_last_7d,30) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_month_revenue_last_7d, + LAG(cost_last_7d,30) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_month_cost_last_7d, + LAG(revenue_last_7d,365) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_year_revenue_last_7d, + LAG(cost_last_7d,365) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_year_cost_last_7d, + FROM + Inventory7d +), + +LagPercentages AS ( + SELECT + day, + product_dept_cat, + revenue_total, + cost_total, + profit_total, + revenue_last_7d, + prior_month_revenue_last_7d, + prior_year_revenue_last_7d, + SAFE_DIVIDE((revenue_last_7d - prior_month_revenue_last_7d), prior_month_revenue_last_7d) AS percent_change_revenue_month, + SAFE_DIVIDE((revenue_last_7d - prior_year_revenue_last_7d), prior_year_revenue_last_7d) AS percent_change_revenue_year, + cost_last_7d, + prior_month_cost_last_7d, + prior_year_cost_last_7d, + SAFE_DIVIDE((cost_last_7d - prior_month_cost_last_7d), prior_month_cost_last_7d) AS percent_change_cost_month, + SAFE_DIVIDE((cost_last_7d - prior_year_cost_last_7d), prior_year_cost_last_7d) AS percent_change_cost_year, + profit_last_7d, + ROUND(SAFE_SUBTRACT(prior_month_revenue_last_7d, prior_month_cost_last_7d),2) AS prior_month_profit_last_7d, + ROUND(SAFE_SUBTRACT(prior_year_revenue_last_7d, prior_year_cost_last_7d),2) AS prior_year_profit_last_7d, + FROM + Lags +), + +ProfitPercentages AS ( + SELECT + day, + product_dept_cat, + revenue_total, + revenue_last_7d, + prior_month_revenue_last_7d, + percent_change_revenue_month, + prior_year_revenue_last_7d, + percent_change_revenue_year, + cost_total, + cost_last_7d, + prior_month_cost_last_7d, + percent_change_cost_month, + prior_year_cost_last_7d, + percent_change_cost_year, + profit_total, + profit_last_7d, + prior_month_profit_last_7d, + SAFE_DIVIDE((profit_last_7d - prior_month_profit_last_7d), prior_month_profit_last_7d) AS percent_change_profit_month, + prior_year_profit_last_7d, + SAFE_DIVIDE((profit_last_7d - prior_year_profit_last_7d), prior_year_profit_last_7d) AS percent_change_profit_year + FROM + LagPercentages + ORDER BY + day DESC +) + +SELECT * +FROM ProfitPercentages +ORDER BY day DESC; diff --git a/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql b/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql index 1cacb53b..3a6457f5 100644 --- a/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql +++ b/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql @@ -12,36 +12,36 @@ -- See the License for the specific language governing permissions and -- limitations under the License. 
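/*
A usage sketch, assuming the distribution_centers lookup table created below
and the users BigLake table defined in bigquery.tf: the GEOGRAPHY column works
with BigQuery GIS functions, for example to find each user's nearest
distribution center.

SELECT
  users.id AS user_id,
  ARRAY_AGG(dc.name
    ORDER BY ST_DISTANCE(ST_GEOGPOINT(users.longitude, users.latitude),
                         dc.distribution_center_geom)
    LIMIT 1)[OFFSET(0)] AS nearest_distribution_center
FROM `${project_id}.${dataset_id}.users` AS users
CROSS JOIN `${project_id}.${dataset_id}.distribution_centers` AS dc
GROUP BY users.id, users.longitude, users.latitude;
*/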
-CREATE OR REPLACE TABLE `${project_id}.ds_edw.vendor` - ( - Vendor_Id INTEGER, - Vendor_Description STRING - ) -OPTIONS( - labels=[("data-warehouse","true")] -) -AS -SELECT 1, 'Creative Mobile Technologies, LLC' -UNION ALL -SELECT 2, 'VeriFone Inc.'; +CREATE OR REPLACE TABLE `${project_id}.${dataset_id}.distribution_centers` -CREATE OR REPLACE TABLE `${project_id}.ds_edw.payment_type` ( - Payment_Type_Id INTEGER, - Payment_Type_Description STRING + id INTEGER, + name STRING, + longitude FLOAT64, + latitude FLOAT64, + distribution_center_geom GEOGRAPHY ) -OPTIONS( - labels=[("data-warehouse","true")] -) + OPTIONS( + labels=[("data-warehouse","true")] + ) AS -SELECT 1, 'Credit card' +SELECT 1, 'Memphis TN', -89.9711, 35.1174, ST_GEOGPOINT(-89.9711, 35.1174) +UNION ALL +SELECT 2, 'Chicago IL', -87.6847, 41.8369, ST_GEOGPOINT(-87.6847, 41.8369) +UNION ALL +SELECT 3, 'Houston TX', -95.3698, 29.7604, ST_GEOGPOINT(-95.3698, 29.7604) +UNION ALL +SELECT 4, 'Los Angeles CA', -118.25, 34.05, ST_GEOGPOINT(-118.25, 34.05) +UNION ALL +SELECT 5, 'New Orleans LA', -90.0667, 29.95, ST_GEOGPOINT(-90.0667, 29.95) UNION ALL -SELECT 2, 'Cash' +SELECT 6, 'Port Authority of New York/New Jersey NY/NJ', -73.7834, 40.634, ST_GEOGPOINT(-73.7834, 40.634) UNION ALL -SELECT 3, 'No charge' +SELECT 7, 'Philadelphia PA', -75.1667, 39.95, ST_GEOGPOINT(-75.1667, 39.95) UNION ALL -SELECT 4, 'Dispute' +SELECT 8, 'Mobile AL', -88.0431, 30.6944, ST_GEOGPOINT(-88.0431, 30.6944) UNION ALL -SELECT 5, 'Unknown' +SELECT 9, 'Charleston SC', -79.9333, 32.7833, ST_GEOGPOINT(-79.9333, 32.7833) UNION ALL -SELECT 6, 'Voided trip'; +SELECT 10, 'Savannah GA', -81.1167, 32.0167, ST_GEOGPOINT(-81.1167, 32.0167) +; diff --git a/modules/data_warehouse/src/sql/sp_sample_queries.sql b/modules/data_warehouse/src/sql/sp_sample_queries.sql index c88183ba..fdb4ace3 100644 --- a/modules/data_warehouse/src/sql/sp_sample_queries.sql +++ b/modules/data_warehouse/src/sql/sp_sample_queries.sql @@ -29,118 +29,118 @@ Clean up / Reset script: --Rank, Pivot, Json --- Query: Get trips over $50 for each day of the week. --- Shows: Date Functions, Joins, Group By, Having, Ordinal Group/Having -SELECT FORMAT_DATE("%w", Pickup_DateTime) AS WeekdayNumber, - FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName, - vendor.Vendor_Description, - payment_type.Payment_Type_Description, - SUM(taxi_trips.Total_Amount) AS high_value_trips - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - INNER JOIN `${project_id}.ds_edw.vendor` AS vendor - ON cast(taxi_trips.Vendor_Id as INT64) = vendor.Vendor_Id - AND taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - LEFT JOIN `${project_id}.ds_edw.payment_type` AS payment_type - ON cast(taxi_trips.payment_type as INT64) = payment_type.Payment_Type_Id -GROUP BY 1, 2, 3, 4 -HAVING SUM(taxi_trips.Total_Amount) > 50 -ORDER BY WeekdayNumber, 3, 4; +-- Query: See the order price quartiles for each day of the week. 
+-- Shows: Date Functions, Joins, Group By, Having, Ordinal Group/Having, Quantiles +SELECT + FORMAT_DATE("%w", created_at) AS WeekdayNumber, + FORMAT_DATE("%A", created_at) AS WeekdayName, + APPROX_QUANTILES(order_price, 4) AS quartiles + FROM ( + SELECT + created_at, + SUM(sale_price) AS order_price + FROM + `${project_id}.${dataset_id}.order_items` + GROUP BY + order_id, 1 + HAVING SUM(sale_price) > 10) + GROUP BY + 1, 2 + ORDER BY + WeekdayNumber, 3 +; +-- Query: Items with less than 30 days of inventory remaining +WITH Orders AS ( + SELECT + order_items.product_id AS product_id, + COUNT(order_items.id) AS count_sold_30d + FROM + `${project_id}.${dataset_id}.order_items` AS order_items + WHERE + order_items.created_at > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY) + GROUP BY + product_id +), --- Query: amounts (Cash/Credit) by passenger type -WITH TaxiDataRanking AS -( -SELECT CAST(Pickup_DateTime AS DATE) AS Pickup_Date, - cast(taxi_trips.payment_type as INT64) as Payment_Type_Id, - taxi_trips.Passenger_Count, - taxi_trips.Total_Amount, - RANK() OVER (PARTITION BY CAST(Pickup_DateTime AS DATE), - taxi_trips.payment_type - ORDER BY taxi_trips.Passenger_Count DESC, - taxi_trips.Total_Amount DESC) AS Ranking - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - AND cast(taxi_trips.payment_type as INT64) IN (1,2) -) -SELECT Pickup_Date, - Payment_Type_Description, - Passenger_Count, - Total_Amount - FROM TaxiDataRanking - INNER JOIN `${project_id}.ds_edw.payment_type` AS payment_type - ON TaxiDataRanking.Payment_Type_Id = payment_type.Payment_Type_Id -WHERE Ranking = 1 -ORDER BY Pickup_Date, Payment_Type_Description; - +OnHand AS ( + SELECT + inventory.product_id AS product_id, + inventory.product_name AS product_name, + COUNT(inventory.id) AS count_in_stock + FROM + `${project_id}.${dataset_id}.inventory_items` AS inventory + WHERE + inventory.sold_at IS NULL + GROUP BY + product_id, + product_name + ORDER BY + count_in_stock DESC +), --- Query: data summed by payment type and passenger count, then pivoted based upon payment type -WITH MonthlyData AS -( -SELECT FORMAT_DATE("%B", taxi_trips.Pickup_DateTime) AS MonthName, - FORMAT_DATE("%m", taxi_trips.Pickup_DateTime) AS MonthNumber, - CASE WHEN cast(taxi_trips.payment_type as INT64) = 1 THEN 'Credit' - WHEN cast(taxi_trips.payment_type as INT64) = 2 THEN 'Cash' - WHEN cast(taxi_trips.payment_type as INT64) = 3 THEN 'NoCharge' - WHEN cast(taxi_trips.payment_type as INT64) = 4 THEN 'Dispute' - END AS PaymentDescription, - taxi_trips.Passenger_Count, - taxi_trips.Total_Amount - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - AND Passenger_Count IS NOT NULL - AND cast(payment_type as INT64) IN (1,2,3,4) +End30dInventory AS ( + SELECT + OnHand.*, + Orders.count_sold_30d, + count_in_stock - count_sold_30d AS expected_inventory_30d + FROM + OnHand + INNER JOIN + Orders USING (product_id) ) -SELECT MonthName, - Passenger_Count, - FORMAT("%'d", CAST(Credit AS INTEGER)) AS Credit, - FORMAT("%'d", CAST(Cash AS INTEGER)) AS Cash, - FORMAT("%'d", CAST(NoCharge AS INTEGER)) AS NoCharge, - FORMAT("%'d", CAST(Dispute AS INTEGER)) AS Dispute - FROM MonthlyData - PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute')) -ORDER BY MonthNumber, Passenger_Count; +SELECT + RANK() OVER (ORDER BY expected_inventory_30d ASC) AS rank, + End30dInventory.product_name, + 
End30dInventory.expected_inventory_30d, + End30dInventory.count_in_stock AS current_stock, + End30dInventory.count_sold_30d +FROM + End30dInventory +ORDER BY + rank ASC, current_stock DESC +; --- Query: data pivoted by payment type -WITH MonthlyData AS -( -SELECT FORMAT_DATE("%B", taxi_trips.Pickup_DateTime) AS MonthName, - FORMAT_DATE("%m", taxi_trips.Pickup_DateTime) AS MonthNumber, - CASE WHEN cast(taxi_trips.payment_type as INT64) = 1 THEN 'Credit' - WHEN cast(taxi_trips.payment_type as INT64) = 2 THEN 'Cash' - WHEN cast(taxi_trips.payment_type as INT64) = 3 THEN 'NoCharge' - WHEN cast(taxi_trips.payment_type as INT64) = 4 THEN 'Dispute' - END AS PaymentDescription, - SUM(taxi_trips.Total_Amount) AS Total_Amount - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - AND Passenger_Count IS NOT NULL - AND cast(taxi_trips.payment_type as INT64) IN (1,2,3,4) - GROUP BY 1, 2, 3 +-- Query: data summed by month, then pivoted by department +with MonthlyData AS( + SELECT + sold_at, + FORMAT_DATE("%B", inventory.sold_at) AS month_name, + FORMAT_DATE("%m", inventory.sold_at) AS month_number, + SAFE_SUBTRACT(inventory.product_retail_price, inventory.cost) AS profit, + inventory.product_department AS product_department + FROM + `${project_id}.${dataset_id}.inventory_items` AS inventory + WHERE + sold_at IS NOT NULL ) -SELECT MonthName, - FORMAT("%'d", CAST(Credit AS INTEGER)) AS Credit, - FORMAT("%'d", CAST(Cash AS INTEGER)) AS Cash, - FORMAT("%'d", CAST(NoCharge AS INTEGER)) AS NoCharge, - FORMAT("%'d", CAST(Dispute AS INTEGER)) AS Dispute - FROM MonthlyData - PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute')) -ORDER BY MonthNumber; +SELECT + month_name, + FORMAT("%'d", CAST(Profit_Men AS INTEGER)) AS Profit_Men, + FORMAT("%'d", CAST(Profit_Women AS INTEGER)) AS Profit_Women +FROM + MonthlyData +PIVOT + (SUM(profit) AS Profit FOR product_department IN ("Men", "Women")) +ORDER BY month_number ASC +; --- Query: See what day of the week in each month has the greatest amount (that's the month/day to work) -WITH WeekdayData AS -( -SELECT FORMAT_DATE("%B", Pickup_DateTime) AS MonthName, - FORMAT_DATE("%m", Pickup_DateTime) AS MonthNumber, - FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName, - SUM(taxi_trips.Total_Amount) AS Total_Amount - FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips - WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - AND cast(taxi_trips.payment_type as INT64) IN (1,2,3,4) +-- Query: See what day of the week in each month has the greatest amount of sales(that's the month/day to work) +WITH WeekdayData AS ( + SELECT + FORMAT_DATE("%B", inventory.sold_at) AS month_name, + FORMAT_DATE("%m", inventory.sold_at) AS month_number, + FORMAT_DATE("%A", inventory.sold_at) AS weekday_name, + SUM(inventory.product_retail_price) AS revenue + FROM + `${project_id}.${dataset_id}.inventory_items` AS inventory + WHERE + inventory.sold_at IS NOT NULL GROUP BY 1, 2, 3 ) -SELECT MonthName, +SELECT month_name, FORMAT("%'d", CAST(Sunday AS INTEGER)) AS Sunday, FORMAT("%'d", CAST(Monday AS INTEGER)) AS Monday, FORMAT("%'d", CAST(Tuesday AS INTEGER)) AS Tuesday, @@ -149,5 +149,49 @@ SELECT MonthName, FORMAT("%'d", CAST(Friday AS INTEGER)) AS Friday, FORMAT("%'d", CAST(Saturday AS INTEGER)) AS Saturday, FROM WeekdayData - PIVOT(SUM(Total_Amount) FOR WeekdayName IN ('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')) -ORDER BY MonthNumber; + 
PIVOT(SUM(revenue) FOR weekday_name IN ('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')) +ORDER BY month_number +; + +-- Query: Revenue pivoted by category name for each month. +-- This query dynamically generates the pivot column names based on the distinct values in the product_category column +EXECUTE IMMEDIATE FORMAT(""" + with Subset AS( + SELECT + EXTRACT(MONTH FROM inventory.sold_at) AS month_number, + inventory.product_category, + inventory.product_retail_price + FROM + `${project_id}.${dataset_id}.inventory_items` AS inventory + WHERE + inventory.sold_at IS NOT NULL) + + SELECT + CASE + WHEN month_number = 1 THEN 'January' + WHEN month_number = 2 THEN 'February' + WHEN month_number = 3 THEN 'March' + WHEN month_number = 4 THEN 'April' + WHEN month_number = 5 THEN 'May' + WHEN month_number = 6 THEN 'June' + WHEN month_number = 7 THEN 'July' + WHEN month_number = 8 THEN 'August' + WHEN month_number = 9 THEN 'September' + WHEN month_number = 10 THEN 'October' + WHEN month_number = 11 THEN 'November' + WHEN month_number = 12 THEN 'December' + END AS month_name, + * EXCEPT (month_number) + FROM + Subset + PIVOT (SUM(Subset.product_retail_price) as Revenue FOR product_category IN %s) + ORDER BY month_number; + """, + ( + SELECT + CONCAT("(", STRING_AGG(DISTINCT CONCAT("'", product_category, "'"), ','), ")") + FROM + `${project_id}.${dataset_id}.inventory_items` + ) +) +; diff --git a/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql b/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql index c59ede6a..fa7c410f 100644 --- a/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql +++ b/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql @@ -16,56 +16,27 @@ The queries below are examples of non-BigQuery SQL syntax that can be used with the interactive translator to see before and after changes performed. 
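For example, a PostgreSQL-style cast and timestamp conversion such as

  inventory.id::int
  TO_TIMESTAMP('2022-01-01','YYYY-MM-DD')

would typically come back from the translator as BigQuery SQL along the lines of
(a sketch of representative output, not captured verbatim from the service):

  CAST(inventory.id AS INT64)
  PARSE_TIMESTAMP('%Y-%m-%d', '2022-01-01')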
-The sample queries below use PostgreSQL syntax.*/ +The sample query below uses PostgreSQL syntax.*/ /* Query 1 ------------- -CREATE TABLE taxi_trips (payment_type VARCHAR, Vendor_Id VARCHAR); -SELECT FORMAT_DATE("%w", Pickup_DateTime) AS WeekdayNumber, - FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName, - vendor.Vendor_Description, - payment_type.Payment_Type_Description, - SUM(taxi_trips.Total_Amount) AS high_value_trips - FROM ds_edw.taxi_trips AS taxi_trips - INNER JOIN ds_edw.vendor AS vendor - ON cast(taxi_trips.Vendor_Id as int) = vendor.Vendor_Id - AND taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - LEFT JOIN ds_edw.payment_type AS payment_type - ON taxi_trips.payment_type::int = payment_type.Payment_Type_Id -GROUP BY 1, 2, 3, 4 -HAVING SUM(taxi_trips.Total_Amount) > 50 -ORDER BY WeekdayNumber, 3, 4; +CREATE TABLE ${project_id}.${dataset_id}.inventory_items (id VARCHAR, product_id VARCHAR, created_at TIMESTAMP, sold_at TIMESTAMP, cost NUMERIC, product_category VARCHAR, product_name VARCHAR, product_brand VARCHAR, product_retail_price NUMERIC, product_department VARCHAR, product_sku VARCHAR, product_distribution_center_id VARCHAR); +CREATE TABLE ${project_id}.${dataset_id}.order_items (id INTEGER, order_id INTEGER, user_id INTEGER, product_id INTEGER, inventory_item_id INTEGER, status VARCHAR, created_at TIMESTAMP, shipped_at TIMESTAMP, delivered_at TIMESTAMP, returned_at TIMESTAMP, sale_price NUMERIC); + +SELECT + EXTRACT(dow from order_items.created_at) AS WeekdayNumber, + TO_CHAR(order_items.created_at, 'DAY') AS WeekdayName, + inventory.product_category AS product_category, + COUNT(DISTINCT order_items.order_id) AS num_high_value_orders +FROM ${project_id}.${dataset_id}.inventory_items AS inventory + INNER JOIN ${project_id}.${dataset_id}.order_items AS order_items + ON inventory.id::int = order_items.inventory_item_id + AND cast(inventory.product_id as int) = order_items.product_id + AND order_items.created_at BETWEEN TO_TIMESTAMP('2022-01-01','YYYY-MM-DD') AND TO_TIMESTAMP('2022-12-31','YYYY-MM-DD') +GROUP BY 1, 2, 3 +HAVING AVG(order_items.sale_price) > 85; */ -/* Query 2 -------------- -CREATE TABLE taxi_trips (payment_type VARCHAR, Vendor_Id VARCHAR); - -WITH TaxiDataRanking AS -( -SELECT CAST(Pickup_DateTime AS DATE) AS Pickup_Date, - taxi_trips.payment_type as Payment_Type_Id, - taxi_trips.Passenger_Count, - taxi_trips.Total_Amount, - RANK() OVER (PARTITION BY CAST(Pickup_DateTime AS DATE), - taxi_trips.payment_type - ORDER BY taxi_trips.Passenger_Count DESC, - taxi_trips.Total_Amount DESC) AS Ranking - FROM ds_edw.taxi_trips AS taxi_trips -WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01' - AND taxi_trips.payment_type::int IN (1,2) -) -SELECT Pickup_Date, - Payment_Type_Description, - Passenger_Count, - Total_Amount - FROM TaxiDataRanking - INNER JOIN ds_edw.payment_type AS payment_type - ON TaxiDataRanking.Payment_Type_Id = payment_type.Payment_Type_Id -WHERE Ranking = 1 -ORDER BY Pickup_Date, Payment_Type_Description; -*/ - SELECT 'OPEN THE STORED PROCEDURE FOR MORE DETAILS TO USE THE TRANSLATION SERVICE' as sql_text; diff --git a/modules/data_warehouse/src/taxi_trips_schema.json b/modules/data_warehouse/src/taxi_trips_schema.json deleted file mode 100644 index 5bf80035..00000000 --- a/modules/data_warehouse/src/taxi_trips_schema.json +++ /dev/null @@ -1,116 +0,0 @@ -[ - { - "name": "vendor_id", - "type": "STRING", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "pickup_datetime", - "type": "TIMESTAMP", - "mode": 
"NULLABLE", - "description": "" - }, - { - "name": "dropoff_datetime", - "type": "TIMESTAMP", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "passenger_count", - "type": "INTEGER", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "trip_distance", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "rate_code", - "type": "STRING", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "store_and_fwd_flag", - "type": "STRING", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "payment_type", - "type": "STRING", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "fare_amount", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "extra", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "mta_tax", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "tip_amount", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "tolls_amount", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "imp_surcharge", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "airport_fee", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "total_amount", - "type": "NUMERIC", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "pickup_location_id", - "type": "STRING", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "data_file_year", - "type": "INTEGER", - "mode": "NULLABLE", - "description": "" - }, - { - "name": "data_file_month", - "type": "INTEGER", - "mode": "NULLABLE", - "description": "" - } - ] diff --git a/modules/data_warehouse/templates/workflow.tftpl b/modules/data_warehouse/templates/workflow.tftpl index b39338a6..51ff0ee3 100644 --- a/modules/data_warehouse/templates/workflow.tftpl +++ b/modules/data_warehouse/templates/workflow.tftpl @@ -33,12 +33,13 @@ copy_objects: assign: - source_bucket: "data-analytics-demos" - dest_bucket: ${raw_bucket} + - dataset_id: ${dataset_id} - copied_objects: [] - list_objects: call: googleapis.storage.v1.objects.list args: bucket: $${source_bucket} - prefix: "new-york-taxi-trips/tlc-yellow-trips-2022" + prefix: "thelook-ecommerce" result: list_result - start_counter: assign: @@ -81,8 +82,8 @@ create_tables: - results: {} - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} - map: - 1: $${"CALL `"+project_id+".ds_edw.sp_provision_lookup_tables`();"} - 2: $${"CALL `"+project_id+".ds_edw.sp_lookerstudio_report`();"} + 1: $${"CALL `"+project_id+".dataset_id.sp_provision_lookup_tables`();"} + 2: $${"CALL `"+project_id+".dataset_id.sp_lookerstudio_report`();"} - loopStepTables: for: value: key diff --git a/modules/data_warehouse/workflows.tf b/modules/data_warehouse/workflows.tf index c8835944..81973415 100644 --- a/modules/data_warehouse/workflows.tf +++ b/modules/data_warehouse/workflows.tf @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -# Set up Workflows service account -# # Set up the Workflows service account +# Set up the Workflows service account resource "google_service_account" "workflow_service_account" { project = module.project-services.project_id account_id = "cloud-workflow-sa-${random_id.id.hex}" @@ -32,12 +31,11 @@ resource "google_project_iam_member" "workflow_service_account_roles" { "roles/bigquery.connectionUser", "roles/bigquery.jobUser", "roles/bigquery.dataEditor", - ]) - + ] + ) project = module.project-services.project_id role = each.key member = "serviceAccount:${google_service_account.workflow_service_account.email}" - } # # Create the workflow @@ -49,7 +47,8 @@ resource "google_workflows_workflow" "workflow" { service_account = google_service_account.workflow_service_account.id source_contents = templatefile("${path.module}/templates/workflow.tftpl", { - raw_bucket = google_storage_bucket.raw_bucket.name + raw_bucket = google_storage_bucket.raw_bucket.name, + dataset_id = google_bigquery_dataset.ds_edw.dataset_id }) labels = var.labels
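After the workflow finishes, the deployment can be smoke-tested from the BigQuery editor. A minimal check (a sketch: PROJECT_ID is a placeholder for the deployed project, and thelook is the dataset_id set in bigquery.tf above):

  -- Re-run the sample queries procedure, then confirm the Looker Studio source view is populated.
  CALL `PROJECT_ID.thelook.sp_sample_queries`();

  SELECT day, product_dept_cat, profit_total
  FROM `PROJECT_ID.thelook.lookerstudio_report_profit`
  ORDER BY day DESC
  LIMIT 10;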