feat!: data_warehouse migrating to TheLook Ecommerce dataset (#257)
shanecglass authored Oct 9, 2023
1 parent 096ca4e commit e97adfb
Showing 20 changed files with 976 additions and 450 deletions.
4 changes: 2 additions & 2 deletions modules/data_warehouse/README.md
@@ -12,7 +12,7 @@ The resources/services/activations/deletions that this module will create/trigger are:
- Creates a BigQuery Dataset
- Creates a BigQuery Table
- Creates a Google Cloud Storage bucket
- Loads the Google Cloud Storage bucket with data from https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips
- Loads the Google Cloud Storage bucket with data from [TheLook eCommerce Public Dataset](https://console.cloud.google.com/marketplace/product/bigquery-public-data/thelook-ecommerce)
- Provides SQL examples
- Creates a BigQuery ML model and runs inference with it
- Creates a Looker Studio report
@@ -47,7 +47,7 @@ Functional examples are included in the
|------|-------------|
| bigquery\_editor\_url | The URL to launch the BigQuery editor with the sample query procedure opened |
| ds\_friendly\_name | Dataset name |
| lookerstudio\_report\_url | The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis |
| lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for the e-commerce data analysis |
| neos\_tutorial\_url | The URL to launch the in-console tutorial for the EDW solution |
| raw\_bucket | Raw bucket name |

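For orientation, here is a minimal sketch of how this module and its outputs might be consumed from a root configuration. The source path and the input values below are assumptions for illustration, not part of this commit:

```hcl
module "data_warehouse" {
  source = "github.com/GoogleCloudPlatform/terraform-google-bigquery//modules/data_warehouse" # assumed source

  project_id    = "my-project-id" # assumed value
  region        = "us-central1"   # assumed value
  force_destroy = true
}

# Surface the generated links after `terraform apply`.
output "lookerstudio_report_url" {
  value = module.data_warehouse.lookerstudio_report_url
}

output "bigquery_editor_url" {
  value = module.data_warehouse.bigquery_editor_url
}
```
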
4 changes: 2 additions & 2 deletions modules/data_warehouse/assets/data-warehouse-architecture.svg
157 changes: 124 additions & 33 deletions modules/data_warehouse/bigquery.tf
@@ -18,12 +18,14 @@
# # Create the BigQuery dataset
resource "google_bigquery_dataset" "ds_edw" {
project = module.project-services.project_id
dataset_id = "ds_edw"
dataset_id = "thelook"
friendly_name = "My EDW Dataset"
description = "My EDW Dataset with tables"
location = var.region
labels = var.labels
delete_contents_on_destroy = var.force_destroy

depends_on = [time_sleep.wait_after_apis]
}

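# NOTE (editor's sketch, not part of this commit): the depends_on above
# references time_sleep.wait_after_apis, which is defined outside this file.
# A minimal definition, using the hashicorp/time provider, would look roughly
# like this; the duration and the module reference are assumptions:
resource "time_sleep" "wait_after_apis" {
  create_duration = "30s" # assumed delay for enabled APIs to propagate

  depends_on = [module.project-services] # assumed: the API-enablement module
}
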
# # Create a BigQuery connection
@@ -33,6 +35,7 @@ resource "google_bigquery_connection" "ds_connection" {
location = var.region
friendly_name = "Storage Bucket Connection"
cloud_resource {}
depends_on = [time_sleep.wait_after_apis]
}

# # Grant IAM access to the BigQuery Connection account for Cloud Storage
@@ -42,64 +45,146 @@ resource "google_storage_bucket_iam_binding" "bq_connection_iam_object_viewer" {
members = [
"serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}",
]
}

depends_on = [
google_bigquery_connection.ds_connection,
]
# # Create a BigLake table for events with metadata caching
resource "google_bigquery_table" "tbl_edw_events" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "events"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/events_schema.json")

external_data_configuration {
autodetect = true
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"]
}

labels = var.labels
}
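
# NOTE (editor's sketch, not part of this commit): the comment above mentions
# metadata caching, but the block shown does not configure it. Enabling
# BigLake metadata caching would look roughly like this variant; the table
# name and the staleness window are assumptions:
resource "google_bigquery_table" "tbl_edw_events_cached_example" {
  dataset_id          = google_bigquery_dataset.ds_edw.dataset_id
  table_id            = "events_cached_example" # hypothetical table
  project             = module.project-services.project_id
  deletion_protection = var.deletion_protection

  # Serve cached BigLake metadata for up to one hour before refreshing.
  max_staleness = "0-0 0 1:0:0"

  external_data_configuration {
    autodetect          = true
    connection_id       = google_bigquery_connection.ds_connection.name
    source_format       = "PARQUET"
    source_uris         = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"]
    metadata_cache_mode = "AUTOMATIC" # refresh the metadata cache automatically
  }

  labels = var.labels
}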

# # Create a BigQuery external table
resource "google_bigquery_table" "tbl_edw_taxi" {
# # Create a BigLake table for inventory_items
resource "google_bigquery_table" "tbl_edw_inventory_items" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "taxi_trips"
table_id = "inventory_items"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/inventory_items_schema.json")

external_data_configuration {
autodetect = true
connection_id = "${module.project-services.project_id}.${var.region}.ds_connection"
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/new-york-taxi-trips/tlc-yellow-trips-2022/taxi-*.Parquet"]
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/inventory_items.parquet"]
}

labels = var.labels
}

# # Create a BigLake table with metadata caching for order_items
resource "google_bigquery_table" "tbl_edw_order_items" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "order_items"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/order_items_schema.json")

external_data_configuration {
autodetect = true
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/order_items.parquet"]
}

schema = file("${path.module}/src/taxi_trips_schema.json")
labels = var.labels
}

# # Create a BigLake table for orders
resource "google_bigquery_table" "tbl_edw_orders" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "orders"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/orders_schema.json")

external_data_configuration {
autodetect = true
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/orders.parquet"]
}

labels = var.labels
}

depends_on = [
google_bigquery_connection.ds_connection,
google_storage_bucket.raw_bucket,
]
# # Create a BigLake table for products
resource "google_bigquery_table" "tbl_edw_products" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "products"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/products_schema.json")

external_data_configuration {
autodetect = true
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/products.parquet"]
}

labels = var.labels
}

# # Create a BigLake table for users
resource "google_bigquery_table" "tbl_edw_users" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
table_id = "users"
project = module.project-services.project_id
deletion_protection = var.deletion_protection

schema = file("${path.module}/src/schema/users_schema.json")

external_data_configuration {
autodetect = true
connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/users.parquet"]
}

labels = var.labels
}

# Load Queries for Stored Procedure Execution
# # Load Lookup Data Tables
# # Load Distribution Center Lookup Data Tables
resource "google_bigquery_routine" "sp_provision_lookup_tables" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_provision_lookup_tables"
routine_type = "PROCEDURE"
language = "SQL"
definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id })

depends_on = [
google_bigquery_dataset.ds_edw,
]
definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
}
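
# NOTE (editor's sketch, not part of this commit): dataset_id is now passed to
# templatefile() alongside project_id, so the .sql templates can interpolate
# `${project_id}.${dataset_id}.<table>` instead of hard-coding the dataset.
# A generic illustration of the substitution mechanism, with placeholder values:
locals {
  sp_lookup_rendered_example = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", {
    project_id = "my-project-id" # placeholder
    dataset_id = "thelook"       # placeholder
  })
}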


# # Add Looker Studio Data Report Procedure
resource "google_bigquery_routine" "sproc_sp_demo_datastudio_report" {
# Add Looker Studio Data Report Procedure
resource "google_bigquery_routine" "sproc_sp_demo_lookerstudio_report" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_lookerstudio_report"
routine_type = "PROCEDURE"
language = "SQL"
definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id })
definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })

depends_on = [
google_bigquery_table.tbl_edw_taxi,
google_bigquery_table.tbl_edw_inventory_items,
google_bigquery_table.tbl_edw_order_items,
google_bigquery_routine.sp_provision_lookup_tables,
]
}

@@ -110,24 +195,26 @@ resource "google_bigquery_routine" "sp_sample_queries" {
routine_id = "sp_sample_queries"
routine_type = "PROCEDURE"
language = "SQL"
definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id })
definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })

depends_on = [
google_bigquery_table.tbl_edw_taxi,
google_bigquery_table.tbl_edw_inventory_items,
google_bigquery_table.tbl_edw_order_items,
]
}

# # Add Bigquery ML Model

# Add BigQuery ML Model
resource "google_bigquery_routine" "sp_bigqueryml_model" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_bigqueryml_model"
routine_type = "PROCEDURE"
language = "SQL"
definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id })
definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })

depends_on = [
google_bigquery_table.tbl_edw_taxi,
google_bigquery_table.tbl_edw_order_items,
]
}

@@ -138,10 +225,10 @@ resource "google_bigquery_routine" "sp_sample_translation_queries" {
routine_id = "sp_sample_translation_queries"
routine_type = "PROCEDURE"
language = "SQL"
definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id })
definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })

depends_on = [
google_bigquery_table.tbl_edw_taxi,
google_bigquery_table.tbl_edw_inventory_items,
]
}

@@ -151,6 +238,8 @@ resource "google_project_service_identity" "bigquery_data_transfer_sa" {
provider = google-beta
project = module.project-services.project_id
service = "bigquerydatatransfer.googleapis.com"

depends_on = [time_sleep.wait_after_apis]
}
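
# NOTE (editor's sketch, not part of this commit):
# google_project_service_identity comes from the google-beta provider, so the
# module presumably pins it in required_providers, roughly as follows; the
# version constraint is an assumption:
terraform {
  required_providers {
    google-beta = {
      source  = "hashicorp/google-beta"
      version = ">= 4.50" # assumed constraint
    }
  }
}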

# # Grant the DTS service account access
@@ -162,6 +251,8 @@ resource "google_project_iam_member" "dts_service_account_roles" {
project = module.project-services.project_id
role = each.key
member = "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}"

depends_on = [time_sleep.wait_after_apis]
}
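
# NOTE (editor's sketch, not part of this commit): the for_each set for this
# resource is collapsed out of the hunk above. Given the each.key usage, it is
# presumably a set of IAM roles, along these lines (the exact roles are an
# assumption):
#
#   for_each = toset([
#     "roles/iam.serviceAccountTokenCreator",
#     "roles/bigquerydatatransfer.serviceAgent",
#   ])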

# Create specific service account for DTS Run
@@ -206,7 +297,7 @@ resource "google_bigquery_data_transfer_config" "dts_config" {
data_source_id = "scheduled_query"
schedule = "every day 00:00"
params = {
query = "CALL `${module.project-services.project_id}.ds_edw.sp_bigqueryml_model`()"
query = "CALL `${module.project-services.project_id}.${google_bigquery_dataset.ds_edw.dataset_id}.sp_bigqueryml_model`()"
}
service_account_name = google_service_account.dts.email

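# NOTE (editor's sketch, not part of this commit): service_account_name above
# references google_service_account.dts, created in a section collapsed out of
# this diff. A minimal definition would look roughly like this; account_id and
# display_name are assumptions:
resource "google_service_account" "dts" {
  project      = module.project-services.project_id
  account_id   = "bigquery-dts"                   # assumed ID
  display_name = "BigQuery DTS scheduled queries" # assumed name
}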
