From 00faf09b6633450a6ddc1936e680e3b4321d2899 Mon Sep 17 00:00:00 2001 From: Anu Reddy Date: Mon, 11 Mar 2024 11:43:37 -0700 Subject: [PATCH] Improvements to Cloud SQL & cluster creation: 1) Pass already constructed cloud SQL instance connection name to Ray workers so users don't have to set it in the notebook 2) Set Cloud SQL region in frontend 3) Fix bug with cluster creation where random zones without GPU availability were being selected 4) Reduce SA name length (gets truncated when IM deployment prefix is added for Marketplace) --- .../rag-kaggle-ray-sql-latest.ipynb | 4 +- applications/rag/frontend/main.tf | 2 +- applications/rag/frontend/variables.tf | 6 +++ applications/rag/main.tf | 1 + applications/rag/metadata.yaml | 6 +-- applications/rag/variables.tf | 6 +-- applications/rag/workloads.tfvars | 6 +-- infrastructure/main.tf | 4 +- .../kuberay-autopilot-values.yaml | 18 +++---- .../kuberay-cluster/kuberay-gpu-values.yaml | 12 ++--- .../kuberay-cluster/kuberay-tpu-values.yaml | 18 +++---- modules/kuberay-cluster/kuberay-values.yaml | 12 ++--- modules/kuberay-cluster/main.tf | 53 +++++++++---------- modules/kuberay-cluster/variables.tf | 6 +++ 14 files changed, 73 insertions(+), 81 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb index f4e7ce9fd..e94e48958 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb @@ -76,7 +76,7 @@ "import sqlalchemy\n", "\n", "# initialize parameters\n", - "INSTANCE_CONNECTION_NAME = \"{project}:{region}:pgvector-instance\".format(project=os.environ[\"PROJECT_ID\"], region=os.environ[\"DB_REGION\"])\n", + "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", "DB_NAME = \"pgvector-database\"\n", "\n", @@ -264,7 +264,7 
@@ "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df", "metadata": {}, "outputs": [], - "source": [ + "source": [ "job_id = client.submit_job(\n", " entrypoint=\"python test.py\",\n", " # Path to the local directory that contains the entrypoint file.\n", diff --git a/applications/rag/frontend/main.tf b/applications/rag/frontend/main.tf index 5ee5dc2ad..eaf19f09b 100644 --- a/applications/rag/frontend/main.tf +++ b/applications/rag/frontend/main.tf @@ -16,7 +16,7 @@ data "google_project" "project" { } locals { - instance_connection_name = format("%s:%s:%s", var.project_id, var.region, var.cloudsql_instance) + instance_connection_name = format("%s:%s:%s", var.project_id, var.cloudsql_instance_region, var.cloudsql_instance) } # IAP Section: Creates the GKE components diff --git a/applications/rag/frontend/variables.tf b/applications/rag/frontend/variables.tf index 0fa73dc47..3146f1ace 100644 --- a/applications/rag/frontend/variables.tf +++ b/applications/rag/frontend/variables.tf @@ -35,6 +35,12 @@ variable "cloudsql_instance" { default = "pgvector-instance" } +variable "cloudsql_instance_region" { + type = string + description = "Region of the CloudSQL instance for RAG VectorDB" + default = "us-central1" +} + variable "db_secret_name" { type = string description = "CloudSQL user credentials" diff --git a/applications/rag/main.tf b/applications/rag/main.tf index d428f55fb..f36afc271 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -225,6 +225,7 @@ module "frontend" { namespace = var.kubernetes_namespace inference_service_endpoint = module.inference-server.inference_service_endpoint cloudsql_instance = module.cloudsql.instance + cloudsql_instance_region = var.cloudsql_instance_region db_secret_name = module.cloudsql.db_secret_name dataset_embeddings_table_name = var.dataset_embeddings_table_name diff --git a/applications/rag/metadata.yaml b/applications/rag/metadata.yaml index 44fc12c7e..585f1dd4f 100644 --- a/applications/rag/metadata.yaml +++ 
b/applications/rag/metadata.yaml @@ -83,7 +83,7 @@ spec: - name: jupyter_service_account description: Google Cloud IAM service account for authenticating with GCP services varType: string - defaultValue: jupyter-system-account + defaultValue: jupyter - name: k8s_backend_config_name description: Name of the Backend Config on GCP varType: string @@ -112,11 +112,11 @@ spec: - name: rag_service_account description: Google Cloud IAM service account for authenticating with GCP services varType: string - defaultValue: rag-system-account + defaultValue: rag-sa - name: ray_service_account description: Google Cloud IAM service account for authenticating with GCP services varType: string - defaultValue: ray-system-account + defaultValue: ray-sa - name: support_email description: Email for users to contact with questions about their consent varType: string diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf index d7d02713a..2f7a8f068 100644 --- a/applications/rag/variables.tf +++ b/applications/rag/variables.tf @@ -40,7 +40,7 @@ variable "kubernetes_namespace" { variable "jupyter_service_account" { type = string description = "Google Cloud IAM service account for authenticating with GCP services" - default = "jupyter-system-account" + default = "jupyter" } variable "enable_grafana_on_ray_dashboard" { @@ -57,7 +57,7 @@ variable "create_ray_service_account" { variable "ray_service_account" { type = string description = "Google Cloud IAM service account for authenticating with GCP services" - default = "ray-system-account" + default = "ray-sa" } variable "create_rag_service_account" { @@ -69,7 +69,7 @@ variable "create_rag_service_account" { variable "rag_service_account" { type = string description = "Google Cloud IAM service account for authenticating with GCP services" - default = "rag-system-account" + default = "rag-sa" } variable "create_gcs_bucket" { diff --git a/applications/rag/workloads.tfvars b/applications/rag/workloads.tfvars index 
1b57f5ddd..6b2c9b44a 100644 --- a/applications/rag/workloads.tfvars +++ b/applications/rag/workloads.tfvars @@ -30,16 +30,16 @@ cloudsql_instance_region = "us-central1" # Creates a google service account & k8s service account & configures workload identity with appropriate permissions. # Set to false & update the variable `ray_service_account` to use an existing IAM service account. create_ray_service_account = true -ray_service_account = "ray-system-account" +ray_service_account = "ray-sa" enable_grafana_on_ray_dashboard = false # Creates a google service account & k8s service account & configures workload identity with appropriate permissions. # Set to false & update the variable `rag_service_account` to use an existing IAM service account. create_rag_service_account = true -rag_service_account = "rag-system-account" +rag_service_account = "rag-sa" # Creates a google service account & k8s service account & configures workload identity with appropriate permissions. # Set to false & update the variable `jupyter_service_account` to use an existing IAM service account. -jupyter_service_account = "jupyter-system-account" +jupyter_service_account = "jupyter" ## Embeddings table name - change this to the TABLE_NAME used in the notebook. dataset_embeddings_table_name = "googlemaps_reviews_db" diff --git a/infrastructure/main.tf b/infrastructure/main.tf index 243281a00..1a1f3fdac 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -62,7 +62,9 @@ locals { subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name region = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : "" regional = local.region != "" ? true : false - zone = length(split("-", var.cluster_location)) > 2 ? 
split(",", var.cluster_location) : [] + # zone needs to be set even for regional clusters, otherwise this module picks random zones that don't have GPU availability: + # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/blob/af354afdf13b336014cefbfe8f848e52c17d4415/main.tf#L46 + zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : (contains(keys(local.gpu_l4_t4_location), local.region) ? split(",", local.gpu_l4_t4_location[local.region]) : []) # Update gpu_pools with node_locations according to region and zone gpu availibility, if not provided gpu_pools = [for elm in var.gpu_pools : (local.regional && contains(keys(local.gpu_l4_t4_location), local.region) && elm["node_locations"] == "") ? merge(elm, { "node_locations" : local.gpu_l4_t4_location[local.region] }) : elm] } diff --git a/modules/kuberay-cluster/kuberay-autopilot-values.yaml b/modules/kuberay-cluster/kuberay-autopilot-values.yaml index f1a9956d5..e6cd19616 100644 --- a/modules/kuberay-cluster/kuberay-autopilot-values.yaml +++ b/modules/kuberay-cluster/kuberay-autopilot-values.yaml @@ -73,10 +73,8 @@ head: value: http://grafana:80 - name: RAY_PROMETHEUS_HOST value: http://frontend:9090 - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret @@ -184,10 +182,8 @@ additionalWorkerGroups: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret @@ -288,10 +284,8 @@ additionalWorkerGroups: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. 
containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret diff --git a/modules/kuberay-cluster/kuberay-gpu-values.yaml b/modules/kuberay-cluster/kuberay-gpu-values.yaml index fccd2a1ce..fc3695a70 100644 --- a/modules/kuberay-cluster/kuberay-gpu-values.yaml +++ b/modules/kuberay-cluster/kuberay-gpu-values.yaml @@ -68,10 +68,8 @@ head: value: http://grafana:80 - name: RAY_PROMETHEUS_HOST value: http://frontend:9090 - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret @@ -176,10 +174,8 @@ worker: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret diff --git a/modules/kuberay-cluster/kuberay-tpu-values.yaml b/modules/kuberay-cluster/kuberay-tpu-values.yaml index 637f6104b..5ae9d737d 100644 --- a/modules/kuberay-cluster/kuberay-tpu-values.yaml +++ b/modules/kuberay-cluster/kuberay-tpu-values.yaml @@ -67,10 +67,8 @@ head: value: http://grafana:80 - name: RAY_PROMETHEUS_HOST value: http://frontend:9090 - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} ports: - containerPort: 6379 name: gcs @@ -166,10 +164,8 @@ worker: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. 
containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret @@ -262,10 +258,8 @@ additionalWorkerGroups: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret diff --git a/modules/kuberay-cluster/kuberay-values.yaml b/modules/kuberay-cluster/kuberay-values.yaml index de9afb655..e30df3e25 100644 --- a/modules/kuberay-cluster/kuberay-values.yaml +++ b/modules/kuberay-cluster/kuberay-values.yaml @@ -68,10 +68,8 @@ head: value: http://grafana:80 - name: RAY_PROMETHEUS_HOST value: http://frontend:9090 - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret @@ -173,10 +171,8 @@ worker: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. 
containerEnv: - - name: PROJECT_ID - value: ${project_id} - - name: DB_REGION - value: ${db_region} + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + value: ${cloudsql_instance_connection_name} envFrom: [] # - secretRef: # name: my-env-secret diff --git a/modules/kuberay-cluster/main.tf b/modules/kuberay-cluster/main.tf index 7f249683a..95bc7c20a 100644 --- a/modules/kuberay-cluster/main.tf +++ b/modules/kuberay-cluster/main.tf @@ -20,6 +20,7 @@ resource "google_storage_bucket_iam_member" "gcs-bucket-iam" { locals { security_context = chomp(yamlencode({ for k, v in var.security_context : k => v if v != null })) + cloudsql_instance_connection_name = format("%s:%s:%s", var.project_id, var.db_region, var.cloudsql_instance_name) } resource "helm_release" "ray-cluster" { @@ -31,37 +32,33 @@ resource "helm_release" "ray-cluster" { version = "1.0.0" values = [ var.autopilot_cluster ? templatefile("${path.module}/kuberay-autopilot-values.yaml", { - gcs_bucket = var.gcs_bucket - k8s_service_account = var.google_service_account - grafana_host = var.grafana_host - security_context = local.security_context - secret_name = var.db_secret_name - project_id = var.project_id - db_region = var.db_region + gcs_bucket = var.gcs_bucket + k8s_service_account = var.google_service_account + grafana_host = var.grafana_host + security_context = local.security_context + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name }) : var.enable_tpu ? 
templatefile("${path.module}/kuberay-tpu-values.yaml", { - gcs_bucket = var.gcs_bucket - k8s_service_account = var.google_service_account - grafana_host = var.grafana_host - security_context = local.security_context - secret_name = var.db_secret_name - project_id = var.project_id - db_region = var.db_region + gcs_bucket = var.gcs_bucket + k8s_service_account = var.google_service_account + grafana_host = var.grafana_host + security_context = local.security_context + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name }) : var.enable_gpu ? templatefile("${path.module}/kuberay-gpu-values.yaml", { - gcs_bucket = var.gcs_bucket - k8s_service_account = var.google_service_account - grafana_host = var.grafana_host - security_context = local.security_context - secret_name = var.db_secret_name - project_id = var.project_id - db_region = var.db_region + gcs_bucket = var.gcs_bucket + k8s_service_account = var.google_service_account + grafana_host = var.grafana_host + security_context = local.security_context + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name }) : templatefile("${path.module}/kuberay-values.yaml", { - gcs_bucket = var.gcs_bucket - k8s_service_account = var.google_service_account - grafana_host = var.grafana_host - security_context = local.security_context - secret_name = var.db_secret_name - project_id = var.project_id - db_region = var.db_region + gcs_bucket = var.gcs_bucket + k8s_service_account = var.google_service_account + grafana_host = var.grafana_host + security_context = local.security_context + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name }) ] } diff --git a/modules/kuberay-cluster/variables.tf b/modules/kuberay-cluster/variables.tf index 1d2a6f699..2ccf85950 100644 --- a/modules/kuberay-cluster/variables.tf +++ b/modules/kuberay-cluster/variables.tf @@ -29,6 +29,12 
@@ variable "db_region" { default = "us-central1" } +variable "cloudsql_instance_name" { + type = string + description = "Cloud SQL instance name" + default = "pgvector-instance" +} + variable "namespace" { type = string description = "Kubernetes namespace where resources are deployed"