Skip to content

Commit

Permalink
Improvements to Cloud SQL & cluster creation:
Browse files Browse the repository at this point in the history
1) Pass the already-constructed Cloud SQL instance connection name to Ray workers so users don't have to set it in the notebook
2) Set Cloud SQL region in frontend
3) Fix bug with cluster creation where random zones without GPU availability were being selected
4) Reduce SA name length (gets truncated when IM deployment prefix is added for Marketplace)
  • Loading branch information
imreddy13 committed Mar 13, 2024
1 parent 17a8d18 commit 00faf09
Show file tree
Hide file tree
Showing 14 changed files with 73 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"import sqlalchemy\n",
"\n",
"# initialize parameters\n",
"INSTANCE_CONNECTION_NAME = \"{project}:{region}:pgvector-instance\".format(project=os.environ[\"PROJECT_ID\"], region=os.environ[\"DB_REGION\"])\n",
"INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n",
"print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n",
"DB_NAME = \"pgvector-database\"\n",
"\n",
Expand Down Expand Up @@ -264,7 +264,7 @@
"id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df",
"metadata": {},
"outputs": [],
"source": [
"source": [
"job_id = client.submit_job(\n",
" entrypoint=\"python test.py\",\n",
" # Path to the local directory that contains the entrypoint file.\n",
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/frontend/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ data "google_project" "project" {
}

locals {
instance_connection_name = format("%s:%s:%s", var.project_id, var.region, var.cloudsql_instance)
instance_connection_name = format("%s:%s:%s", var.project_id, var.cloudsql_instance_region, var.cloudsql_instance)
}

# IAP Section: Creates the GKE components
Expand Down
6 changes: 6 additions & 0 deletions applications/rag/frontend/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ variable "cloudsql_instance" {
default = "pgvector-instance"
}

# Region of the Cloud SQL instance. Used as the middle segment of the
# "<project>:<region>:<instance>" instance connection name built in main.tf.
variable "cloudsql_instance_region" {
  type        = string
  description = "GCP region of the CloudSQL instance for RAG VectorDB"
  default     = "us-central1"
}

variable "db_secret_name" {
type = string
description = "CloudSQL user credentials"
Expand Down
1 change: 1 addition & 0 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ module "frontend" {
namespace = var.kubernetes_namespace
inference_service_endpoint = module.inference-server.inference_service_endpoint
cloudsql_instance = module.cloudsql.instance
cloudsql_instance_region = var.cloudsql_instance_region
db_secret_name = module.cloudsql.db_secret_name
dataset_embeddings_table_name = var.dataset_embeddings_table_name

Expand Down
6 changes: 3 additions & 3 deletions applications/rag/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ spec:
- name: jupyter_service_account
description: Google Cloud IAM service account for authenticating with GCP services
varType: string
defaultValue: jupyter-system-account
defaultValue: jupyter
- name: k8s_backend_config_name
description: Name of the Backend Config on GCP
varType: string
Expand Down Expand Up @@ -112,11 +112,11 @@ spec:
- name: rag_service_account
description: Google Cloud IAM service account for authenticating with GCP services
varType: string
defaultValue: rag-system-account
defaultValue: rag-sa
- name: ray_service_account
description: Google Cloud IAM service account for authenticating with GCP services
varType: string
defaultValue: ray-system-account
defaultValue: ray-sa
- name: support_email
description: Email for users to contact with questions about their consent
varType: string
Expand Down
6 changes: 3 additions & 3 deletions applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ variable "kubernetes_namespace" {
variable "jupyter_service_account" {
type = string
description = "Google Cloud IAM service account for authenticating with GCP services"
default = "jupyter-system-account"
default = "jupyter"
}

variable "enable_grafana_on_ray_dashboard" {
Expand All @@ -57,7 +57,7 @@ variable "create_ray_service_account" {
variable "ray_service_account" {
type = string
description = "Google Cloud IAM service account for authenticating with GCP services"
default = "ray-system-account"
default = "ray-sa"
}

variable "create_rag_service_account" {
Expand All @@ -69,7 +69,7 @@ variable "create_rag_service_account" {
variable "rag_service_account" {
type = string
description = "Google Cloud IAM service account for authenticating with GCP services"
default = "rag-system-account"
default = "rag-sa"
}

variable "create_gcs_bucket" {
Expand Down
6 changes: 3 additions & 3 deletions applications/rag/workloads.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,16 @@ cloudsql_instance_region = "us-central1"
# Creates a google service account & k8s service account & configures workload identity with appropriate permissions.
# Set to false & update the variable `ray_service_account` to use an existing IAM service account.
create_ray_service_account = true
ray_service_account = "ray-system-account"
ray_service_account = "ray-sa"
enable_grafana_on_ray_dashboard = false
# Creates a google service account & k8s service account & configures workload identity with appropriate permissions.
# Set to false & update the variable `rag_service_account` to use an existing IAM service account.
create_rag_service_account = true
rag_service_account = "rag-system-account"
rag_service_account = "rag-sa"

# Creates a google service account & k8s service account & configures workload identity with appropriate permissions.
# Set to false & update the variable `jupyter_service_account` to use an existing IAM service account.
jupyter_service_account = "jupyter-system-account"
jupyter_service_account = "jupyter"

## Embeddings table name - change this to the TABLE_NAME used in the notebook.
dataset_embeddings_table_name = "googlemaps_reviews_db"
Expand Down
4 changes: 3 additions & 1 deletion infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ locals {
subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name
region = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : ""
regional = local.region != "" ? true : false
zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : []
# zone needs to be set even for regional clusters, otherwise this module picks random zones that don't have GPU availability:
# https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/blob/af354afdf13b336014cefbfe8f848e52c17d4415/main.tf#L46
zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : split(",", local.gpu_l4_t4_location[local.region])
# Update gpu_pools with node_locations according to region and zone GPU availability, if not provided
gpu_pools = [for elm in var.gpu_pools : (local.regional && contains(keys(local.gpu_l4_t4_location), local.region) && elm["node_locations"] == "") ? merge(elm, { "node_locations" : local.gpu_l4_t4_location[local.region] }) : elm]
}
Expand Down
18 changes: 6 additions & 12 deletions modules/kuberay-cluster/kuberay-autopilot-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,8 @@ head:
value: http://grafana:80
- name: RAY_PROMETHEUS_HOST
value: http://frontend:9090
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down Expand Up @@ -184,10 +182,8 @@ additionalWorkerGroups:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down Expand Up @@ -288,10 +284,8 @@ additionalWorkerGroups:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down
12 changes: 4 additions & 8 deletions modules/kuberay-cluster/kuberay-gpu-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,8 @@ head:
value: http://grafana:80
- name: RAY_PROMETHEUS_HOST
value: http://frontend:9090
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down Expand Up @@ -176,10 +174,8 @@ worker:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down
18 changes: 6 additions & 12 deletions modules/kuberay-cluster/kuberay-tpu-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,8 @@ head:
value: http://grafana:80
- name: RAY_PROMETHEUS_HOST
value: http://frontend:9090
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
ports:
- containerPort: 6379
name: gcs
Expand Down Expand Up @@ -166,10 +164,8 @@ worker:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down Expand Up @@ -262,10 +258,8 @@ additionalWorkerGroups:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down
12 changes: 4 additions & 8 deletions modules/kuberay-cluster/kuberay-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,8 @@ head:
value: http://grafana:80
- name: RAY_PROMETHEUS_HOST
value: http://frontend:9090
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down Expand Up @@ -173,10 +171,8 @@ worker:
# containerEnv specifies environment variables for the Ray container,
# Follows standard K8s container env schema.
containerEnv:
- name: PROJECT_ID
value: ${project_id}
- name: DB_REGION
value: ${db_region}
- name: CLOUDSQL_INSTANCE_CONNECTION_NAME
value: ${cloudsql_instance_connection_name}
envFrom: []
# - secretRef:
# name: my-env-secret
Expand Down
53 changes: 25 additions & 28 deletions modules/kuberay-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ resource "google_storage_bucket_iam_member" "gcs-bucket-iam" {

locals {
  # YAML rendering of var.security_context with null-valued entries dropped;
  # chomp() strips the trailing newline that yamlencode() appends.
  security_context = chomp(yamlencode({
    for k, v in var.security_context : k => v if v != null
  }))

  # Cloud SQL instance connection name in "<project>:<region>:<instance>" form,
  # handed to the Helm values templates below.
  cloudsql_instance_connection_name = "${var.project_id}:${var.db_region}:${var.cloudsql_instance_name}"
}

resource "helm_release" "ray-cluster" {
Expand All @@ -31,37 +32,33 @@ resource "helm_release" "ray-cluster" {
version = "1.0.0"
values = [
var.autopilot_cluster ? templatefile("${path.module}/kuberay-autopilot-values.yaml", {
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
project_id = var.project_id
db_region = var.db_region
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
cloudsql_instance_connection_name = local.cloudsql_instance_connection_name
}) : var.enable_tpu ? templatefile("${path.module}/kuberay-tpu-values.yaml", {
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
project_id = var.project_id
db_region = var.db_region
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
cloudsql_instance_connection_name = local.cloudsql_instance_connection_name
}) : var.enable_gpu ? templatefile("${path.module}/kuberay-gpu-values.yaml", {
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
project_id = var.project_id
db_region = var.db_region
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
cloudsql_instance_connection_name = local.cloudsql_instance_connection_name
}) : templatefile("${path.module}/kuberay-values.yaml", {
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
project_id = var.project_id
db_region = var.db_region
gcs_bucket = var.gcs_bucket
k8s_service_account = var.google_service_account
grafana_host = var.grafana_host
security_context = local.security_context
secret_name = var.db_secret_name
cloudsql_instance_connection_name = local.cloudsql_instance_connection_name
})
]
}
Expand Down
6 changes: 6 additions & 0 deletions modules/kuberay-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ variable "db_region" {
default = "us-central1"
}

# Instance-name segment of the Cloud SQL instance connection name
# ("<project>:<region>:<instance>") assembled in this module's locals.
variable "cloudsql_instance_name" {
  type        = string
  description = "Cloud SQL instance name"
  default     = "pgvector-instance"
}

variable "namespace" {
type = string
description = "Kubernetes namespace where resources are deployed"
Expand Down

0 comments on commit 00faf09

Please sign in to comment.