# Reviewer summary (preamble text; not part of any hunk below).
# Purpose: move the "kuberay-workload-identity" module out of modules/kuberay-operator and declare it
# directly in applications/rag and applications/ray.
# - modules/kuberay-operator/kuberay.tf: removes the nested "kuberay-workload-identity" module and the
#   kubernetes_secret_v1 "service_account_token" resource (secret name "kuberay-sa-token").
# - modules/kuberay-operator/variables.tf: removes the "google_service_account" and
#   "create_service_account" variables; both applications stop passing them to "kuberay-operator".
# - applications/rag, applications/ray: the new top-level "kuberay-workload-identity" module depends on
#   module.namespace, and is added to depends_on of "kuberay-monitoring" and "kuberay-cluster".
# - applications/rag: the "kuberay-monitoring" block is moved above "kuberay-cluster", which now lists
#   db_secret_name / grafana_host under an "Implicit dependency" comment.
# The '-' line in modules/kuberay-operator/kuberay.tf keeps the spelling "inconsitent" on purpose:
# removed lines have to match the text of the file being patched.
# NOTE(review): the removed "kuberay-sa-token" secret is not re-created anywhere in this patch - confirm
# that nothing still reads it.
# NOTE(review): the new TODO(genlu) comments use "//" while the surrounding comments in these files use
# "#" - consider normalizing in a follow-up.
diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 1cf146df4..2b30c4fef 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -153,15 +153,13 @@ module "namespace" { } module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - name = "kuberay-operator" - project_id = var.project_id - create_namespace = true - namespace = local.kubernetes_namespace - google_service_account = local.ray_service_account - create_service_account = var.create_ray_service_account - autopilot_cluster = local.enable_autopilot + source = "../../modules/kuberay-operator" + providers = { helm = helm.rag, kubernetes = kubernetes.rag } + name = "kuberay-operator" + project_id = var.project_id + create_namespace = true + namespace = local.kubernetes_namespace + autopilot_cluster = local.enable_autopilot } module "gcs" { @@ -225,6 +223,32 @@ module "kuberay-logging" { depends_on = [module.namespace] } +module "kuberay-workload-identity" { + providers = { kubernetes = kubernetes.rag } + source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" + version = "30.0.0" # Pinning to a previous version as current version (30.1.0) showed inconsistent behaviour with workload identity service accounts + use_existing_gcp_sa = !var.create_ray_service_account + name = local.ray_service_account + namespace = local.kubernetes_namespace + project_id = var.project_id + roles = ["roles/cloudsql.client", "roles/monitoring.viewer"] + automount_service_account_token = true + depends_on = [module.namespace] +} + +module "kuberay-monitoring" { + source = "../../modules/kuberay-monitoring" + providers = { helm = helm.rag, kubernetes = kubernetes.rag } + project_id = var.project_id + autopilot_cluster = local.enable_autopilot + namespace = local.kubernetes_namespace + create_namespace = true + enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard + 
k8s_service_account = local.ray_service_account + //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. + depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity] +} + module "kuberay-cluster" { source = "../../modules/kuberay-cluster" providers = { helm = helm.rag, kubernetes = kubernetes.rag } @@ -233,16 +257,17 @@ module "kuberay-cluster" { enable_gpu = true gcs_bucket = var.gcs_bucket autopilot_cluster = local.enable_autopilot - db_secret_name = module.cloudsql.db_secret_name cloudsql_instance_name = local.cloudsql_instance db_region = local.cloudsql_instance_region google_service_account = local.ray_service_account - grafana_host = module.kuberay-monitoring.grafana_uri disable_network_policy = var.disable_ray_cluster_network_policy - depends_on = [module.kuberay-operator] use_custom_image = true additional_labels = var.additional_labels + # Implicit dependency + db_secret_name = module.cloudsql.db_secret_name + grafana_host = module.kuberay-monitoring.grafana_uri + # IAP Auth parameters add_auth = var.ray_dashboard_add_auth create_brand = var.create_brand @@ -256,19 +281,8 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? 
split(",", var.ray_dashboard_members_allowlist) : [] -} - -module "kuberay-monitoring" { - source = "../../modules/kuberay-monitoring" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - project_id = var.project_id - autopilot_cluster = local.enable_autopilot - namespace = local.kubernetes_namespace - create_namespace = true - enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard - k8s_service_account = local.ray_service_account - # TODO(umeshkumhar): remove kuberay-operator depends, figure out service account dependency - depends_on = [module.namespace, module.kuberay-operator] + //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. + depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] } module "inference-server" { diff --git a/applications/ray/main.tf b/applications/ray/main.tf index 207807532..8f4a5ecef 100644 --- a/applications/ray/main.tf +++ b/applications/ray/main.tf @@ -134,16 +134,27 @@ module "namespace" { namespace = local.kubernetes_namespace } +module "kuberay-workload-identity" { + providers = { kubernetes = kubernetes.ray } + source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" + version = "30.0.0" # Pinning to a previous version as current version (30.1.0) showed inconsistent behaviour with workload identity service accounts + use_existing_gcp_sa = !var.create_service_account + name = local.workload_identity_service_account + namespace = local.kubernetes_namespace + project_id = var.project_id + roles = ["roles/cloudsql.client", "roles/monitoring.viewer"] + automount_service_account_token = true + depends_on = [module.namespace] +} + module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.ray, kubernetes = kubernetes.ray } - name = "kuberay-operator" - create_namespace = true - namespace = local.kubernetes_namespace - project_id = var.project_id - autopilot_cluster = 
local.enable_autopilot - google_service_account = local.workload_identity_service_account - create_service_account = var.create_service_account + source = "../../modules/kuberay-operator" + providers = { helm = helm.ray, kubernetes = kubernetes.ray } + name = "kuberay-operator" + create_namespace = true + namespace = local.kubernetes_namespace + project_id = var.project_id + autopilot_cluster = local.enable_autopilot } module "kuberay-logging" { @@ -164,7 +175,8 @@ module "kuberay-monitoring" { create_namespace = true enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard k8s_service_account = local.workload_identity_service_account - depends_on = [module.kuberay-operator] + //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. + depends_on = [module.kuberay-workload-identity, module.kuberay-operator] } module "gcs" { @@ -204,7 +216,8 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : [] - depends_on = [module.gcs, module.kuberay-operator] + //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. 
+ depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] } diff --git a/modules/kuberay-operator/kuberay.tf b/modules/kuberay-operator/kuberay.tf index 2315a3149..e754bdf89 100644 --- a/modules/kuberay-operator/kuberay.tf +++ b/modules/kuberay-operator/kuberay.tf @@ -23,33 +23,6 @@ resource "helm_release" "kuberay-operator" { create_namespace = var.create_namespace } -module "kuberay-workload-identity" { - source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" - version = "30.0.0" # Pinning to a previous version as current version (30.1.0) showed inconsitent behaviour with workload identity service accounts - use_existing_gcp_sa = !var.create_service_account - name = var.google_service_account - namespace = var.namespace - project_id = var.project_id - roles = ["roles/cloudsql.client", "roles/monitoring.viewer"] - - automount_service_account_token = true - - depends_on = [helm_release.kuberay-operator] -} - -resource "kubernetes_secret_v1" "service_account_token" { - metadata { - name = "kuberay-sa-token" - namespace = var.namespace - annotations = { - "kubernetes.io/service-account.name" = var.google_service_account - } - } - type = "kubernetes.io/service-account-token" - - depends_on = [module.kuberay-workload-identity] -} - # Grant access to batchv1/Jobs to kuberay-operator since the kuberay-operator role is missing some permissions. # See https://github.com/ray-project/kuberay/issues/1706 for more details. 
# TODO: remove this role binding once the kuberay-operator helm chart is upgraded to v1.1 diff --git a/modules/kuberay-operator/variables.tf b/modules/kuberay-operator/variables.tf index 7b984b4f7..a977c27de 100644 --- a/modules/kuberay-operator/variables.tf +++ b/modules/kuberay-operator/variables.tf @@ -34,15 +34,3 @@ variable "create_namespace" { variable "autopilot_cluster" { type = bool } - -variable "google_service_account" { - type = string - description = "Google service account name" - default = "kuberay-gcp-sa" -} - -variable "create_service_account" { - type = bool - description = "Creates a google service account & k8s service account & configures workload identity" - default = true -}