Skip to content

Commit

Permalink
Refactored the Terraform directory structures. (#1046)
Browse files Browse the repository at this point in the history
* Reorganize Terraform scripts by moving the sub directories.

* Updated the flexible to PSC-based cluster type
  • Loading branch information
kangminxie authored Oct 6, 2023
1 parent 2758580 commit 906be52
Show file tree
Hide file tree
Showing 37 changed files with 1,184 additions and 91 deletions.
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
# Root-level inputs for the TPU node-pool deployment. They are declared without
# type or default here and are passed straight through to the "tpu-gke" module.
variable "project_id" {}
variable "resource_name_prefix" {}
variable "node_pool_prefix" {}
variable "region" {}
# List of TPU node-pool definitions (see the accompanying tfvars for the shape).
variable "tpu_node_pools" {}
variable "maintenance_interval" {}
variable "is_tpu_node_private" {}


# Instantiates the shared TPU node-pool module, forwarding every root variable
# one-to-one.
module "tpu-gke" {
  source = "../../module"

  # Project / location
  project_id = var.project_id
  region     = var.region

  # Naming
  resource_name_prefix = var.resource_name_prefix
  node_pool_prefix     = var.node_pool_prefix

  # TPU node-pool settings
  tpu_node_pools       = var.tpu_node_pools
  is_tpu_node_private  = var.is_tpu_node_private
  maintenance_interval = var.maintenance_interval
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ output "kubernetes_cluster_name" {
description = "GKE Cluster Name"
}

# Re-exports the cluster host reported by the "tpu-gke" module.
output "kubernetes_cluster_host" {
  description = "GKE Cluster Host"
  value       = module.tpu-gke.kubernetes_cluster_host
}

# Re-exports the placement policy names reported by the "tpu-gke" module.
output "placement_policy_names" {
  description = "GKE TPU Placement Policy Names"
  value       = module.tpu-gke.placement_policy_names
}

# Echoes the is_tpu_node_private input back as an output.
output "is_tpu_node_private" {
  description = "whether we want to make TPU node private"
  value       = var.is_tpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Example variable values for the TPU node-pool deployment.

# Project and location
project_id = "project-id"
region     = "us-east5"

# Naming prefixes
resource_name_prefix = "tpu-v5e-test"
node_pool_prefix     = "rp1"

# Node-level settings
is_tpu_node_private  = false
maintenance_interval = "PERIODIC"

# One entry per TPU node pool; list order determines the pool's index suffix.
tpu_node_pools = [
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
    disk_type    = "pd-balanced"
    disk_size_gb = 120
  },
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
    disk_type    = "pd-balanced"
    disk_size_gb = 120
  },
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# GKE cluster
# Looks up the GKE versions offered in var.region whose version string starts
# with "1.27.".
# NOTE(review): nothing in the visible part of this module reads this data
# source - confirm it is still needed.
data "google_container_engine_versions" "gke_version" {
  version_prefix = "1.27."
  location       = var.region
}

# Default google provider, pinned to the target project and region.
provider "google" {
  region  = var.region
  project = var.project_id
}

# Separately Managed Node Pool
# Separately managed TPU node pools: one pool per entry of var.tpu_node_pools,
# attached to the cluster named "<resource_name_prefix>-gke-cluster".
resource "google_container_node_pool" "multihost_tpu" {
  provider = google-beta
  count    = length(var.tpu_node_pools)

  # Identity and placement of the pool.
  project        = var.project_id
  cluster        = "${var.resource_name_prefix}-gke-cluster"
  name           = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}"
  location       = var.region
  node_locations = [var.tpu_node_pools[count.index].zone]

  initial_node_count = var.tpu_node_pools[count.index].node_count

  # Automatic node upgrades are switched off for these pools.
  management {
    auto_upgrade = false
  }

  # Compact placement using the policy named in the pool definition.
  placement_policy {
    type        = "COMPACT"
    policy_name = var.tpu_node_pools[count.index].policy
  }

  network_config {
    enable_private_nodes = var.is_tpu_node_private
  }

  node_config {
    # Machine shape and boot disk come from the per-pool definition.
    machine_type = var.tpu_node_pools[count.index].machine_type
    disk_type    = var.tpu_node_pools[count.index].disk_type
    disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb
    image_type   = "COS_CONTAINERD"

    tags = ["gke-node"]

    labels = {
      env = var.project_id
    }

    metadata = {
      disable-legacy-endpoints = "true"
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
      "https://www.googleapis.com/auth/cloud-platform",
    ]

    host_maintenance_policy {
      maintenance_interval = var.maintenance_interval
    }

    gvnic {
      enabled = true
    }

    gcfs_config {
      enabled = true
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@ output "project_id" {
}

# Name of the GKE cluster the TPU node pools are attached to.
#
# The value is read from the first TPU node pool's `cluster` attribute, because
# this module attaches node pools to an existing cluster instead of declaring a
# google_container_cluster itself. `try` makes the output null (instead of
# failing with an invalid-index error) when var.tpu_node_pools is empty and no
# node pool exists.
output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = try(google_container_node_pool.multihost_tpu[0].cluster, null)
}

# Exposes the `endpoint` attribute of google_container_cluster.tpu_cluster.
# NOTE(review): the neighbouring outputs read google_container_node_pool.multihost_tpu,
# and no google_container_cluster.tpu_cluster resource is visible next to that node
# pool - confirm the cluster is declared in this module, otherwise this reference
# will not resolve.
output "kubernetes_cluster_host" {
value = google_container_cluster.tpu_cluster.endpoint
description = "GKE Cluster Host"
}

# Placement policy name of every TPU node pool, in node-pool index order.
# The splat yields one policy_name per pool; flatten() keeps the result a flat
# list of names.
output "placement_policy_names" {
  description = "GKE TPU Placement Policy Names"
  value = flatten([
    google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name
  ])
}

# Echoes the is_tpu_node_private input back to the caller.
output "is_tpu_node_private" {
  description = "whether we want to make TPU node private"
  value       = var.is_tpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Example variable values for a single TPU node pool.

# Project and location
project_id = "project-id"
region     = "us-east5"

# Naming prefix
resource_name_prefix = "tpu-test"

# Host maintenance setting
maintenance_interval = "AS_NEEDED"

# One entry per TPU node pool.
tpu_node_pools = [
  {
    zone         = "us-east5-b"
    node_count   = 32
    machine_type = "ct5lp-hightpu-4t"
    topology     = "8x16"
    policy       = "sb-compact-rp1"
  },
]
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ variable "resource_name_prefix" {
description = "prefix for all the resouce naming"
}

# Optional prefix inserted into each TPU node pool's name; defaults to an
# empty string. The description previously duplicated the one used for
# resource_name_prefix, which made the two inputs indistinguishable in
# generated docs and prompts.
variable "node_pool_prefix" {
  type        = string
  default     = ""
  description = "prefix for the TPU node pool naming"
}

variable "tpu_node_pools" {
description = "tpu podslice config"
type = list(object({
Expand All @@ -35,10 +40,17 @@ variable "tpu_node_pools" {
machine_type = string,
topology = string,
policy = string,
disk_type = optional(string),
disk_size_gb = optional(number),
}))
}

# Boolean flag controlling whether the TPU nodes are private. The explicit
# bool type makes Terraform reject non-boolean values at plan time; the
# strings "true"/"false" are still converted automatically. Defaults to false.
variable "is_tpu_node_private" {
  type        = bool
  default     = false
  description = "whether we want to make TPU node private"
}

# Host maintenance interval applied to the TPU machines. Defaults to
# "AS_NEEDED". The `default` argument is declared exactly once: HCL rejects a
# repeated argument inside the same block.
variable "maintenance_interval" {
  type        = string
  default     = "AS_NEEDED"
  description = "maintenance interval for TPU machines."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Root-level inputs for the GKE cluster deployment. They are declared without
# type or default here and are passed straight through to the "tpu-gke" module.
variable "project_id" {}
variable "resource_name_prefix" {}
variable "region" {}
# CPU node-pool definition (see the accompanying tfvars for the shape).
variable "cpu_node_pool" {}
variable "authorized_cidr_blocks" {}
variable "is_cpu_node_private" {}


# Instantiates the shared cluster module, forwarding every root variable
# one-to-one.
module "tpu-gke" {
  source = "../../module"

  # Project / location
  project_id = var.project_id
  region     = var.region

  # Naming
  resource_name_prefix = var.resource_name_prefix

  # Control-plane access
  authorized_cidr_blocks = var.authorized_cidr_blocks

  # CPU node pool
  cpu_node_pool       = var.cpu_node_pool
  is_cpu_node_private = var.is_cpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Echoes the region input.
output "region" {
  description = "GCloud Region"
  value       = var.region
}

# Echoes the project_id input.
output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

# Re-exports the cluster name reported by the "tpu-gke" module.
output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = module.tpu-gke.kubernetes_cluster_name
}

# Echoes the authorized_cidr_blocks input.
output "authorized_cidr_blocks" {
  description = "Cluster allowed cidr blocks "
  value       = var.authorized_cidr_blocks
}

# Echoes the is_cpu_node_private input.
output "is_cpu_node_private" {
  description = "whether we want to make CPU node private"
  value       = var.is_cpu_node_private
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Example variable values for the GKE cluster deployment.

# Project and location
project_id = "project-id"
region     = "us-east5"

# Naming prefix
resource_name_prefix = "tpu-v5e-test"

# Control-plane access and node visibility
authorized_cidr_blocks = []
is_cpu_node_private    = false

# CPU node pool: zones it spans plus per-zone sizing bounds.
cpu_node_pool = {
  zone                        = ["us-east5-a", "us-east5-b", "us-east5-c"]
  machine_type                = "n2-standard-8"
  initial_node_count_per_zone = 1
  min_node_count_per_zone     = 1
  max_node_count_per_zone     = 30
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/**
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# GKE cluster
# Looks up the GKE versions offered in var.region whose version string starts
# with "1.27.".
# NOTE(review): no resource in the visible part of this module reads this data
# source - confirm it is still needed.
data "google_container_engine_versions" "gke_version" {
  version_prefix = "1.27."
  location       = var.region
}

# Default google provider, pinned to the target project and region.
provider "google" {
  region  = var.region
  project = var.project_id
}

# VPC
# Dedicated VPC for the cluster, named "<resource_name_prefix>-vpc".
# Automatic subnet creation is disabled; the subnet is declared explicitly.
resource "google_compute_network" "vpc" {
  name = "${var.resource_name_prefix}-vpc"

  # Boolean literal rather than the string "false": the argument is a bool,
  # so this avoids relying on implicit string-to-bool conversion.
  auto_create_subnetworks = false
}

# Subnet
# Regional subnet inside the VPC above, named "<resource_name_prefix>-subnet".
resource "google_compute_subnetwork" "subnet" {
  name    = "${var.resource_name_prefix}-subnet"
  network = google_compute_network.vpc.name
  region  = var.region

  ip_cidr_range = "10.10.0.0/19"
}

# Regional GKE cluster named "<resource_name_prefix>-gke-cluster", placed on the
# VPC and subnet declared in this module.
resource "google_container_cluster" "tpu_cluster" {
  name     = "${var.resource_name_prefix}-gke-cluster"
  location = var.region

  # A cluster cannot be created without a node pool, but only separately
  # managed node pools are wanted. So the smallest possible default node pool
  # is created and removed right away.
  initial_node_count       = 1
  remove_default_node_pool = true

  # Networking
  network                   = google_compute_network.vpc.name
  subnetwork                = google_compute_subnetwork.subnet.name
  networking_mode           = "VPC_NATIVE"
  default_max_pods_per_node = 15

  ip_allocation_policy {
    cluster_ipv4_cidr_block  = "/14"
    services_ipv4_cidr_block = "/20"
  }

  # Observability backends
  logging_service    = "logging.googleapis.com/kubernetes"
  monitoring_service = "monitoring.googleapis.com/kubernetes"

  release_channel {
    channel = "UNSPECIFIED"
  }

  # Control-plane access: one cidr_blocks entry is generated per element of
  # var.authorized_cidr_blocks, labelled with the element's key.
  master_authorized_networks_config {
    gcp_public_cidrs_access_enabled = false

    dynamic "cidr_blocks" {
      for_each = var.authorized_cidr_blocks
      content {
        display_name = "cidr-blocks-group-${cidr_blocks.key}"
        cidr_block   = cidr_blocks.value
      }
    }
  }

  # Must be false while creating a PSC-based GKE cluster.
  # Afterwards, set to true to disable the public endpoint of the cluster master.
  private_cluster_config {
    enable_private_endpoint = false
  }

  timeouts {
    create = "120m"
    update = "120m"
  }
}

# Autoscaled CPU node pool ("cpu-node-pool") attached to the cluster above.
# Zones, machine type and per-zone sizing all come from var.cpu_node_pool.
resource "google_container_node_pool" "cpu_node_pool" {
  provider = google-beta

  # Identity and placement of the pool.
  name           = "cpu-node-pool"
  project        = var.project_id
  cluster        = google_container_cluster.tpu_cluster.name
  location       = var.region
  node_locations = var.cpu_node_pool.zone

  # Sizing
  initial_node_count = var.cpu_node_pool.initial_node_count_per_zone
  max_pods_per_node  = 63

  autoscaling {
    min_node_count = var.cpu_node_pool.min_node_count_per_zone
    max_node_count = var.cpu_node_pool.max_node_count_per_zone
  }

  network_config {
    enable_private_nodes = var.is_cpu_node_private
  }

  node_config {
    machine_type = var.cpu_node_pool.machine_type

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    metadata = {
      disable-legacy-endpoints = "true"
    }

    gcfs_config {
      enabled = true
    }
  }
}
Loading

0 comments on commit 906be52

Please sign in to comment.