-
Notifications
You must be signed in to change notification settings - Fork 197
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ability to scale on tgi custom metrics (#263)
- Loading branch information
Showing
12 changed files
with
479 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
31 changes: 31 additions & 0 deletions
31
benchmarks/inference-server/text-generation-inference/autoscaling.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Autoscaling TGI | ||
|
||
## tl;dr | ||
|
||
Recommendation: TODO | ||
|
||
## Autoscaling Options | ||
|
||
### CPU | ||
|
||
CPU scaling is a poor choice for this workload: the TGI workload starts up, | ||
pulls the model weights, and then spends a minute or two of CPU time | ||
crunching some numbers. This causes the HPA to add a replica, which then spends | ||
more CPU time, which causes the HPA to add another replica, and so on. Eventually, | ||
things settle and the HPA scales the replicas back down. This whole process can | ||
take up to an hour. | ||
|
||
### Custom Metrics | ||
|
||
Workload/custom metrics can be viewed in the Metrics Explorer: | ||
https://console.cloud.google.com/monitoring/metrics-explorer | ||
Search for the metric name, e.g. `tgi_batch_current_size`; the full name | ||
should be `prometheus/tgi_batch_current_size/gauge`. | ||
|
||
#### `tgi_batch_current_size` | ||
|
||
TODO | ||
|
||
### External Metrics | ||
|
||
TODO |
31 changes: 31 additions & 0 deletions
31
...e-server/text-generation-inference/custom-metrics-stackdriver-adapter/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Custom Metrics Stackdriver Adapter | ||
|
||
Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml | ||
|
||
## Usage | ||
|
||
To use this module, include it from your main terraform config, i.e.: | ||
|
||
``` | ||
module "custom_metrics_stackdriver_adapter" { | ||
source = "./path/to/custom-metrics-stackdriver-adapter" | ||
} | ||
``` | ||
|
||
For a workload identity enabled cluster, some additional configuration is | ||
needed: | ||
|
||
``` | ||
module "custom_metrics_stackdriver_adapter" { | ||
source = "./path/to/custom-metrics-stackdriver-adapter" | ||
workload_identity = { | ||
enabled = true | ||
project_id = "<PROJECT_ID>" | ||
} | ||
} | ||
``` | ||
|
||
## TODO | ||
|
||
This module should be moved out of the text-generation-inference subdirectory, | ||
as it should be more broadly applicable. |
278 changes: 278 additions & 0 deletions
278
...rks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/main.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,278 @@ | ||
# Namespace that holds the namespaced objects of the adapter (service
# accounts, deployment and service all reference it).
resource "kubernetes_namespace_v1" "custom-metrics" {
  metadata {
    name = "custom-metrics"
  }
}
|
||
# Kubernetes service account for the adapter, created only when workload
# identity is disabled. It carries no annotations.
resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-no-wi" {
  count = var.workload_identity.enabled ? 0 : 1

  metadata {
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    name      = "custom-metrics-stackdriver-adapter"
  }
}
|
||
# Kubernetes service account for the adapter, created only when workload
# identity is enabled. Same name as the non-workload-identity variant, plus
# the annotation that points at the Google service account.
resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-wi" {
  count = var.workload_identity.enabled ? 1 : 0

  metadata {
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    name      = "custom-metrics-stackdriver-adapter"

    annotations = {
      "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email
    }
  }
}
|
||
# Binds the adapter's service account to the "system:auth-delegator"
# ClusterRole (the role itself is not defined in this module).
resource "kubernetes_cluster_role_binding_v1" "custom-metrics-system-auth-delegator" {
  metadata {
    name = "custom-metrics:system:auth-delegator"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    # Only one of the two service account resources has count = 1; pick the
    # one that exists for the current workload identity setting.
    name = var.workload_identity.enabled ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "system:auth-delegator"
  }
}
|
||
# RoleBinding in kube-system that binds the adapter's service account to the
# "extension-apiserver-authentication-reader" Role (not defined in this
# module).
resource "kubernetes_role_binding_v1" "custom-metrics-auth-reader" {
  metadata {
    namespace = "kube-system"
    name      = "custom-metrics-auth-reader"
  }

  subject {
    kind = "ServiceAccount"
    # The subject lives in the adapter's namespace, not in kube-system.
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    # Only one of the two service account resources has count = 1; pick the
    # one that exists for the current workload identity setting.
    name = var.workload_identity.enabled ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = "extension-apiserver-authentication-reader"
  }
}
|
||
# ClusterRole granting read-only access (get/list/watch) to pods, nodes and
# node stats in the core API group.
resource "kubernetes_cluster_role_v1" "custom-metrics-resource-reader" {
  metadata {
    name = "custom-metrics-resource-reader"
  }

  rule {
    # The empty string selects the core API group.
    api_groups = [""]
    verbs      = ["get", "list", "watch"]
    resources  = ["pods", "nodes", "nodes/stats"]
  }
}
|
||
# Binds the adapter's service account to the custom-metrics-resource-reader
# ClusterRole declared in this module.
resource "kubernetes_cluster_role_binding_v1" "custom-metrics-resource-reader" {
  metadata {
    name = "custom-metrics-resource-reader"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    # Only one of the two service account resources has count = 1; pick the
    # one that exists for the current workload identity setting.
    name = var.workload_identity.enabled ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role_v1.custom-metrics-resource-reader.metadata[0].name
  }
}
|
||
# Single-replica Deployment that runs the Custom Metrics Stackdriver Adapter
# container under the adapter's Kubernetes service account.
resource "kubernetes_deployment_v1" "custom-metrics-stackdriver-adapter" {
  metadata {
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    name      = "custom-metrics-stackdriver-adapter"

    labels = {
      k8s-app = "custom-metrics-stackdriver-adapter"
      run     = "custom-metrics-stackdriver-adapter"
    }
  }

  spec {
    replicas = 1

    # The selector labels are a subset of the pod template labels below.
    selector {
      match_labels = {
        k8s-app = "custom-metrics-stackdriver-adapter"
        run     = "custom-metrics-stackdriver-adapter"
      }
    }

    template {
      metadata {
        labels = {
          "kubernetes.io/cluster-service" = "true"
          k8s-app                         = "custom-metrics-stackdriver-adapter"
          run                             = "custom-metrics-stackdriver-adapter"
        }
      }

      spec {
        # Only one of the two service account resources has count = 1; pick
        # the one that exists for the current workload identity setting.
        service_account_name = var.workload_identity.enabled ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name

        container {
          name              = "pod-custom-metrics-stackdriver-adapter"
          image             = "gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0"
          image_pull_policy = "Always"
          command = [
            "/adapter",
            "--use-new-resource-model=true",
            "--fallback-for-container-metrics=true",
          ]

          # Requests and limits are set to the same values.
          resources {
            requests = {
              cpu    = "250m"
              memory = "200Mi"
            }
            limits = {
              cpu    = "250m"
              memory = "200Mi"
            }
          }
        }
      }
    }
  }
}
|
||
# ClusterIP Service in front of the adapter pods; the APIService objects in
# this module reference it by name and namespace.
resource "kubernetes_service_v1" "custom-metrics-stackdriver-adapter" {
  metadata {
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    name      = "custom-metrics-stackdriver-adapter"

    labels = {
      "kubernetes.io/cluster-service" = "true"
      "kubernetes.io/name"            = "Adapter"
      k8s-app                         = "custom-metrics-stackdriver-adapter"
      run                             = "custom-metrics-stackdriver-adapter"
    }
  }

  spec {
    type = "ClusterIP"

    # Same label pair as the Deployment's selector.
    selector = {
      k8s-app = "custom-metrics-stackdriver-adapter"
      run     = "custom-metrics-stackdriver-adapter"
    }

    port {
      protocol    = "TCP"
      port        = 443
      target_port = 443
    }
  }
}
|
||
# Registers custom.metrics.k8s.io/v1beta1 and routes it to the adapter
# Service.
resource "kubernetes_api_service_v1" "v1beta1-custom-metrics-k8s-io" {
  metadata {
    name = "v1beta1.custom.metrics.k8s.io"
  }

  spec {
    group   = "custom.metrics.k8s.io"
    version = "v1beta1"

    group_priority_minimum = 100
    version_priority       = 100

    # NOTE(review): TLS verification of the backing service is disabled here.
    insecure_skip_tls_verify = true

    service {
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
    }
  }
}
|
||
# Registers custom.metrics.k8s.io/v1beta2 and routes it to the adapter
# Service. Its version_priority (200) is higher than the v1beta1
# registration's (100).
resource "kubernetes_api_service_v1" "v1beta2-custom-metrics-k8s-io" {
  metadata {
    name = "v1beta2.custom.metrics.k8s.io"
  }

  spec {
    group   = "custom.metrics.k8s.io"
    version = "v1beta2"

    group_priority_minimum = 100
    version_priority       = 200

    # NOTE(review): TLS verification of the backing service is disabled here.
    insecure_skip_tls_verify = true

    service {
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
    }
  }
}
|
||
# Registers external.metrics.k8s.io/v1beta1 and routes it to the adapter
# Service.
resource "kubernetes_api_service_v1" "v1beta1-external-metrics-k8s-io" {
  metadata {
    name = "v1beta1.external.metrics.k8s.io"
  }

  spec {
    group   = "external.metrics.k8s.io"
    version = "v1beta1"

    group_priority_minimum = 100
    version_priority       = 100

    # NOTE(review): TLS verification of the backing service is disabled here.
    insecure_skip_tls_verify = true

    service {
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
    }
  }
}
|
||
# Binds the kube-system/horizontal-pod-autoscaler service account to the
# "external-metrics-reader" ClusterRole.
# NOTE(review): no ClusterRole named "external-metrics-reader" is declared in
# the visible part of this module; presumably it already exists in the
# cluster. Confirm before relying on it.
resource "kubernetes_cluster_role_binding_v1" "external-metrics-reader" {
  metadata {
    name = "external-metrics-reader"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = "kube-system"
    name      = "horizontal-pod-autoscaler"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "external-metrics-reader"
  }
}
|
||
|
||
# If workload identity is enabled, extra steps are required. We need to: | ||
# - create a Google service account for the custom-metrics-stackdriver-adapter (cmsa) | ||
# - grant it the roles/monitoring.viewer IAM role on the project | ||
# - grant the adapter's Kubernetes service account roles/iam.workloadIdentityUser on it | ||
# - annotate the adapter's Kubernetes service account (done above) | ||
|
||
# Google service account for the adapter; created only when workload identity
# is enabled.
resource "google_service_account" "cmsa-sa" {
  count = var.workload_identity.enabled ? 1 : 0

  project    = var.workload_identity.project_id
  account_id = "cmsa-sa"
}
|
||
# Equivalent to:
#   gcloud projects add-iam-policy-binding PROJECT_ID \
#     --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \
#     --role=roles/monitoring.viewer
#
# google_project_iam_member is used rather than google_project_iam_binding:
# the binding resource is authoritative for the role and would remove every
# other member of roles/monitoring.viewer in the project, whereas the gcloud
# command above only adds one member.
resource "google_project_iam_member" "cmsa-project-binding" {
  count = var.workload_identity.enabled ? 1 : 0

  project = var.workload_identity.project_id
  role    = "roles/monitoring.viewer"
  # Use the service account's exported email instead of rebuilding it from
  # account_id and project_id.
  member = "serviceAccount:${google_service_account.cmsa-sa[0].email}"
}
|
||
# Equivalent to:
#   gcloud iam service-accounts add-iam-policy-binding \
#     --role roles/iam.workloadIdentityUser \
#     --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \
#     cmsa-sa@PROJECT_ID.iam.gserviceaccount.com
#
# The bracketed part of the member is NAMESPACE/KSA_NAME and matches the
# namespace and service account names declared in this module.
resource "google_service_account_iam_member" "cmsa-bind-to-gsa" {
  count = var.workload_identity.enabled ? 1 : 0

  service_account_id = google_service_account.cmsa-sa[0].name
  member             = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]"
  role               = "roles/iam.workloadIdentityUser"
}
16 changes: 16 additions & 0 deletions
16
...nference-server/text-generation-inference/custom-metrics-stackdriver-adapter/variables.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Workload identity settings for the adapter.
#   enabled    - whether the cluster uses workload identity.
#   project_id - project in which the Google service account and IAM grants
#                are created; required when enabled is true.
variable "workload_identity" {
  description = "Workload identity settings. When enabled is true, project_id must be set."

  type = object({
    enabled    = bool
    project_id = optional(string)
  })

  default = {
    enabled = false
  }

  validation {
    # project_id is only mandatory when workload identity is turned on.
    condition = (
      !var.workload_identity.enabled
      || var.workload_identity.project_id != null
    )
    error_message = "A project_id must be specified if workload_identity.enabled is set."
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
...inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# HorizontalPodAutoscaler for the "tgi" Deployment, driven by a per-pod
# custom metric. Placeholders are filled in by the template renderer.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tgi
  namespace: ${namespace}
spec:
  # Workload whose replica count is managed.
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tgi
  minReplicas: ${hpa_min_replicas}
  maxReplicas: ${hpa_max_replicas}
  metrics:
    - type: Pods
      pods:
        metric:
          # Metric name format: prometheus.googleapis.com|METRIC_NAME|gauge
          name: prometheus.googleapis.com|${custom_metric_name}|gauge
        target:
          # The target is compared against the metric averaged across pods.
          type: AverageValue
          averageValue: ${hpa_averagevalue_target}
Oops, something went wrong.