Skip to content

Commit

Permalink
remove T4, A100, TPU profile options for jupyterhub
Browse files Browse the repository at this point in the history
  • Loading branch information
chiayi committed Apr 1, 2024
1 parent 0c7c34e commit 604e310
Showing 1 changed file with 144 additions and 138 deletions.
282 changes: 144 additions & 138 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,143 +46,143 @@ steps:
waitFor: ['validate platform']

# Stand up the GKE cluster used to test ray and jupyterhub.
# Starts once the four validate steps listed in waitFor are done. After a
# successful apply it writes "pass" to /workspace/gke_cluster_result.txt;
# allowFailure keeps the build going if the apply fails.
- id: 'create gke cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  dir: 'infrastructure/'
  waitFor:
    - 'validate platform'
    - 'validate ray'
    - 'validate jupyterhub'
    - 'validate rag'
  allowFailure: true
  env:
    - 'KUBE_LOAD_CONFIG_FILE=false'
  entrypoint: 'sh'
  args:
    - '-c'
    - |
      set -e
      terraform apply \
      -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
      -var=project_id=$PROJECT_ID \
      -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
      -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
      -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
      -var=cluster_location=$_REGION \
      -auto-approve -no-color -lock=false
      echo "pass" > /workspace/gke_cluster_result.txt

# Deploy the Ray workload onto the test cluster with Terraform, wait for the
# namespace's pods to become Ready, then submit a one-line Ray job through a
# local port-forward to the head service.
# Marker files (each written only if the preceding commands succeeded):
#   /workspace/user_result.txt - terraform apply finished
#   /workspace/ray_result.txt  - ray job submit finished
- id: 'test ray cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  waitFor:
    - 'create gke cluster'
  allowFailure: true
  entrypoint: 'sh'
  args:
    - '-c'
    - |
      set -e
      # Get kube config
      gcloud container clusters get-credentials \
      ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      --location $_REGION \
      --project $PROJECT_ID
      cd /workspace/applications/ray/
      terraform apply \
      -var-file=workloads.tfvars \
      -var=project_id=$PROJECT_ID \
      -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      -var=cluster_location=$_REGION \
      -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
      -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
      -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
      -auto-approve -no-color -lock=false
      echo "pass" > /workspace/user_result.txt
      # Make sure pods are running
      kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
      kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8265:8265 &
      # Wait port-forwarding to take its place
      sleep 5s
      ray job submit \
      --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
      echo "pass" > /workspace/ray_result.txt

# Tear down the Ray workload created by 'test ray cluster'. The destroy is
# given the same -var-file and -var values as the apply in that step.
# allowFailure keeps a failed destroy from failing the build.
- id: 'cleanup ray cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  waitFor:
    - 'test ray cluster'
  allowFailure: true
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      set -e
      cd /workspace/applications/ray/
      terraform destroy \
      -var-file=workloads.tfvars \
      -var=project_id=$PROJECT_ID \
      -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      -var=cluster_location=$_REGION \
      -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
      -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
      -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
      -auto-approve -no-color -lock=false

# Deploy JupyterHub onto the test cluster with Terraform, then smoke-test the
# hub through a local port-forward to the proxy-public service.
# No waitFor is declared; per the Cloud Build schema the step then waits for
# all previously defined steps to finish before it starts.
# Marker files (each written only if the preceding commands succeeded):
#   /workspace/jupyterhub_tf_result.txt   - terraform apply finished
#   /workspace/jupyterhub_test_result.txt - test_hub.py finished
- id: 'test jupyterhub'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      set -e

      # Fetch kube credentials in this step rather than depending on a
      # kubeconfig written by an earlier step, so the kubectl calls below
      # keep working when 'test ray cluster' is disabled or fails early.
      gcloud container clusters get-credentials \
        ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        --location $_REGION \
        --project $PROJECT_ID

      # Run the config helper with the autopilot flag before applying
      # (see change_jupyter_config.py for what it rewrites).
      cd /workspace/modules/jupyter/tests
      python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

      cd /workspace/applications/jupyter
      terraform apply \
        -var-file=workloads-without-iap.example.tfvars \
        -var=project_id=$PROJECT_ID \
        -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
        -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
        -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
        -auto-approve -no-color -lock=false
      echo "pass" > /workspace/jupyterhub_tf_result.txt

      # Block until every pod in the namespace is Ready (5 minute cap).
      kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
      kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID
      kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
      # Give the background port-forward a moment to start listening.
      sleep 5s

      cd /workspace/modules/jupyter/tests
      python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
      echo "pass" > /workspace/jupyterhub_test_result.txt
  allowFailure: true

# Tear down the JupyterHub workload created by 'test jupyterhub'. The destroy
# is given the same -var-file and -var values as the apply in that step.
# allowFailure keeps a failed destroy from failing the build.
- id: 'cleanup jupyterhub'
  name: 'gcr.io/$PROJECT_ID/terraform'
  waitFor:
    - 'test jupyterhub'
  allowFailure: true
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      set -e
      cd /workspace/applications/jupyter/
      terraform destroy \
      -var-file=workloads-without-iap.example.tfvars \
      -var=project_id=$PROJECT_ID \
      -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
      -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
      -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
      -auto-approve -no-color -lock=false
# - id: 'create gke cluster'
# name: 'gcr.io/$PROJECT_ID/terraform'
# env:
# - "KUBE_LOAD_CONFIG_FILE=false"
# entrypoint: 'sh'
# args:
# - '-c'
# - |
# set -e

# terraform apply \
# -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
# -var=project_id=$PROJECT_ID \
# -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
# -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
# -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
# -var=cluster_location=$_REGION \
# -auto-approve -no-color -lock=false
# echo "pass" > /workspace/gke_cluster_result.txt
# dir: 'infrastructure/'
# allowFailure: true
# waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag]

# - id: 'test ray cluster'
# name: 'gcr.io/$PROJECT_ID/terraform'
# entrypoint: 'sh'
# args:
# - '-c'
# - |
# set -e

# # Get kube config
# gcloud container clusters get-credentials \
# ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# --location $_REGION \
# --project $PROJECT_ID

# cd /workspace/applications/ray/
# terraform apply \
# -var-file=workloads.tfvars \
# -var=project_id=$PROJECT_ID \
# -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# -var=cluster_location=$_REGION \
# -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
# -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
# -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
# -auto-approve -no-color -lock=false
# echo "pass" > /workspace/user_result.txt

# # Make sure pods are running
# kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
# kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8265:8265 &
# # Wait for port-forwarding to be established
# sleep 5s

# ray job submit \
# --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
# echo "pass" > /workspace/ray_result.txt
# allowFailure: true
# waitFor: ['create gke cluster']

# - id: 'cleanup ray cluster'
# name: 'gcr.io/$PROJECT_ID/terraform'
# entrypoint: 'bash'
# args:
# - '-c'
# - |
# set -e

# cd /workspace/applications/ray/
# terraform destroy \
# -var-file=workloads.tfvars \
# -var=project_id=$PROJECT_ID \
# -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# -var=cluster_location=$_REGION \
# -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
# -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
# -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
# -auto-approve -no-color -lock=false

# allowFailure: true
# waitFor: ['test ray cluster']

# - id: 'test jupyterhub'
# name: 'gcr.io/$PROJECT_ID/terraform'
# entrypoint: 'bash'
# args:
# - '-c'
# - |
# set -e

# cd /workspace/modules/jupyter/tests
# python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

# cd /workspace/applications/jupyter
# terraform apply \
# -var-file=workloads-without-iap.example.tfvars \
# -var=project_id=$PROJECT_ID \
# -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
# -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
# -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
# -auto-approve -no-color -lock=false
# echo "pass" > /workspace/jupyterhub_tf_result.txt

# kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
# kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID
# kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
# # Wait for port-forwarding to be established
# sleep 5s

# cd /workspace/modules/jupyter/tests
# python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
# echo "pass" > /workspace/jupyterhub_test_result.txt
# allowFailure: true

# - id: 'cleanup jupyterhub'
# name: 'gcr.io/$PROJECT_ID/terraform'
# entrypoint: 'bash'
# args:
# - '-c'
# - |
# set -e

# cd /workspace/applications/jupyter/
# terraform destroy \
# -var-file=workloads-without-iap.example.tfvars \
# -var=project_id=$PROJECT_ID \
# -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
# -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
# -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
# -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
# -auto-approve -no-color -lock=false

# allowFailure: true
# waitFor: ['test jupyterhub']

- id: 'test rag'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand Down Expand Up @@ -249,8 +249,14 @@ steps:
cd /workspace/applications/rag/tests
python3 test_frontend.py "127.0.0.1:8081"
echo "pass" > /workspace/rag_frontend_result.txt
sleep 5s
cd /workspace/
find . -type f -name "*.ipynb" > notebook_file_list.txt
while IFS= read -r line; do gsutil cp $line gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/notebooks/ ; done < notebook_file_list.txt
ray job submit --working-dir . --address=http://127.0.0.1:8265 -- jupyter nbconvert --to notebook --execute /notebooks/rag-kaggle-ray-sql-latest.ipynb
allowFailure: true
waitFor: ['cleanup jupyterhub', 'cleanup ray cluster']
# waitFor: ['cleanup jupyterhub', 'cleanup ray cluster']

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand Down

0 comments on commit 604e310

Please sign in to comment.