From e86becd494432c2b6ceff6ecf544a6cbfa009453 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 2 Dec 2024 17:25:01 -0800 Subject: [PATCH] Change queue name to use the new CI cluster (#53) * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update test-template-aws.j2 * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update test-template-aws.j2 * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update test-template-aws.j2 * Update test-template-aws.j2 * Update test-template-aws.j2 * Update test-template-aws.j2 * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update test-template-aws.j2 * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh * Update ci_aws_bootstrap.sh --- scripts/ci_aws_bootstrap.sh | 5 +++-- scripts/test-template-aws.j2 | 26 ++++++++++++++------------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index 27223db..01f583b 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -25,7 +25,7 @@ upload_pipeline() { exit 0 fi if [ ! -e ".buildkite/test-template.j2" ]; then - curl -o .buildkite/test-template.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-aws.j2 + curl -o .buildkite/test-template.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-aws.j2?$(date +%s) fi if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then python -m pip install click pydantic @@ -37,7 +37,8 @@ upload_pipeline() { echo "List file diff: $LIST_FILE_DIFF" echo "Run all: $RUN_ALL" echo "Nightly: $NIGHTLY" - minijinja-cli test-template.j2 test-pipeline.yaml -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" -D nightly="$NIGHTLY" > pipeline.yml + minijinja-cli test-template.j2 test-pipeline.yaml -D branch="$BUILDKITE_BRANCH" -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" -D nightly="$NIGHTLY" > pipeline.yml + cat pipeline.yml buildkite-agent pipeline upload pipeline.yml exit 0 } diff --git a/scripts/test-template-aws.j2 b/scripts/test-template-aws.j2 index f26a0c1..25632c2 100644 --- a/scripts/test-template-aws.j2 +++ b/scripts/test-template-aws.j2 @@ -8,7 +8,11 @@ steps: - label: ":docker: build image" key: image-build agents: - queue: cpu_queue + {% if branch == "main" %} + queue: cpu_queue_postmerge + {% else %} + queue: cpu_queue_premerge + {% endif %} commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - | @@ -69,11 +73,11 @@ steps: {% endif %} agents: {% if step.label == "Documentation Build" %} - queue: small_cpu_queue + queue: small_cpu_queue_premerge {% elif step.no_gpu %} - queue: cpu_queue + queue: cpu_queue_premerge {% elif step.gpu == "a100" %} - queue: a100-queue + queue: a100_queue {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -111,7 +115,7 @@ steps: - VLLM_USAGE_SOURCE=ci-test - HF_HOME={{ hf_home }} - HF_TOKEN - {% if "$BUILDKITE_BRANCH" == "main" %} + {% if branch == "main" %} - BUILDKITE_ANALYTICS_TOKEN {% endif %} {% if step.label == "Speculative decoding tests" %} @@ -126,10 +130,8 @@ steps: priorityClassName: ci containers: - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + command: + - bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}' resources: limits: nvidia.com/gpu: {{ step.num_gpus or 1 }} @@ -196,7 +198,7 @@ steps: depends_on: - "amd-build" agents: - queue: amd + queue: amd_gpu command: bash .buildkite/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: DOCKER_BUILDKIT: "1" @@ -261,13 +263,13 @@ steps: depends_on: block-ibm-test soft_fail: true agents: - queue: ppc64le + queue: ibm-ppc64le command: bash .buildkite/run-cpu-test-ppc64le.sh - label: "TPU Test" depends_on: ~ agents: - queue: tpu + queue: tpu_queue commands: - if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi - yes | docker system prune -a