Skip to content

Commit

Permalink
Merge branch 'inference' into optimize_attn
Browse files Browse the repository at this point in the history
  • Loading branch information
jiazhihao authored Sep 24, 2023
2 parents 6d1b5e9 + 48cca2b commit 230a9a1
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 100 deletions.
33 changes: 14 additions & 19 deletions .github/workflows/docker-build-skip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,22 @@ concurrency:
cancel-in-progress: true

jobs:
docker-build:
name: Build and Install FlexFlow in a Docker Container
runs-on: ubuntu-20.04
docker-build-rocm:
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ubuntu-latest
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'

docker-build-cuda:
name: Build and Install FlexFlow in a Docker Container (CUDA backend)
runs-on: ubuntu-latest
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
210 changes: 132 additions & 78 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ on:
branches:
- "inference"
- "master"
schedule:
# Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
- cron: "0 8 * * 0"
# schedule:
# # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
# - cron: "0 8 * * 0"
workflow_dispatch:

# Cancel outdated workflows if they are still running
Expand All @@ -20,120 +20,174 @@ concurrency:
cancel-in-progress: true

jobs:
docker-build:
name: Build and Install FlexFlow in a Docker Container
runs-on: ubuntu-20.04
oracle-runner-start:
name: Start a self-hosted Oracle machine to build the ROCM Docker images
runs-on: ubuntu-latest
if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Start Oracle Machine
run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID

# 1. On push or workflow_dispatch events targeting the inference branch, the
# docker-build jobs run on the self-hosted Oracle machine to build the ROCm and CUDA 11.8 images.
# 2. A separate job turns the Oracle machine back off when it is no longer needed.
rocm_runner_choice:
name: Determine what runner to use to build the ROCm Docker image(s)
runs-on: ubuntu-latest
outputs:
rocm_runner: ${{ steps.step1.outputs.runner }}
steps:
- name: Determine the runner
id: step1
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
run: |
if [ $deploy_needed == "true" ]; then
echo "::set-output name=runner::[self-hosted, cpu_only]"
else
echo "::set-output name=runner::ubuntu-20.04"
fi
docker-build-rocm:
needs: rocm_runner_choice
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ${{ needs.rocm_runner_choice.outputs.rocm_runner }}
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
gpu_backend_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0", "5.3", "5.4", "5.5", "5.6"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
exclude:
- gpu_backend: "cuda"
gpu_backend_version: "5.3"
- gpu_backend: "cuda"
gpu_backend_version: "5.4"
- gpu_backend: "cuda"
gpu_backend_version: "5.5"
- gpu_backend: "cuda"
gpu_backend_version: "5.6"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.1"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.2"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.3"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.4"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.5"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.6"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.7"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.8"
- gpu_backend: "hip_rocm"
gpu_backend_version: "12.0"
hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
gpu_backend_version: ${{ matrix.gpu_backend_version }}
# one of the two variables below will be unused
cuda_version: ${{ matrix.gpu_backend_version }}
hip_version: ${{ matrix.gpu_backend_version }}
branch_name: ${{ github.head_ref || github.ref_name }}
timeout-minutes: 480
FF_GPU_BACKEND: "hip_rocm"
hip_version: ${{ matrix.hip_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
.github/workflows/helpers/free_space_on_runner.sh
else
echo "Skipping this step to save time"
fi
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.hip_version == '5.6' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
export FF_HIP_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=70
export FF_HIP_ARCH=gfx1100,gfx1036
./docker/build.sh flexflow
else
echo "Skipping build to save time"
fi
- name: Check availability of flexflow modules in Python
if: ${{ matrix.gpu_backend == 'cuda' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
if [[ $FF_GPU_BACKEND == "cuda" ]]; then
docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
else
docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
fi
else
echo "Skipping test to save time"
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
docker-build-cuda:
name: Build and Install FlexFlow in a Docker Container (CUDA backend)
runs-on: ubuntu-20.04
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.cuda_version == '11.8' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=86
./docker/build.sh flexflow
fi
- name: Check availability of flexflow modules in Python
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
if: github.repository_owner == 'flexflow'
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
run: |
if [[ $deploy_needed == "true" ]]; then
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
else
echo "No need to update Docker containers in ghcr.io registry at this time."
fi
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
oracle-runner-stop:
needs: docker-build-rocm
if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
runs-on: ubuntu-latest
name: Turn off the self-hosted Oracle machine where we built the ROCM Docker images
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Stop Oracle Machine
run: python3 docker/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID

notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
needs: docker-build
needs: [docker-build-cuda, docker-build-rocm]
if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
python-interface-check:
name: Check Python Interface
runs-on: self-hosted
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
inference-tests:
name: Inference Tests
runs-on: self-hosted
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
Expand Down Expand Up @@ -210,7 +210,7 @@ jobs:

gpu-ci-flexflow:
name: Single Machine, Multiple GPUs Tests
runs-on: self-hosted
runs-on: [self-hosted, gpu]
# skip this time-consuming test for PRs to the inference branch
# if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
defaults:
Expand Down
38 changes: 38 additions & 0 deletions .github/workflows/helpers/oracle_con.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import oci
import argparse
import os

# CI helper: start or stop the self-hosted Oracle Cloud (OCI) build machine.
# Credentials are read from environment variables, which the workflow populates
# from repository secrets (OCI_CLI_USER, OCI_CLI_TENANCY, etc.).
parser = argparse.ArgumentParser(description="Start or stop an OCI VM instance")
# required=True: without it, running the script with neither flag would fall
# through to the stop branch and silently shut the machine down.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--start", action="store_true", help="Start the instance")
group.add_argument("--stop", action="store_true", help="Stop the instance")
parser.add_argument(
    "--instance_id", type=str, required=True, help="OCID of the target instance"
)
args = parser.parse_args()

# Assemble the OCI SDK config directly from the environment instead of a
# ~/.oci/config file, since CI runners have no persistent home directory.
config = {
    "user": os.getenv("OCI_CLI_USER"),
    "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
    "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
    "tenancy": os.getenv("OCI_CLI_TENANCY"),
    "region": os.getenv("OCI_CLI_REGION"),
}
# Fail fast with a clear SDK error if any credential is missing or malformed.
oci.config.validate_config(config)

# ComputeClient exposes lifecycle actions (START/STOP/RESET) on VM instances.
compute = oci.core.ComputeClient(config)

# Exactly one of --start / --stop is guaranteed by the required group above.
if args.start:
    compute.instance_action(args.instance_id, "START")
else:
    compute.instance_action(args.instance_id, "STOP")
3 changes: 3 additions & 0 deletions python/flexflow/serve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def init(
configs_dict = {
"num_gpus": num_gpus,
"memory_per_gpu": memory_per_gpu,
"num_cpus": num_cpus,
"zero_copy_memory_per_node": zero_copy_memory_per_node,
"legion_utility_processors": legion_utility_processors,
"data_parallelism_degree": data_parallelism_degree,
Expand Down Expand Up @@ -174,6 +175,8 @@ def init(
__check_positive_int(configs_dict, param)

# Set default values
if configs_dict.get("num_cpus", None) is None:
configs_dict["num_cpus"] = 4
if configs_dict.get("legion_utility_processors", None) is None:
configs_dict["legion_utility_processors"] = 8
if configs_dict.get("data_parallelism_degree", None) is None:
Expand Down

0 comments on commit 230a9a1

Please sign in to comment.