Skip to content

Commit

Permalink
Merge branch 'inference' into optimize_attn
Browse files Browse the repository at this point in the history
  • Loading branch information
jiazhihao authored Sep 24, 2023
2 parents 6d1b5e9 + 48cca2b commit 230a9a1
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 100 deletions.
33 changes: 14 additions & 19 deletions .github/workflows/docker-build-skip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,22 @@ concurrency:
cancel-in-progress: true

jobs:
docker-build:
name: Build and Install FlexFlow in a Docker Container
runs-on: ubuntu-20.04
docker-build-rocm:
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ubuntu-latest
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.1"
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'

docker-build-cuda:
name: Build and Install FlexFlow in a Docker Container (CUDA backend)
runs-on: ubuntu-latest
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
210 changes: 132 additions & 78 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ on:
branches:
- "inference"
- "master"
schedule:
# Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
- cron: "0 8 * * 0"
# schedule:
# # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
# - cron: "0 8 * * 0"
workflow_dispatch:

# Cancel outdated workflows if they are still running
Expand All @@ -20,120 +20,174 @@ concurrency:
cancel-in-progress: true

jobs:
docker-build:
name: Build and Install FlexFlow in a Docker Container
runs-on: ubuntu-20.04
oracle-runner-start:
name: Start a self-hosted Oracle machine to build the ROCM Docker images
runs-on: ubuntu-latest
if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Start Oracle Machine
run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID

# 1. On push or workflow_dispatch events targeting the inference branch, the
# docker-build jobs run on the self-hosted Oracle machine to build the ROCm and CUDA 11.8 images.
# 2. A separate job turns the Oracle machine back off when it is no longer needed.
rocm_runner_choice:
name: Determine what runner to use to build the ROCm Docker image(s)
runs-on: ubuntu-latest
outputs:
rocm_runner: ${{ steps.step1.outputs.runner }}
steps:
- name: Determine the runner
id: step1
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
run: |
if [ $deploy_needed == "true" ]; then
echo "::set-output name=runner::[self-hosted, cpu_only]"
else
echo "::set-output name=runner::ubuntu-20.04"
fi
docker-build-rocm:
needs: rocm_runner_choice
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ${{ needs.rocm_runner_choice.outputs.rocm_runner }}
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
gpu_backend_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0", "5.3", "5.4", "5.5", "5.6"]
# The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
exclude:
- gpu_backend: "cuda"
gpu_backend_version: "5.3"
- gpu_backend: "cuda"
gpu_backend_version: "5.4"
- gpu_backend: "cuda"
gpu_backend_version: "5.5"
- gpu_backend: "cuda"
gpu_backend_version: "5.6"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.1"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.2"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.3"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.4"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.5"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.6"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.7"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.8"
- gpu_backend: "hip_rocm"
gpu_backend_version: "12.0"
hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
gpu_backend_version: ${{ matrix.gpu_backend_version }}
# one of the two variables below will be unused
cuda_version: ${{ matrix.gpu_backend_version }}
hip_version: ${{ matrix.gpu_backend_version }}
branch_name: ${{ github.head_ref || github.ref_name }}
timeout-minutes: 480
FF_GPU_BACKEND: "hip_rocm"
hip_version: ${{ matrix.hip_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
.github/workflows/helpers/free_space_on_runner.sh
else
echo "Skipping this step to save time"
fi
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.hip_version == '5.6' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
export FF_HIP_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=70
export FF_HIP_ARCH=gfx1100,gfx1036
./docker/build.sh flexflow
else
echo "Skipping build to save time"
fi
- name: Check availability of flexflow modules in Python
if: ${{ matrix.gpu_backend == 'cuda' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }}
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
if [[ $FF_GPU_BACKEND == "cuda" ]]; then
docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
else
docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
fi
else
echo "Skipping test to save time"
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
docker-build-cuda:
name: Build and Install FlexFlow in a Docker Container (CUDA backend)
runs-on: ubuntu-20.04
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.cuda_version == '11.8' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=86
./docker/build.sh flexflow
fi
- name: Check availability of flexflow modules in Python
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
if: github.repository_owner == 'flexflow'
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }}
run: |
if [[ $deploy_needed == "true" ]]; then
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
else
echo "No need to update Docker containers in ghcr.io registry at this time."
fi
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
oracle-runner-stop:
needs: docker-build-rocm
if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
runs-on: ubuntu-latest
name: Turn off the self-hosted Oracle machine where we built the ROCM Docker images
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Stop Oracle Machine
run: python3 docker/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID

notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
needs: docker-build
needs: [docker-build-cuda, docker-build-rocm]
if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
python-interface-check:
name: Check Python Interface
runs-on: self-hosted
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
inference-tests:
name: Inference Tests
runs-on: self-hosted
runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
Expand Down Expand Up @@ -210,7 +210,7 @@ jobs:

gpu-ci-flexflow:
name: Single Machine, Multiple GPUs Tests
runs-on: self-hosted
runs-on: [self-hosted, gpu]
# skip this time-consuming test for PRs to the inference branch
# if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
defaults:
Expand Down
38 changes: 38 additions & 0 deletions .github/workflows/helpers/oracle_con.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import oci
import argparse
import os

# CI helper: start or stop the self-hosted Oracle Cloud (OCI) build machine.
# Credentials are read from environment variables, which the workflow populates
# from repository secrets (OCI_CLI_USER, OCI_CLI_TENANCY, etc.).
parser = argparse.ArgumentParser(description="Start or stop an OCI VM instance")
# required=True: without it, running the script with neither flag would fall
# through to the stop branch and silently shut the machine down.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--start", action="store_true", help="Start the instance")
group.add_argument("--stop", action="store_true", help="Stop the instance")
parser.add_argument(
    "--instance_id", type=str, required=True, help="OCID of the target instance"
)
args = parser.parse_args()

# Assemble the OCI SDK config directly from the environment instead of a
# ~/.oci/config file, since CI runners have no persistent home directory.
config = {
    "user": os.getenv("OCI_CLI_USER"),
    "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
    "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
    "tenancy": os.getenv("OCI_CLI_TENANCY"),
    "region": os.getenv("OCI_CLI_REGION"),
}
# Fail fast with a clear SDK error if any credential is missing or malformed.
oci.config.validate_config(config)

# ComputeClient exposes lifecycle actions (START/STOP/RESET) on VM instances.
compute = oci.core.ComputeClient(config)

# Exactly one of --start / --stop is guaranteed by the required group above.
if args.start:
    compute.instance_action(args.instance_id, "START")
else:
    compute.instance_action(args.instance_id, "STOP")
3 changes: 3 additions & 0 deletions python/flexflow/serve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def init(
configs_dict = {
"num_gpus": num_gpus,
"memory_per_gpu": memory_per_gpu,
"num_cpus": num_cpus,
"zero_copy_memory_per_node": zero_copy_memory_per_node,
"legion_utility_processors": legion_utility_processors,
"data_parallelism_degree": data_parallelism_degree,
Expand Down Expand Up @@ -174,6 +175,8 @@ def init(
__check_positive_int(configs_dict, param)

# Set default values
if configs_dict.get("num_cpus", None) is None:
configs_dict["num_cpus"] = 4
if configs_dict.get("legion_utility_processors", None) is None:
configs_dict["legion_utility_processors"] = 8
if configs_dict.get("data_parallelism_degree", None) is None:
Expand Down

0 comments on commit 230a9a1

Please sign in to comment.