run GPU CI manually #50
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: docker-build

# Events that start this workflow.
on:
  # Pull requests that touch the Docker sources (README excluded) or this
  # workflow definition itself.
  pull_request:
    paths:
      - 'docker/**'
      - '!docker/README.md'
      - '.github/workflows/docker-build.yml'
  # Pushes to the listed branches.
  push:
    branches:
      - 'master'
  schedule:
    # Weekly, every Sunday at 8am UTC (midnight PT / 3am ET), to keep the
    # docker images updated.
    - cron: '0 8 * * 0'
  # Allows the workflow to be started manually.
  workflow_dispatch:

# Cancel outdated workflows if they are still running. Runs are grouped by the
# PR head ref when there is one, otherwise by the unique run id.
concurrency:
  group: docker-build-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  # Builds the FlexFlow Docker image for every matrix entry, runs an import
  # check on the CUDA images, and publishes the images when `deploy_needed`
  # evaluates to true.
  #
  # Each step recomputes two flags in its own `env`:
  #   deploy_needed - push/schedule event AND branch_name == 'inference'
  #   build_needed  - hip_rocm entry, or the cuda entry with cuda_version 11.8
  #
  # NOTE(review): the `push` trigger of this workflow only lists `master`,
  # while deploy_needed requires the branch to be `inference` - confirm that
  # this combination is intended.
  docker-build:
    name: Build and Install FlexFlow in a Docker Container
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        gpu_backend: ["cuda", "hip_rocm"]
        cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
        # The CUDA version doesn't matter when building for hip_rocm, so we
        # just pick one arbitrarily (11.8) to avoid building for hip_rocm once
        # per supported CUDA version.
        exclude:
          - gpu_backend: "hip_rocm"
            cuda_version: "11.1"
          - gpu_backend: "hip_rocm"
            cuda_version: "11.2"
          - gpu_backend: "hip_rocm"
            cuda_version: "11.3"
          - gpu_backend: "hip_rocm"
            cuda_version: "11.5"
          - gpu_backend: "hip_rocm"
            cuda_version: "11.6"
          - gpu_backend: "hip_rocm"
            cuda_version: "11.7"
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
    env:
      FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
      cuda_version: ${{ matrix.cuda_version }}
      # PR head branch when available, otherwise the ref that triggered the run.
      branch_name: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive

      # Only run the clean-up helper when an image will actually be built.
      - name: Free additional space on runner
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
        run: |
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            .github/workflows/helpers/free_space_on_runner.sh
          else
            echo "Skipping this step to save time"
          fi

      - name: Build Docker container
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
        run: |
          # On push to inference, build for all compatible architectures, so that we can publish
          # a pre-built general-purpose image. On all other cases, only build for one architecture
          # to save time.
          if [[ $deploy_needed == "true" ]] ; then
            export FF_CUDA_ARCH=all
            ./docker/build.sh flexflow
          elif [[ $build_needed == "true" ]]; then
            export FF_CUDA_ARCH=70
            ./docker/build.sh flexflow
          else
            echo "Skipping build to save time"
          fi

      # Imports flexflow.core inside the freshly built CUDA image, with
      # CPU_ONLY_TEST=1 and the CUDA stub libcuda symlinked as libcuda.so.1.
      # NOTE(review): $LD_LIBRARY_PATH is inside a double-quoted string, so it
      # is expanded by the runner's shell before the container starts, not by
      # the shell inside the container - confirm that this is intended.
      - name: Check availability of Python flexflow.core module
        if: ${{ matrix.gpu_backend == 'cuda' }}
        env:
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
          build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
        run: |
          if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
            docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
          else
            echo "Skipping test to save time"
          fi

      # Restricted to the `flexflow` owner; the step receives the registry
      # token through its environment.
      - name: Publish Docker environment image (on push to inference)
        if: github.repository_owner == 'flexflow'
        env:
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
        run: |
          if [[ $deploy_needed == "true" ]]; then
            ./docker/publish.sh flexflow-environment
            ./docker/publish.sh flexflow
          else
            echo "No need to update Docker containers in ghcr.io registry at this time."
          fi

  # Posts to Slack when the scheduled (weekly) run of docker-build fails in the
  # `flexflow` organisation's repository.
  notify-slack:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: docker-build
    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          # The webhook URL is quoted so it always reaches curl as a single argument.
          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <https://github.com/flexflow/FlexFlow/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" "$SLACK_WEBHOOK"