forked from flexflow/flexflow-train
-
Notifications
You must be signed in to change notification settings - Fork 1
98 lines (90 loc) · 3.79 KB
/
docker-build.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
name: "docker-build"
on:
pull_request:
paths:
- "docker/**"
- "!docker/README.md"
- ".github/workflows/docker-build.yml"
push:
branches:
- "master"
schedule:
# Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
- cron: "0 8 * * 0"
workflow_dispatch:
# Cancel outdated workflows if they are still running
concurrency:
group: docker-build-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
docker-build:
name: Build and Install FlexFlow in a Docker Container
runs-on: ubuntu-20.04
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6","11.7", "11.8"]
exclude:
- gpu_backend: "hip_rocm"
cuda_version: "11.2"
- gpu_backend: "hip_rocm"
cuda_version: "11.3"
- gpu_backend: "hip_rocm"
cuda_version: "11.5"
- gpu_backend: "hip_rocm"
cuda_version: "11.6"
- gpu_backend: "hip_rocm"
cuda_version: "11.7"
- gpu_backend: "hip_rocm"
cuda_version: "11.8"
fail-fast: false
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Free additional space on runner
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Build Docker container
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
cuda_version: ${{ matrix.cuda_version }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then
export FF_CUDA_ARCH=all
else
export FF_CUDA_ARCH=70
fi
./docker/build.sh flexflow
- name: Check availability of Python flexflow.core module
if: ${{ matrix.gpu_backend == 'cuda' }}
env:
cuda_version: ${{ matrix.cuda_version }}
run: docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
- name: Publish Docker environment image (on push to inference)
if: github.repository_owner == 'flexflow'
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
cuda_version: ${{ matrix.cuda_version }}
run: |
if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
else
echo "No need to update Docker containers in ghrc.io registry at this time."
fi
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
needs: docker-build
if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <https://github.com/flexflow/FlexFlow/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" $SLACK_WEBHOOK