forked from flexflow/flexflow-train
-
Notifications
You must be signed in to change notification settings - Fork 1
175 lines (155 loc) · 7.93 KB
/
docker-build.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Workflow that builds the FlexFlow Docker images.
name: docker-build

on:
  # Pull requests trigger a run only when Docker-related files (or this
  # workflow itself) change; edits to docker/README.md alone are ignored.
  pull_request:
    paths:
      - 'docker/**'
      - '!docker/README.md'
      - '.github/workflows/docker-build.yml'
  push:
    branches: [inference, master]
  # schedule:
  #   # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
  #   - cron: "0 8 * * 0"
  workflow_dispatch:

# Cancel outdated workflows if they are still running.
# `github.head_ref` is only set for pull_request events, so runs for the same
# PR branch share a group; every other event falls back to its unique run id.
concurrency:
  group: docker-build-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  # Starts the EC2 instance used to build the ROCm images (presumably the
  # machine behind the `rocm_builder` self-hosted runner -- confirm).
  # Only runs for push / schedule / manual events on the `inference` branch.
  rocm-builder-start:
    name: Start an AWS instance to build the ROCM Docker images
    runs-on: ubuntu-latest
    if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
    env:
      ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-2
      - name: Start EC2 instance
        run: aws ec2 start-instances --instance-ids "$ROCM_BUILDER_INSTANCE_ID"

  # Build-only check of the ROCm image on a GitHub-hosted runner.
  # The condition is the exact negation of the deploy condition used by the
  # publishing jobs, so this job runs whenever the publish path does not.
  docker-build-rocm:
    name: Build and Install FlexFlow in a Docker Container (ROCm backend)
    runs-on: ubuntu-20.04
    if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }}
    env:
      FF_GPU_BACKEND: "hip_rocm"
      # Quoted so the version stays a string (consistent with the matrix below).
      hip_version: "5.6"
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Free additional space on runner
        run: .github/workflows/helpers/free_space_on_runner.sh
      - name: Build Docker container
        run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow
      - name: Check availability of flexflow modules in Python
        run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

  # Builds and publishes the ROCm images on the self-hosted builder, once per
  # HIP version in the matrix. Waits for `rocm-builder-start`.
  docker-build-and-publish-rocm:
    name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
    needs: rocm-builder-start
    runs-on: [self-hosted, rocm_builder]
    if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
    strategy:
      matrix:
        hip_version: ["5.3", "5.4", "5.5", "5.6"]
      fail-fast: false
    env:
      FF_GPU_BACKEND: "hip_rocm"
      hip_version: ${{ matrix.hip_version }}
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Build Docker container
        # This job only runs for deploy events on `inference`, so it always builds
        # for all compatible architectures in order to publish a pre-built
        # general-purpose image.
        run: FF_HIP_ARCH=all ./docker/build.sh flexflow
      - name: Check availability of flexflow modules in Python
        run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
      - name: Publish Docker environment image (on push to inference)
        env:
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
        run: |
          ./docker/publish.sh flexflow-environment
          ./docker/publish.sh flexflow

  # Builds (and on deploy events publishes) the CUDA images, one matrix entry
  # per CUDA version. Every build/check step repeats the same condition: run for
  # all versions on deploy events, otherwise only for CUDA 12.0. The remaining
  # matrix entries still start but finish with all of their steps skipped.
  docker-build-cuda:
    name: Build and Install FlexFlow in a Docker Container (CUDA backend)
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
      fail-fast: false
    env:
      FF_GPU_BACKEND: "cuda"
      cuda_version: ${{ matrix.cuda_version }}
    steps:
      - name: Checkout Git Repository
        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Free additional space on runner
        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        run: .github/workflows/helpers/free_space_on_runner.sh
      - name: Build Docker container
        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        env:
          # Both values reach the script as the strings "true" / "false".
          deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
          build_needed: ${{ matrix.cuda_version == '12.0' }}
        run: |
          # On push to inference, build for all compatible architectures, so that we can publish
          # a pre-built general-purpose image. On all other cases, only build for one architecture
          # to save time.
          if [[ $deploy_needed == "true" ]] ; then
            export FF_CUDA_ARCH=all
            ./docker/build.sh flexflow
          elif [[ $build_needed == "true" ]]; then
            export FF_CUDA_ARCH=86
            ./docker/build.sh flexflow
          fi
      - name: Check availability of flexflow modules in Python
        if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
        # `\$LD_LIBRARY_PATH` is escaped so that it is expanded by the shell inside
        # the container. Unescaped, the runner's shell would substitute its own
        # value before `docker run`, discarding the container's library path.
        run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:\$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
      - name: Publish Docker environment image (on push to inference)
        # Publishing is additionally restricted to repositories owned by `flexflow`.
        if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
        env:
          FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
        run: |
          ./docker/publish.sh flexflow-environment
          ./docker/publish.sh flexflow

  # Stops the EC2 builder once the ROCm publish job has finished.
  # `always()` makes this run even when that job failed or was cancelled.
  rocm-builder-stop:
    needs: docker-build-and-publish-rocm
    if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
    runs-on: ubuntu-latest
    name: Stop the AWS instance we used to build the ROCM Docker images
    env:
      ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-2
      - name: Stop EC2 instance
        run: aws ec2 stop-instances --instance-ids "$ROCM_BUILDER_INSTANCE_ID"

  # Posts to Slack when a scheduled run fails in a `flexflow`-owned repository.
  # NOTE: the `schedule` trigger is currently commented out in `on:`, so this
  # job cannot fire until that trigger is re-enabled.
  notify-slack:
    name: Notify Slack in case of failure
    runs-on: ubuntu-20.04
    needs: [docker-build-cuda, docker-build-and-publish-rocm]
    if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
    steps:
      - name: Send Slack message
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        # The run link is built from the default GITHUB_SERVER_URL and
        # GITHUB_REPOSITORY variables so it points at the repository that
        # actually executed the workflow.
        run: |
          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" "$SLACK_WEBHOOK"