Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combining Inference and PEFT Tokens in a Batch #1153

Merged
merged 306 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
306 commits
Select commit Hold shift + click to select a range
d8e92e9
.
jiazhihao Oct 23, 2023
0a512d2
.
jiazhihao Oct 24, 2023
4ee710a
Update the default cublas behavior when CUDA_VERSION is not specified
jiazhihao Oct 24, 2023
2adca3a
Merge branch 'fix_cublas_default' of https://github.com/flexflow/Flex…
jiazhihao Oct 24, 2023
464424e
fix bugs in IncMHA peft_bwd kernel
jiazhihao Oct 24, 2023
82d6e58
resolve merge conflict
jiazhihao Oct 24, 2023
45c1e01
uncomment softmaxbackward
jiazhihao Oct 24, 2023
07636e8
add layernorm to align test
goliaro Oct 24, 2023
28a5e84
add peft test scripts
goliaro Oct 24, 2023
dd94370
fix import
goliaro Oct 24, 2023
3c01328
fix
goliaro Oct 24, 2023
fa56364
add code to convert peft models
goliaro Oct 26, 2023
a484100
add script to download peft for c++, fix bug
goliaro Oct 26, 2023
c83c376
fix
goliaro Oct 26, 2023
aa9f004
add script to fine-tune models
goliaro Oct 27, 2023
4609e9e
implement loading lora configs/weights from file
goliaro Oct 31, 2023
17fa6f3
remove peft_bwd assertion failure in embedding
goliaro Oct 31, 2023
cdc12e6
fix download script
goliaro Oct 31, 2023
eb9e2b8
add peft dependencies in dockerfile
goliaro Oct 31, 2023
3dfa14d
fix softmax backward
goliaro Oct 31, 2023
78523e8
fix bc print indentation
goliaro Nov 1, 2023
bf78ea4
Temporarily Revert "Update the default cublas behavior when CUDA_VERS…
goliaro Nov 2, 2023
b9e7f60
Fix cublas default (#1220)
goliaro Nov 2, 2023
463c757
fix bugs, work on align opt-lora
goliaro Nov 3, 2023
1c231ba
Merge branch 'inference' into peft
goliaro Nov 6, 2023
7c65521
update scripts
goliaro Nov 6, 2023
f4b3f8f
add code to output peft tensors in hf
goliaro Nov 6, 2023
9e5fea9
update, fixes
goliaro Nov 7, 2023
62edfaa
linting
goliaro Nov 7, 2023
ddb5c29
fix printing of tensors for numpy
goliaro Nov 7, 2023
d276496
update save_inference_tensors_to_file
goliaro Nov 8, 2023
bc79d3b
linting
goliaro Nov 8, 2023
8e34632
update
goliaro Nov 8, 2023
b11c5e9
fix issue with save_inference_tensors_to_file
goliaro Nov 8, 2023
fca16cc
fix layer names for save_inference_tensors_to_file
goliaro Nov 8, 2023
9095f2b
fix peft
goliaro Nov 9, 2023
9769604
fix bwd bugs
goliaro Nov 10, 2023
880ede8
linting
goliaro Nov 10, 2023
818375d
fixes
goliaro Nov 10, 2023
2990e20
fix
goliaro Nov 10, 2023
6959e68
fix
goliaro Nov 10, 2023
266368c
fix
goliaro Nov 10, 2023
06775bd
add bc fields for peft training
goliaro Nov 10, 2023
ca879e2
merge conflicts
goliaro Nov 10, 2023
9f60177
linting
goliaro Nov 10, 2023
9442b62
fix
goliaro Nov 10, 2023
11eccb1
remove ptr check
goliaro Nov 10, 2023
9bfc557
fix
goliaro Nov 10, 2023
bcfae08
implement save_operators for bwd
goliaro Nov 12, 2023
d86272c
fix bug
goliaro Nov 13, 2023
0a3258a
implement save tensors for bwd
goliaro Nov 13, 2023
e34c405
.
goliaro Nov 15, 2023
87fbada
bug fix
goliaro Nov 15, 2023
52759bd
fix
goliaro Nov 15, 2023
2a5371d
align linear
goliaro Nov 15, 2023
ed0be61
fix
goliaro Nov 16, 2023
8a0b6ea
bwd kernel updates
goliaro Nov 17, 2023
b0e686d
undo use of CUBLAS_COMPUTE_32F_FAST_16F for now
goliaro Nov 17, 2023
0daf232
only send dataset entry once
goliaro Nov 19, 2023
ec131c7
update peft test scripts
goliaro Nov 20, 2023
0431c73
loss
xinhaoc Nov 20, 2023
371dffd
.
xinhaoc Nov 20, 2023
da690ff
update generate/request api to take both inference and fine-tuning pr…
goliaro Nov 21, 2023
1e5bb72
linting
goliaro Nov 21, 2023
f3ff40b
alignment fixes in lora & linear layer
goliaro Nov 21, 2023
7efd3a7
alignment fix
goliaro Nov 21, 2023
b6fe334
diagonal
xinhaoc Nov 22, 2023
bcf8b19
fix
goliaro Nov 22, 2023
4bfee96
alignment fix ssm
goliaro Nov 22, 2023
efd1976
sigmoid-silu-multi now fully aligned
goliaro Nov 24, 2023
7ae195a
rms norm kernel updates
goliaro Nov 24, 2023
7030814
fix
goliaro Nov 26, 2023
eb3b6ab
in-place residual rms
goliaro Nov 26, 2023
9f26cc1
Merge branch 'inference' into peft
goliaro Nov 27, 2023
a122e30
bug fix and linting
goliaro Nov 28, 2023
53e737b
align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj wi…
goliaro Nov 30, 2023
edc02af
cleanup
goliaro Nov 30, 2023
f00c7e0
finished all alignment fixes in attention backward kernel
goliaro Nov 30, 2023
3955b0b
fix
goliaro Nov 30, 2023
c534638
Update inc_multihead_self_attention.cu
goliaro Dec 3, 2023
fd956c9
Update inc_multihead_self_attention.cu
goliaro Dec 4, 2023
d9b154f
Merge branch 'inference' into peft
goliaro Dec 4, 2023
3a34c88
use grad to store peft in/output (#1241)
xinhaoc Dec 6, 2023
94230d9
format
jiazhihao Dec 6, 2023
b985cc9
enable peft request
jiazhihao Dec 6, 2023
b9c3926
several hacks for performance measurement; some of the changes should…
jiazhihao Dec 6, 2023
4d5c3e0
Update sigmoid_silu_multi.cu
goliaro Dec 16, 2023
7bf863a
RoPE backward
goliaro Dec 18, 2023
960654e
PEFT bug fixes and alignment (#1269)
goliaro Jan 10, 2024
2028900
Fuse bias + relu in OPT (#1271)
goliaro Jan 10, 2024
3bbde56
fix
goliaro Jan 10, 2024
2ebd7f4
fix
goliaro Jan 17, 2024
1b2018b
fix
goliaro Jan 17, 2024
bc61e9d
Peft alignment & debugging tools (#1288)
goliaro Jan 27, 2024
32f0a15
fix legion aliasing error
goliaro Jan 27, 2024
c97f63a
fix warnings
goliaro Jan 27, 2024
3d5a37c
fix
goliaro Jan 27, 2024
571f0d3
fix pipeline parallelism
goliaro Jan 29, 2024
f4a10f3
fix tp issue in combine op
goliaro Jan 29, 2024
ca683f7
fix lora weight loading with tensor parallelism
goliaro Jan 29, 2024
378bdb5
fixes, implement Combine::peft_bwd_task
goliaro Jan 29, 2024
afdae45
fix
goliaro Jan 29, 2024
5660f55
replicate peft bwd
goliaro Jan 29, 2024
a9bacd3
fixes
goliaro Jan 30, 2024
f3a97ff
fix
goliaro Jan 31, 2024
e0a58bb
fix combine and fwd-bwd pass dependencies
goliaro Jan 31, 2024
50fc13d
fix replicate bwd
goliaro Jan 31, 2024
f2c9a05
fix
goliaro Feb 1, 2024
cd68f5d
let user control amount of peft memory
goliaro Feb 3, 2024
64a59d8
only run peft_bwd if peft is enabled
goliaro Feb 3, 2024
32a0716
fix rms norm inference region reqs
goliaro Feb 6, 2024
a37b173
fix in-place fusion (part 1)
goliaro Feb 7, 2024
85f4d40
fix inplace fusion (part 2)
goliaro Feb 7, 2024
bb56a99
fix
goliaro Feb 7, 2024
63f1fce
disable automatic inplace rms norm for now
goliaro Feb 7, 2024
0d3aa7e
fix inf fusion inplace
goliaro Feb 8, 2024
b658061
fix rest input grads for peft without inplace residuals
goliaro Feb 9, 2024
3255fe4
fix
goliaro Feb 9, 2024
ec2002e
fix
goliaro Feb 15, 2024
098e880
fix residual rms
goliaro Feb 16, 2024
5688e16
fix
goliaro Feb 16, 2024
9225e0c
fix
goliaro Feb 16, 2024
e12bff1
enable inf debugging in fusion bwd
goliaro Feb 19, 2024
ed9afb7
hack to silence warning in fused bwd
goliaro Feb 19, 2024
96d0e9b
fix
goliaro Feb 19, 2024
fcbeea0
Merge branch 'inference' into peft
goliaro Feb 19, 2024
2cbc0b7
fix
goliaro Feb 19, 2024
36cb2b3
fix build
goliaro Feb 19, 2024
21b77f1
fix
goliaro Feb 19, 2024
9075d3f
fix
goliaro Feb 19, 2024
0b35b0c
add draft peft test
goliaro Mar 22, 2024
b6ada2f
Peft python interface (#1306)
goliaro Mar 27, 2024
29fcda7
Merge branch 'inference' into peft
goliaro Apr 8, 2024
0ed889a
fix
goliaro Apr 8, 2024
48c431a
update
goliaro Apr 11, 2024
40649ee
fix
goliaro Apr 12, 2024
0580d7e
fix to support prompts larger than max tokens per batch
goliaro Apr 13, 2024
0affe27
fixes to support benchmarking of finetuning throughput
goliaro Apr 14, 2024
d7ebeaf
many upgrades and updates related to finetuning
goliaro Apr 15, 2024
33e873d
add ttft statistics
goliaro Apr 15, 2024
2f92a65
add warmup phase
goliaro Apr 15, 2024
b1e97b1
add benchmarking code
goliaro Apr 16, 2024
e35ebb2
Add scripts for evaluation with Microsoft Azure trace (#1363)
Flechman Apr 17, 2024
f3f6226
Merge branch 'inference' into peft
goliaro Apr 24, 2024
b33f10f
fix
goliaro Apr 25, 2024
97562d6
fix
goliaro May 1, 2024
985c254
add peft tests to ci
goliaro May 1, 2024
33dbd3d
Merge branch 'inference' into peft
goliaro May 1, 2024
f033b4e
shellcheck
goliaro May 8, 2024
1011927
fix
goliaro May 9, 2024
9064c2b
fix python requirements
goliaro May 9, 2024
a125e86
fix
goliaro May 10, 2024
d74fe53
fix
goliaro May 11, 2024
0c6ae09
update ci test
goliaro May 17, 2024
93b6032
update alignment doc
goliaro May 17, 2024
9546239
fix cross entropy loss bug
goliaro May 19, 2024
ff4b703
update alignment test
goliaro May 19, 2024
b613666
update test
goliaro May 20, 2024
dde0b61
add llama peft alignment test to ci
goliaro May 20, 2024
1a31b65
Fix values for unused params in incr_decoding
Flechman May 24, 2024
7e3d111
Add PEFTModelID NO_ID singleton instead of None
Flechman May 24, 2024
079ba59
Fix PEFTModelID::NO_ID reference
Flechman May 25, 2024
f464eb8
reduce logging
goliaro May 25, 2024
8d89acd
fix
goliaro May 26, 2024
33c0fef
fix
goliaro May 29, 2024
6727d3a
Add peft demo
Flechman Jun 11, 2024
6d7c245
Add readme for demo
Flechman Jun 11, 2024
511fd64
fix alignment issue
goliaro Jun 20, 2024
9948b4e
Peft optimizer (#1290)
goliaro Jul 10, 2024
7df0ac5
Merge branch 'inference' into peft
goliaro Jul 11, 2024
2789705
Optimizers python interface (#1441)
goliaro Jul 15, 2024
4d0657a
initialize lora weights where needed
goliaro Jul 16, 2024
70eba7b
Add notebook
Flechman Jul 16, 2024
29a2121
Update demo to use dataset
Flechman Jul 16, 2024
69e5e06
Fix'
Flechman Jul 16, 2024
1c7fd1c
Save weights after end of finetuning (#1446)
goliaro Jul 17, 2024
fc4e3f5
Fully use notebook for demo
Flechman Jul 17, 2024
786c320
Parameterize generation and finetuning configs
Flechman Jul 17, 2024
b3c6242
Comment out inference for now
Flechman Jul 17, 2024
b174a2f
fix bug in lora inference only mode
goliaro Jul 17, 2024
a92daef
fix
goliaro Jul 17, 2024
c515caa
Add finetuning or inference only flags
Flechman Jul 18, 2024
5b7db51
fix
Flechman Jul 18, 2024
bcc9702
fix
goliaro Jul 18, 2024
ab9aa2a
fix
goliaro Jul 19, 2024
09e2471
PEFT model upload (#1450)
goliaro Jul 19, 2024
5a44604
Make demo_class.py executable
Flechman Jul 19, 2024
bac88c6
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
Flechman Jul 19, 2024
55eb7ed
fix
goliaro Jul 19, 2024
ad8308d
add base_model_name_or_path
Flechman Jul 20, 2024
8d7b35c
fix
goliaro Jul 20, 2024
f4aca75
fix
goliaro Jul 20, 2024
2b421e4
support llama-3 tokenizer
goliaro Jul 20, 2024
120bca9
print output tokens when not benchmarking
goliaro Jul 20, 2024
dc36d39
Use Llama3 in demo_class
Flechman Jul 20, 2024
ab547f7
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
Flechman Jul 20, 2024
6b904f6
Use Llama3 in demo
Flechman Jul 20, 2024
242bbe5
fix data loading for llama-3
goliaro Jul 20, 2024
7dfb3d0
Add download models to demo
Flechman Jul 21, 2024
4066467
return/print loss at each finetuning step
goliaro Jul 21, 2024
c1eae6d
fix
goliaro Jul 21, 2024
716bcf1
Adjust demo parameters
Flechman Jul 21, 2024
7c48428
Fix for finetuning
Flechman Jul 21, 2024
6394bff
pass finetuning losses to python interface
goliaro Jul 22, 2024
da6d516
Update demo
Flechman Jul 22, 2024
00d83fb
Fix upload
Flechman Jul 22, 2024
a9d2385
Refactor demo
Flechman Jul 22, 2024
6cd0650
rename demo_class to demo
Flechman Jul 22, 2024
03c2f2e
fix
Flechman Jul 22, 2024
e240537
remove epoch from loss print
Flechman Jul 22, 2024
fc731ec
Finish demo
Flechman Jul 22, 2024
7c371eb
fix test
goliaro Aug 4, 2024
52f4564
rocm fixes
goliaro Aug 4, 2024
556d564
more rocm fixes
goliaro Aug 5, 2024
5604c4d
fix rocm build
goliaro Aug 5, 2024
97a4898
docker fix
goliaro Aug 5, 2024
6936cb6
fix inference test
goliaro Aug 5, 2024
a8d9bc3
fix workflow
goliaro Aug 6, 2024
00a70f3
fix makefile
goliaro Aug 6, 2024
925027f
fix peft test
goliaro Aug 6, 2024
b0af3b8
fix all-reduce issue with lora for TP scenario
goliaro Aug 7, 2024
6d18f7b
fix bwd lm head
goliaro Aug 11, 2024
440ad3d
fixes
goliaro Aug 16, 2024
5cbe1a4
more fixes
goliaro Aug 16, 2024
9ca3687
update
goliaro Aug 16, 2024
d0e98ec
fix alignment up to input ln
goliaro Aug 20, 2024
6ebea46
finished aligning all backward (tp>1)
goliaro Aug 22, 2024
f98999c
align all peft
goliaro Aug 22, 2024
5f73328
Merge branch 'inference' into peft
goliaro Sep 2, 2024
b06ed1a
fix
goliaro Sep 2, 2024
3fe93dc
fix broken link
goliaro Sep 2, 2024
1a2fce3
formatting
goliaro Sep 2, 2024
cf4525f
fix
goliaro Sep 2, 2024
90b2c87
update
goliaro Sep 2, 2024
eae9b12
Revert "update"
goliaro Sep 3, 2024
828f72e
update
goliaro Sep 3, 2024
a8294e8
fix hip build
goliaro Sep 3, 2024
ccb28b1
fix gpu ci
goliaro Sep 3, 2024
ec472c2
fix gpu ci
goliaro Sep 3, 2024
aa1aa7b
update default gpu ci version to 12.0
goliaro Sep 3, 2024
9b2bd47
update ci to 12.0
goliaro Sep 3, 2024
f929cca
fix
goliaro Sep 3, 2024
39b8d49
fix
goliaro Sep 3, 2024
a60618b
update
goliaro Sep 3, 2024
b8be6f5
fix
goliaro Sep 3, 2024
0330272
fix
goliaro Sep 3, 2024
5ab1da5
update
goliaro Sep 3, 2024
08012b0
fix
goliaro Sep 3, 2024
e5785e6
add cleanup
goliaro Sep 3, 2024
c37f363
downgrade to cuda=11.8
goliaro Sep 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,14 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Install CUDA
uses: Jimver/[email protected].11
uses: Jimver/[email protected].16
if: ${{ matrix.gpu_backend == 'cuda' }}
id: cuda-toolkit
with:
cuda: "11.8.0"
cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt'

- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
Expand Down Expand Up @@ -156,11 +157,12 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Install CUDA
uses: Jimver/[email protected].11
uses: Jimver/[email protected].16
id: cuda-toolkit
with:
cuda: "11.8.0"
cuda: "12.1.1"
use-github-cache: "false"
log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt'

- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
Expand All @@ -169,7 +171,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
environment-file: conda/environment.yml
environment-file: conda/flexflow.yml
auto-activate-base: false

- name: Build FlexFlow
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/gpu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,16 @@ jobs:
../config/config.linux
make -j

- name: Run PEFT tests
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib

source ./build/set_python_envs.sh
./tests/peft_test.sh

- name: Run inference tests
env:
CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
Expand Down
23 changes: 20 additions & 3 deletions .github/workflows/helpers/install_cudnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ set -x
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"

ubuntu_version=$(lsb_release -rs)
ubuntu_version=${ubuntu_version//./}

# Install CUDNN
cuda_version=${1:-11.8.0}
cuda_version=${1:-12.1.1}
cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
Expand Down Expand Up @@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then
elif [[ "$cuda_version" == "11.8" ]]; then
CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
elif [[ "$cuda_version" == "12.0" ]]; then
echo "CUDNN support for CUDA version 12.0 not yet added"
elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then
CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
else
echo "CUDNN support for CUDA version above 12.5 not yet added"
exit 1
fi
wget -c -q $CUDNN_LINK
Expand All @@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version"
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib
rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then
wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update -y
rm -f cuda-keyring_1.1-1_all.deb
sudo dpkg -i $CUDNN_TARBALL_NAME
sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/
sudo apt update -y
sudo apt install -y libcudnn8
sudo apt install -y libcudnn8-dev
sudo apt install -y libcudnn8-samples
else
sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
fi
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/helpers/install_nccl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}"
# Add NCCL key ring
ubuntu_version=$(lsb_release -rs)
ubuntu_version=${ubuntu_version//./}
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"
sudo dpkg -i cuda-keyring_1.0-1_all.deb
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update -y
rm -f cuda-keyring_1.0-1_all.deb
rm -f cuda-keyring_1.1-1_all.deb

# Install NCCL
cuda_version=${1:-11.8.0}
cuda_version=${1:-12.1.1}
cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
echo "Installing NCCL for CUDA version: ${cuda_version} ..."

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/multinode-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
Expand Down Expand Up @@ -87,7 +87,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
Expand Down Expand Up @@ -138,7 +138,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pip-install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Install CUDA
uses: Jimver/[email protected].11
uses: Jimver/[email protected].16
id: cuda-toolkit
with:
cuda: "11.8.0"
cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/prebuild-legion.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ jobs:
strategy:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
gpu_backend_version: ["11.8", "5.6"]
gpu_backend_version: ["12.0", "5.6"]
python_version: ["3.11"]
exclude:
- gpu_backend: "cuda"
gpu_backend_version: "5.6"
- gpu_backend: "hip_rocm"
gpu_backend_version: "11.8"
gpu_backend_version: "12.0"
fail-fast: false
steps:
- name: Checkout Git Repository
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,9 @@ gpt_tokenizer
python/flexflow/version.txt

inference_tensors
hf_peft_tensors
lora_training_logs

Untitled-1.ipynb
Untitled-2.ipynb
tests/inference/python_test_configs/*.json
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY)
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
add_subdirectory(inference/spec_infer)
add_subdirectory(inference/incr_decoding)
add_subdirectory(inference/peft)
endif()


Expand Down
7 changes: 7 additions & 0 deletions conda/flexflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,10 @@ dependencies:
- sentencepiece
- einops
- requests
- scipy
- bitsandbytes
- datasets
- accelerate
- loralib
- triton
- peft
2 changes: 1 addition & 1 deletion config/config.inc
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ fi

# set ROCM path
if [ -n "$ROCM_PATH" ]; then
SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}"
SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} -DHIP_ROOT_DIR=${ROCM_PATH}"
fi

ADD_ROCM_TO_PATH=""
Expand Down
9 changes: 4 additions & 5 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
cuda_version_input=${cuda_version}.3
elif [[ "$cuda_version" == @(11.8) ]]; then
cuda_version_input=${cuda_version}.0
elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
# Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
cuda_version=12.2
cuda_version_input=${cuda_version}.2
else
echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.2
cuda_version_input=${cuda_version}.2
fi
echo "Building $image docker image with CUDA $cuda_version"
ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04"
gpu_backend_version="-${cuda_version}"
Expand Down
2 changes: 2 additions & 0 deletions docker/flexflow-environment/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1
RUN conda install pytorch torchvision torchaudio -c pytorch
RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
RUN pip3 install tensorflow notebook
# PEFT-related
RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft

# Install Rust
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
Expand Down
2 changes: 1 addition & 1 deletion docker/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
fi
fi
# Check that CUDA version is supported
if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
  # Keep the list printed below in sync with the pattern in the condition
  # above, so users are told about every version that is actually accepted.
  echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9}"
  exit 1
fi
Expand Down
42 changes: 38 additions & 4 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "legion.h"
#include <cstddef>
#include <cstdlib>
Expand All @@ -36,13 +37,27 @@ using BeamSearchBatchConfigFuture = Legion::Future;
using TreeVerifyBatchConfigFuture = Legion::Future;
using BeamInferenceResultFuture = Legion::Future;

// Set of boolean flags describing optimizer-related work for a request.
// A default-constructed instance has only compute_gradients enabled; the
// other three flags start out false.
// The member order matters: PerRequestInfo's constructor assigns this struct
// positionally ({true, false, false, false}), so do not reorder the fields.
// NOTE(review): the exact meaning of each flag is inferred from its name;
// confirm against the code that consumes them.
struct OptimizerTasks {
bool compute_gradients = true;
bool reset_gradients_to_zero = false;
bool update_weights = false;
bool save_updated_weights = false;
};

// Declaration only; the definition is not in this header.
// Takes `tasks` by non-const reference and three integer step counters.
// NOTE(review): presumably fills in the flags of `tasks` from the training
// progress (completed vs. max steps) and the gradient-accumulation interval;
// verify against the definition before relying on this.
void set_optimizer_tasks(OptimizerTasks &tasks,
int max_training_steps,
int completed_training_steps,
int gradient_accumulation_steps);

class BatchConfig {
public:
using RequestGuid = size_t;
using TokenId = int;
BatchConfig();
int num_active_requests() const;
int num_active_tokens() const;
int num_active_infr_tokens() const;
int num_active_peft_tokens() const;
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_verify_tokens_per_batch();
Expand All @@ -56,26 +71,43 @@ class BatchConfig {
// Maximum possible values for different parameters
// These maximum values are used for copying BatchConfig
// across workers
static int const MAX_NUM_REQUESTS = 64;
static int const MAX_NUM_REQUESTS = 65;
static int const MAX_NUM_TOKENS = 1024;
static int const MAX_SPEC_TREE_TOKEN_NUM = 64;

// Set by update
int num_tokens;

int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0;
// number of tokens in prompt phase, start offset of tokens in inc_decoding
// phase. num_tokens - num_prompt_tokens = num_generation_tokens;
int num_generation_tokens;
int num_generation_tokens = 0;

struct PerRequestInfo {
PerRequestInfo() {
first_token_depth_in_request = 0;
first_token_offset_in_batch = 0;
num_tokens_in_batch = 0;
max_sequence_length = 0;
request_guid = 0;
prompt_phase = false;
batch_config_request_id = -1;
peft_model_id = PEFTModelID::NO_ID;
peft_bwd = false;
optimizer_tasks = {true, false, false, false};
}
int first_token_depth_in_request;
int first_token_offset_in_batch;
int num_tokens_in_batch;
int max_sequence_length;

// request id in batch config:
int batch_config_request_id;
int batch_config_request_id = -1;
bool prompt_phase = false;
RequestGuid request_guid;
// PEFT fields
PEFTModelID peft_model_id;
bool peft_bwd;
OptimizerTasks optimizer_tasks;
};
struct PerTokenInfo {
int abs_depth_in_request;
Expand All @@ -102,6 +134,7 @@ class BatchConfig {
BitMask causalMask[MAX_NUM_REQUESTS];
PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
PerTokenInfo labelsInfo[MAX_NUM_TOKENS];

bool request_completed[MAX_NUM_REQUESTS];
bool request_running[MAX_NUM_REQUESTS];
Expand Down Expand Up @@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig {
// Plain result record: a fixed-size array of token ids plus one scalar loss.
// The array capacity equals BatchConfig::MAX_NUM_TOKENS.
struct InferenceResult {
static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
// NOTE(review): no default initializer, so the value is indeterminate until
// a producer writes it; confirm every producer sets it (presumably only
// meaningful when the batch contains fine-tuning tokens).
float finetuning_loss;
};

class BeamSearchBatchConfig : public BatchConfig {
Expand Down
Loading
Loading