Update LLAMA tokenizer (#1524)
* fix tokenizer conversion

* update

* update

* update

* fix

* fix

* lint

* simplify api

* fix

* fix

* fix

* update to 12.1 (#1512)

* fix deadlock?

* remove barrier where not strictly needed

---------

Co-authored-by: zhihao <email>
goliaro authored Oct 19, 2024
1 parent dbd4cf1 commit 2bfa56c
Showing 28 changed files with 378 additions and 155 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/gpu-ci.yml
@@ -56,7 +56,7 @@ jobs:
CONDA: "3"
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Keep alive
@@ -75,7 +75,7 @@ jobs:
CONDA: "3"
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -151,7 +151,7 @@ jobs:
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -239,7 +239,7 @@ jobs:
CONDA: "3"
needs: inference-tests
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
11 changes: 6 additions & 5 deletions cmake/nccl.cmake
@@ -36,11 +36,12 @@ if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
-if(NCCL_VERSION VERSION_LESS 2.23)
-set(NCCL_OLD TRUE)
-else()
-set(NCCL_OLD FALSE)
-endif()
+set(NCCL_OLD FALSE)
+# if(NCCL_VERSION VERSION_LESS 2.23)
+# set(NCCL_OLD TRUE)
+# else()
+# set(NCCL_OLD FALSE)
+# endif()
message(STATUS "Found NCCL version: ${NCCL_VERSION}")
else()
message(WARNING "NCCL header not found, unable to determine version")
24 changes: 12 additions & 12 deletions docker/flexflow-environment/Dockerfile
@@ -55,18 +55,18 @@ ENV CUDA_DIR /usr/local/cuda
ARG FF_GPU_BACKEND "cuda"

# Update NCCL if FF_GPU_BACKEND is cuda
-RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
-ubuntu_version=$(lsb_release -rs); \
-ubuntu_version=${ubuntu_version//./}; \
-wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
-DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
-rm -f cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
-else \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
-fi'
+# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
+# ubuntu_version=$(lsb_release -rs); \
+# ubuntu_version=${ubuntu_version//./}; \
+# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
+# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
+# rm -f cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
+# else \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
+# fi'

# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm
# Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
11 changes: 8 additions & 3 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -8,7 +8,8 @@
#include "flexflow/ops/lora_linear.h"

namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
struct LoraLinearWeight {
// weights
void *w0_ptr, *w1_ptr;
Expand Down Expand Up @@ -46,7 +47,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m,
BatchConfig const *bc,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
-void peft_bwd_kernel_wrapper(LoraLinearMeta *m,
+void peft_bwd_kernel_wrapper(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
BatchConfig const *bc,
GenericTensorAccessorW const &input_grad,
GenericTensorAccessorR const &output_grad);
@@ -63,7 +66,9 @@ void inference_kernel(LoraLinearMeta *m,
int out_dim,
ffStream_t stream);
template <typename DT>
-void peft_bwd_kernel(LoraLinearMeta *m,
+void peft_bwd_kernel(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
BatchConfig const *bc,
DT *input_grad_ptr,
DT const *output_grad_ptr,
11 changes: 8 additions & 3 deletions include/flexflow/optimizer.h
@@ -20,7 +20,8 @@
#include "legion.h"

namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
class FFModel;
class OpMeta;

@@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
-static void nccl_update_task_gpu(SGDOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+SGDOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
@@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
-static void nccl_update_task_gpu(AdamOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+AdamOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
3 changes: 2 additions & 1 deletion include/flexflow/request_manager.h
@@ -68,7 +68,7 @@ struct Request {
BatchConfig::RequestGuid guid;
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_length = -1;
-int max_new_tokens = 128;
+int max_new_tokens = -1;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
@@ -302,6 +302,7 @@ class RequestManager {
ModelType model_type;
int bos_token_id;
int eos_token_id;
+bool old_llama_tokenizer = false;
std::string output_filepath;
std::queue<Request> pending_infr_request_queue;
std::queue<Request> pending_peft_request_queue;
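
Note on the max_new_tokens change above: the per-request default drops from 128 to -1, which presumably means "not set", so a request no longer gets an implicit 128-new-token cap and callers are expected to state a generation budget explicitly, as the updated examples later in this commit do. A minimal sketch of the user-facing effect through the Python serving API (illustration only, not part of the diff; it assumes an llm object built and started as in the inference scripts below):

# Sketch: the generation budget is now stated explicitly per call.
# Assumes `llm` is a flexflow.serve LLM with start_server() already called.
result = llm.generate("Three tips for staying healthy are: ", max_length=128)
# The C++ path sets the equivalent field on the request directly, mirroring
# inference_req.max_new_tokens = 128 in inference/peft/peft.cc below.
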
2 changes: 1 addition & 1 deletion inference/peft/peft.cc
@@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
-inference_req.max_length = 128;
+inference_req.max_new_tokens = 128;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
3 changes: 1 addition & 2 deletions inference/python/ff_peft.py
@@ -162,7 +162,7 @@ def main():
ff.Request(
ff.RequestType.REQ_INFERENCE,
prompt=prompt,
-max_sequence_length=128,
+max_new_tokens=128,
peft_model_id=llm.get_ff_peft_id(lora_inference_config),
)
for prompt in prompts
@@ -172,7 +172,6 @@
if len(configs.finetuning_dataset) > 0:
finetuning_request = ff.Request(
ff.RequestType.REQ_FINETUNING,
-max_sequence_length=128,
peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
dataset_filepath=configs.finetuning_dataset,
max_training_steps=2,
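
Restating the ff_peft.py change above as a standalone sketch: inference requests now bound generation with max_new_tokens instead of max_sequence_length, and the finetuning request drops the length argument entirely. Names such as llm, prompts, configs and the two LoRA configs are taken from the surrounding script; everything else is illustrative rather than prescriptive.

import flexflow.serve as ff

# Inference: cap the number of newly generated tokens per prompt.
inference_requests = [
    ff.Request(
        ff.RequestType.REQ_INFERENCE,
        prompt=prompt,
        max_new_tokens=128,  # was max_sequence_length=128 before this commit
        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
    )
    for prompt in prompts
]

# Finetuning: driven by the dataset and training-step count, no length cap.
finetuning_request = ff.Request(
    ff.RequestType.REQ_FINETUNING,
    peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
    dataset_filepath=configs.finetuning_dataset,
    max_training_steps=2,
)
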
17 changes: 10 additions & 7 deletions inference/python/incr_decoding.py
@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
"full_precision": False,
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
max_seq_length=256,
max_tokens_per_batch=64,
)

llm.start_server()

if len(configs.prompt) > 0:
prompts = [s for s in json.load(open(configs.prompt))]
if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@
if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)

+result = llm.generate(
+"Three tips for staying healthy are: ", max_length=configs.max_length
+)

llm.stop_server()


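
The max_length plumbing above follows one pattern in both driver scripts: a max_length entry is added to the default config, and generate() only receives a max_length argument when the config dictionary actually contains one, so the library default applies otherwise. A condensed, hedged sketch of that pattern (configs_dict, configs, llm and the json import all come from the script above):

# Forward max_length only when it is present in the config; otherwise let
# llm.generate() fall back to its own default.
gen_kwargs = {}
if "max_length" in configs_dict:
    gen_kwargs["max_length"] = configs.max_length

if len(configs.prompt) > 0:
    prompts = [s for s in json.load(open(configs.prompt))]
    results = llm.generate(prompts, **gen_kwargs)
else:
    result = llm.generate("Three tips for staying healthy are: ", **gen_kwargs)
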
24 changes: 17 additions & 7 deletions inference/python/spec_infer.py
@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -81,6 +81,7 @@
],
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
max_tokens_per_batch=64,
ssms=ssms,
)

llm.start_server()

if len(configs.prompt) > 0:
prompts = [s for s in json.load(open(configs.prompt))]
-results = llm.generate(prompts)
+if "max_length" not in configs_dict:
+results = llm.generate(prompts)
+else:
+results = llm.generate(prompts, max_length=configs.max_length)
else:
result = llm.generate("Three tips for staying healthy are: ")

if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate(
"Three tips for staying healthy are: ", max_length=configs.max_length
)

llm.stop_server()


if __name__ == "__main__":
print("flexflow inference example (speculative inference)")
main()