Commit

Merge branch 'inference' into optimize_attn
xinhaoc authored Sep 18, 2023
2 parents 5a84fd3 + c7cc6b4 commit 238b6bd
Showing 9 changed files with 67 additions and 53 deletions.
2 changes: 1 addition & 1 deletion docker/flexflow-environment/Dockerfile
@@ -77,7 +77,7 @@ ENV CUDA_DIR /usr/local/cuda
# Install python packages and other dependencies
RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing
# Install CPU-only Pytorch and related dependencies
RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch
RUN conda install pytorch torchvision torchaudio -c pytorch
RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
RUN pip3 install tensorflow notebook
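
With the cpuonly package dropped from the conda install above, the image is expected to pull a CUDA-enabled PyTorch build. A quick sanity check, run inside the rebuilt container (this snippet is a suggested verification, not part of the Dockerfile), could be:

# Verify that the container's PyTorch build can see a GPU.
import torch

print(torch.__version__)
print(torch.cuda.is_available())  # expected True for a CUDA-enabled build with a visible GPU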

2 changes: 1 addition & 1 deletion inference/python/spec_infer.py
@@ -67,7 +67,7 @@ def get_configs():
"ssms": [
{
# required ssm parameter
"ssm_model": "JackFram/llama-160m",
"ssm_model": "JackFram/llama-160m-base",
# optional ssm parameters
"cache_path": "",
"refresh_cache": False,
24 changes: 12 additions & 12 deletions src/ops/spec_inc_multihead_self_attention.cu
@@ -350,18 +350,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
}
// add alibi position bias to qk production
// add alibi position bias to qk production
if (*m->position_bias) {
size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens;
apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
min((size_t)CUDA_NUM_THREADS, parallelism),
0,
stream>>>(C,
num_new_tokens,
total_tokens,
m->num_q_heads,
m->global_num_q_heads,
shard_id);
}
if (*m->position_bias) {
size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens;
apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
min((size_t)CUDA_NUM_THREADS, parallelism),
0,
stream>>>(C,
num_new_tokens,
total_tokens,
m->num_q_heads,
m->global_num_q_heads,
shard_id);
}
// Fill all elements above diagonal in qk prods with -inf to force
// causal attention.
assert(num_new_tokens <= total_tokens);
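
For context, the kernel launch above adds an ALiBi-style position bias to the query-key products before the causal mask is applied. The following NumPy sketch illustrates the idea under common ALiBi assumptions (power-of-two head count, a [heads, new_tokens, total_tokens] score layout); it is not the actual apply_position_bias_qkprd implementation.

# Illustrative NumPy sketch of an ALiBi-style bias added to QK products.
import numpy as np

def alibi_slopes(num_heads):
    # One slope per head; standard geometric sequence, power-of-two case only.
    start = 2.0 ** (-8.0 / num_heads)
    return start ** np.arange(1, num_heads + 1)

def add_alibi_bias(qk):
    # qk: [num_heads, num_new_tokens, total_tokens] attention scores
    num_heads, num_new, total = qk.shape
    slopes = alibi_slopes(num_heads)                     # [H]
    key_pos = np.arange(total)                           # [T]
    query_pos = np.arange(total - num_new, total)        # [N], absolute positions of new tokens
    # Linear penalty growing with distance into the past (non-positive for past keys);
    # future keys get a positive term but are masked out by the causal mask anyway.
    distance = key_pos[None, :] - query_pos[:, None]     # [N, T]
    bias = slopes[:, None, None] * distance[None, :, :]  # [H, N, T]
    return qk + bias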
2 changes: 1 addition & 1 deletion src/runtime/request_manager.cc
@@ -202,7 +202,7 @@ RequestManager::RequestGuid
request.status = Request::PENDING;
request.guid = next_available_guid++;
request.max_sequence_length = max_sequence_length;
if (bos_token_id >= 0) {
if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
request.tokens.push_back(bos_token_id);
}
std::vector<int32_t> tokens = this->tokenizer_->Encode(prompt);
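
The behavioral change here is that a BOS token is no longer prepended for Falcon models. Expressed as a small Python sketch (hypothetical names, not FlexFlow's API), the request-tokenization rule from this diff is:

# Sketch of the BOS-handling rule above; names are illustrative only.
def encode_request(prompt, tokenizer, bos_token_id, model_type):
    tokens = []
    # Prepend BOS only when the model defines one and the model is not Falcon,
    # mirroring the updated check in request_manager.cc.
    if bos_token_id >= 0 and model_type != "falcon":
        tokens.append(bos_token_id)
    tokens.extend(tokenizer.encode(prompt))
    return tokens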
46 changes: 26 additions & 20 deletions tests/inference/cpp_inference_tests.sh
@@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}"
###############################################################################################

# LLAMA
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
# LLAMA (half precision)
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4

# OPT
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4
@@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}"
# Tensor parallelism tests
if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
# LLAMA
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
# LLAMA (half precision)
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2

# OPT
../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
Expand All @@ -37,9 +37,9 @@ fi
###############################################################################################

# LLAMA (small model)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4
# LLAMA (small model, half precision)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4

# LLAMA (big model)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4
@@ -69,11 +69,11 @@ fi
# Tensor parallelism tests
if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
# LLAMA (small model)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
# LLAMA (small model, half precision)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4

# LLAMA (big model)
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
@@ -216,28 +216,32 @@ fi
######################### Alignment tests with HuggingFace ####################################

# LLAMA (small model, full precision)
python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu
python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu

# LLAMA (small model, half precision)
python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu
python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu

# LLAMA (big model, full precision)
python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt"
python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt"

# LLAMA (big model, half precision)
python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu
python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu

# OPT (small model, full precision)
python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128
python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128

# OPT (small model, half precision)
python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128
python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128

# OPT (big model, full precision)
#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127
python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 128

# OPT (big model, half precision)
#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127
# python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 128

# Falcon (full precision)
python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128


diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt")
diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20)
@@ -246,5 +250,7 @@ diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s

diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt")
diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20)
#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt")
#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt")
diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt")
# diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt")
diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_falcon_7B.txt")

16 changes: 10 additions & 6 deletions tests/inference/huggingface_inference.py
@@ -1,7 +1,7 @@
import argparse
import json
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer

def main():
# Change working dir to folder storing this script
@@ -12,7 +12,6 @@ def main():
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, required=True)
parser.add_argument("--tokenizer-model-name", type=str, required=True)
parser.add_argument("--max-length", type=int, default=128)
parser.add_argument("--prompt-file", type=str, required=True)
parser.add_argument("--output-file", type=str, required=True)
@@ -46,15 +45,20 @@ def main():

# Run huggingface model
device = "cuda" if args.gpu else "cpu"
# Get Model
model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device)
if args.tokenizer_model_name == "JackFram/llama-160m":
tokenizer = LlamaTokenizer.from_pretrained("JackFram/llama-160m", use_fast=True)
# Get Tokenizer
hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True)
hf_arch = getattr(hf_config, "architectures")[0]
if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM":
tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True)
else:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
# Generate output
with open(args.output_file, "w") as f:
for i, prompt in enumerate(prompt_list):
batch = tokenizer(
prompt_list, return_tensors="pt", add_special_tokens=True
prompt, return_tensors="pt", add_special_tokens=True
).to(device)
generated = model.generate(batch["input_ids"], max_length=args.max_length)
out = tokenizer.decode(generated[0])
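
Taken together, the changes to this script replace the --tokenizer-model-name argument with architecture detection via AutoConfig and tokenize one prompt at a time instead of re-encoding the whole prompt list. A condensed, self-contained sketch of the updated behavior (argument parsing and file I/O omitted):

# Condensed sketch of the updated inference loop; CLI handling omitted.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer

def run(model_name, prompts, max_length=128, gpu=False):
    device = "cuda" if gpu else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    # Choose the tokenizer from the model's own config, as in the diff above.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    arch = config.architectures[0]
    if arch in ("LLaMAForCausalLM", "LlamaForCausalLM"):
        tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    outputs = []
    for prompt in prompts:
        # Encode a single prompt per iteration (the old code encoded prompt_list every time).
        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(device)
        generated = model.generate(batch["input_ids"], max_length=max_length)
        outputs.append(tokenizer.decode(generated[0]))
    return outputs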