Update TensorRT-LLM (#1763)
* Update TensorRT-LLM

---------

Co-authored-by: Kota Tsuyuzaki <[email protected]>
Co-authored-by: Pzzzzz <[email protected]>
Co-authored-by: Patrick Reiter Horn <[email protected]>
Authored by 4 people on Jun 11, 2024
Parent: b777bd6 · Commit: db4edea
Showing 301 changed files with 556,832 additions and 523,784 deletions.
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
@@ -159,7 +159,7 @@ mpirun -n 2 ./benchmarks/gptManagerBenchmark \
--max_num_samples 500
```

`gptManagerBenchmark` can also be used with the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`). This can be done by passing the argument `--api executor`. Note that the Executor class is still under development and currently does not support models with tp or pp > 1.
`gptManagerBenchmark` by default uses the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`).

#### Emulated static batching

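The updated README text above steers users toward the high-level `executor::Executor` API. As a rough orientation only, the sketch below shows what a minimal decoder-only client of that API could look like; the engine path and token ids are placeholders, and the exact `ExecutorConfig`/`Request` argument names are assumptions based on `cpp/include/tensorrt_llm/executor/executor.h` rather than anything shown in this diff.

```cpp
// Minimal sketch of a decoder-only client of the executor API (assumed
// signatures; engine path and token ids are placeholders).
#include "tensorrt_llm/executor/executor.h"

#include <iostream>

namespace tle = tensorrt_llm::executor;

int main()
{
    // Default configuration; beam width, KV-cache and scheduler settings all
    // have defaults in ExecutorConfig.
    tle::ExecutorConfig executorConfig;
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, executorConfig);

    // Enqueue one request with dummy input token ids and wait for it to finish.
    tle::Request request({1, 2, 3, 4}, /* maxNewTokens */ 16);
    auto const requestId = executor.enqueueRequest(request);

    for (auto const& response : executor.awaitResponses(requestId))
    {
        if (response.hasError())
        {
            std::cerr << response.getErrorMsg() << std::endl;
            continue;
        }
        // outputTokenIds is indexed by beam; print beam 0.
        for (auto const tokenId : response.getResult().outputTokenIds.at(0))
        {
            std::cout << tokenId << ' ';
        }
        std::cout << std::endl;
    }
    return 0;
}
```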
163 changes: 86 additions & 77 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -93,6 +93,7 @@ class EncDecBuildConfig:
builder_opt: Optional[int] = None
n_mels: Optional[int] = None
skip_cross_qkv: bool = False
use_implicit_relative_attention: Optional[bool] = False

def __post_init__(self) -> None:
assert self.head_size is not None
@@ -584,6 +585,25 @@ class ModelConfig:
builder_opt=None,
remove_input_padding=False,
)),
"glm_10b":
ModelConfig(name="glm_10b",
family="glm",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=48,
num_heads=64,
num_kv_heads=64,
hidden_size=4096,
inter_size=16384,
vocab_size=50304,
hidden_act='gelu',
n_positions=1024,
max_batch_size=128,
max_input_len=1024,
max_output_len=256,
builder_opt=None,
remove_input_padding=False,
)),
"bloom_560m":
ModelConfig(name="bloom_560m",
family="bloom",
52 changes: 47 additions & 5 deletions benchmarks/python/build.py
@@ -273,7 +273,7 @@ def build_gpt(args):
raise Exception(
f'--opt_num_tokens does not support ootb mode. Please use --opt_batch_size instead.'
)

max_num_tokens = max_batch_size * max(max_input_len, max_beam_width)
quant_config = get_quant_config(args.quantization)
quant_algo = quant_config.quant_algo
kv_cache_quant_algo = quant_config.kv_cache_quant_algo
@@ -309,6 +309,7 @@ def build_gpt(args):
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_output_len=max_output_len,
max_num_tokens=max_num_tokens,
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
quant_mode=quant_mode,
@@ -572,6 +573,39 @@ def build_gpt(args):
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "glm":
config = {
'architecture': 'ChatGLMForCausalLM',
'dtype': args.dtype,
'num_hidden_layers': build_config['num_layers'],
'num_attention_heads': build_config['num_heads'],
'num_key_value_heads': build_config['num_kv_heads'],
'hidden_size': build_config['hidden_size'],
'intermediate_size': build_config['inter_size'],
'norm_epsilon': 1e-5,
'vocab_size': build_config['vocab_size'],
'position_embedding_type': 'learned_absolute',
'max_position_embeddings': build_config['n_positions'],
'hidden_act': build_config['hidden_act'],
'quantization': {
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'chatglm_version': 'glm',
'add_bias_linear': True,
'add_qkv_bias': True,
'apply_query_key_layer_scaling': False,
'apply_residual_connection_post_layernorm': False,
'rmsnorm': False,
'rope_ratio': 1.0,
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "bloom":
config = {
'architecture': 'BloomForCausalLM',
@@ -871,6 +905,7 @@ def build_gpt(args):
'layer_types': build_config['layer_types'],
'rnn_hidden_size': build_config['rnn_hidden_size'],
'logits_soft_cap': build_config['logits_soft_cap'],
'rotary_pct': build_config['rotary_pct'],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.RecurrentGemmaForCausalLM(
@@ -935,10 +970,13 @@ def build_gpt(args):
print(
f"max_batch_size: {max_batch_size}, max_input_len: {max_input_len}, max_output_len: {max_output_len}, max_beam_width: {max_beam_width}"
)
# NOTE: all other models use PretrainedModel.prepare_inputs(...)
# except RecurrentGemmaForCausalLM and MambaForCausalLM
inputs = tensorrt_llm_model.prepare_inputs(
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_seq_len=max_input_len + max_output_len,
max_num_tokens=max_num_tokens,
use_cache=True,
max_beam_width=max_beam_width,
opt_batch_size=opt_batch_size,
@@ -1293,7 +1331,7 @@ def enc_dec_build_helper(component, config, args):
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1358,7 +1396,7 @@ def enc_dec_build_helper(component, config, args):
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1381,12 +1419,16 @@ def enc_dec_build_helper(component, config, args):
'encoder_head_size':
config['head_size'],
'skip_cross_qkv':
config['skip_cross_qkv']
config['skip_cross_qkv'],
'use_implicit_relative_attention':
config['use_implicit_relative_attention']
})
tllm_model = tensorrt_llm.models.DecoderModel(pretrained_config)
if use_weight_only and family == 'whisper':
tllm_model = quantize(tllm_model, quant_config)

tllm_model.precompute_relative_attention_bias(builder_config)

# Module -> Network
engine_name = get_engine_name(args.model, args.dtype, world_size,
runtime_rank)
@@ -1418,7 +1460,7 @@ def enc_dec_build_helper(component, config, args):
if family == 'whisper':
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'], )
tllm_model(*inputs)
tllm_model(**inputs)
else:
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'],
9 changes: 9 additions & 0 deletions benchmarks/python/gpt_benchmark.py
@@ -174,6 +174,15 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config, engine_buffer, self.runtime_mapping)
if args.model == 'glm_10b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=50258,
pad_id=50256,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
else:
end_id = 50256
pad_id = 50256
28 changes: 25 additions & 3 deletions cpp/CMakeLists.txt
@@ -180,9 +180,31 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
endif()
endif()

message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}")
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
# Detect highest available compute capability
set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
set(CUDAFILE ${CMAKE_SOURCE_DIR}/cmake/utils/detect_cuda_arch.cu)
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${CUDAFILE} -o
${OUTPUTFILE})
message(VERBOSE "Detecting native CUDA compute capability")
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE CUDA_ARCH_OUTPUT)
if(NOT ${CUDA_RETURN_CODE} EQUAL 0)
message(WARNING "Detecting native CUDA compute capability - fail")
message(
WARNING "CUDA compute capability detection failed, compiling for 'all'")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "all")
else()
message(STATUS "Detecting native CUDA compute capability - done")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CUDA_ARCH_OUTPUT}")
endif()
else()
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
endif()
message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

enable_language(C CXX CUDA)

39 changes: 39 additions & 0 deletions cpp/cmake/utils/detect_cuda_arch.cu
@@ -0,0 +1,39 @@
#include <algorithm>
#include <cuda_runtime.h>
#include <iomanip>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
int n_devices = 0;
int rc = cudaGetDeviceCount(&n_devices);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}

std::vector<std::pair<int, int>> arch(n_devices);
for (int cd = 0; cd < n_devices; ++cd)
{
cudaDeviceProp dev;
int rc = cudaGetDeviceProperties(&dev, cd);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}
else
{
arch[cd] = {dev.major, dev.minor};
}
}

std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
std::cout << best_cc.first << best_cc.second;

return 0;
}
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -48,9 +48,9 @@ class GptManager
using RequestList = std::list<std::shared_ptr<LlmRequest>>;
using TensorPtr = runtime::ITensor::SharedPtr;

GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, SizeType32 maxBeamWidth,
executor::SchedulerConfig const& schedulerConfig, GetInferenceRequestsCallback getInferenceRequestsCb,
SendResponseCallback sendResponseCb, PollStopSignalCallback pollStopSignalCb = nullptr,
GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType,
GetInferenceRequestsCallback getInferenceRequestsCb, SendResponseCallback sendResponseCb,
PollStopSignalCallback pollStopSignalCb = nullptr,
ReturnBatchManagerStatsCallback returnBatchManagerStatsCb = nullptr,
TrtGptModelOptionalParams const& optionalParams = TrtGptModelOptionalParams(),
std::optional<uint64_t> terminateReqId = std::nullopt, bool excludeInputInOutput = false);
19 changes: 19 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -699,6 +699,12 @@ class GenericLlmRequest
runtime::ITensor::makeShape({mSamplingConfig.beamWidth, mMaxNewTokens, vocabSizePadded}), logitsDataType);
}

void allocTargetModelAcceptedTokenLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
{
mGenerationLogitsHost = runtime::BufferManager::pinned(
runtime::ITensor::makeShape({getNumDraftTokens() + 1, vocabSizePadded}), logitsDataType);
}

[[nodiscard]] std::vector<TensorPtr> const& getGenerationLogitsFragments() const
{
return mGenerationLogitsFragments;
@@ -901,6 +907,18 @@ class GenericLlmRequest
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
}

if (getReturnTargetModelAcceptedLogits())
{
auto targetModelAcceptedTokenLogitsShape = getGenerationLogitsHost()->getShape();
TLLM_CHECK(targetModelAcceptedTokenLogitsShape.nbDims == 2);
auto numAcceptedToken = targetModelAcceptedTokenLogitsShape.d[0];
auto vocabSizePadded = targetModelAcceptedTokenLogitsShape.d[1];
// Align the shape of accepted token logits and generation logits
TensorPtr targetModelAcceptedTokenLogitsHostView = runtime::ITensor::view(
getGenerationLogitsHost(), runtime::ITensor::makeShape({1, numAcceptedToken, vocabSizePadded}));
result.generationLogits = executor::detail::ofITensor(targetModelAcceptedTokenLogitsHostView);
}

if (getReturnEncoderOutput())
{
result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
@@ -1023,6 +1041,7 @@ class GenericLlmRequest
auto data = runtime::bufferCast<int32_t>(*tensor);
std::memcpy(data, words.data(), numWords * sizeof(int32_t));
std::memcpy(data + numWords, offsets.data(), numWords * sizeof(int32_t));

// Add leading dim of 1
tensor->unsqueeze(0);

14 changes: 11 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
@@ -23,6 +23,7 @@
#include "tensorrt_llm/runtime/common.h"

#include <optional>
#include <utility>
#include <vector>

namespace tensorrt_llm::batch_manager
@@ -39,15 +40,19 @@ class TrtGptModelOptionalParams
bool enableTrtOverlap = false, std::optional<std::vector<SizeType32>> const& deviceIds = std::nullopt,
bool normalizeLogProbs = true, bool enableChunkedContext = false,
PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
executor::DecodingConfig const& decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1)
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxBeamWidth = std::nullopt,
executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{})
: kvCacheConfig{kvCacheConfig}
, enableTrtOverlap{enableTrtOverlap}
, deviceIds(deviceIds)
, normalizeLogProbs{normalizeLogProbs}
, enableChunkedContext{enableChunkedContext}
, peftCacheManagerConfig(peftCacheManagerConfig)
, decodingConfig(decodingConfig)
, decodingConfig(std::move(decodingConfig))
, gpuWeightsPercent(gpuWeightsPercent)
, maxBeamWidth(maxBeamWidth)
, schedulerConfig{schedulerConfig}
{
}

@@ -57,7 +62,8 @@ class TrtGptModelOptionalParams
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
executorConfig.getGpuWeightsPercent())
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(),
executorConfig.getSchedulerConfig())
{
}

@@ -80,6 +86,8 @@ class TrtGptModelOptionalParams
executor::DecodingConfig decodingConfig;
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
std::optional<SizeType32> maxBeamWidth;
executor::SchedulerConfig schedulerConfig;
};

} // namespace tensorrt_llm::batch_manager
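Read together with the `GptManager.h` change above: `maxBeamWidth` and the `SchedulerConfig` have moved off the `GptManager` constructor and into `TrtGptModelOptionalParams`, which can be populated from an `executor::ExecutorConfig`. A minimal sketch of the new wiring under that assumption is shown below; the `ExecutorConfig` setter names mirror the getters used in the delegating constructor but are not part of this diff.

```cpp
// Sketch: beam width and scheduler policy now reach the batch manager through
// TrtGptModelOptionalParams rather than dedicated GptManager constructor
// arguments. Setter names on ExecutorConfig are assumptions.
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;
using tensorrt_llm::batch_manager::TrtGptModelOptionalParams;

int main()
{
    tle::ExecutorConfig executorConfig;
    executorConfig.setMaxBeamWidth(4); // was a GptManager constructor argument
    executorConfig.setSchedulerConfig( // was a GptManager constructor argument
        tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));

    // The delegating constructor shown in the diff copies getMaxBeamWidth() and
    // getSchedulerConfig() into the new maxBeamWidth / schedulerConfig members.
    TrtGptModelOptionalParams optionalParams(executorConfig);

    // optionalParams is then handed to GptManager, whose constructor no longer
    // takes maxBeamWidth or a SchedulerConfig directly (see GptManager.h above).
    return 0;
}
```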