From 2094062b4eafe465826e936fbd5cbd8f099d7762 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 4 Nov 2024 15:11:59 -0800 Subject: [PATCH] [4.5/N] bugfix for quant config in speculative decode (#10007) Signed-off-by: youkaichao --- vllm/spec_decode/spec_decode_worker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a402181b13db8..eb3c2e88e668c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -61,6 +61,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": draft_worker_config = copy.deepcopy(vllm_config) draft_worker_config.model_config = speculative_config.draft_model_config + draft_worker_config.quant_config = VllmConfig._get_quantization_config( + draft_worker_config.model_config, + vllm_config.load_config, + ) draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa # TODO allow draft-model specific load config.