From 5f49734171ad8537b4b958406a837f2c3e5e2824 Mon Sep 17 00:00:00 2001
From: Aaron Pham
Date: Mon, 2 Dec 2024 22:15:30 -0500
Subject: [PATCH] chore: add notes for performance

Signed-off-by: Aaron Pham
---
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 043d33ed96665..8287cd6cf3aa0 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -233,6 +233,9 @@ def __call__(self, input_ids: list[int],
 
         for i, matcher in enumerate(self.matchers):
             if not matcher.is_terminated():
+                # @ubospica: ideally, fill_next_token_bitmask should be
+                # parallelized with model decoding
+                # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303
                 matcher.fill_next_token_bitmask(self.token_bitmask, i)
 
         # token_bitmask is a CPU tensor for use with accept_token and
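
Note (not part of the patch): the comment added above suggests overlapping the CPU-side
fill_next_token_bitmask work with the model's decode step rather than running them back
to back. The sketch below illustrates one way that overlap could look, using a thread
pool. The names _StubMatcher, run_forward_pass, and decode_step are hypothetical
stand-ins, not vLLM or xgrammar APIs, and any real speedup would depend on the native
fill routine releasing the GIL (or on moving the work to a separate process or stream).

# Minimal sketch, assuming a matcher-like object and a GPU forward pass to overlap with.

from concurrent.futures import ThreadPoolExecutor


class _StubMatcher:
    """Stand-in for a grammar matcher (illustration only)."""

    def is_terminated(self) -> bool:
        return False

    def fill_next_token_bitmask(self, bitmask: list[int], index: int) -> None:
        # A real matcher would set bits for the allowed next tokens;
        # here we just mark the row as filled.
        bitmask[index] = 1


def run_forward_pass() -> str:
    """Stand-in for the decode step that runs concurrently."""
    return "logits"


def decode_step(matchers: list[_StubMatcher], token_bitmask: list[int]) -> str:
    with ThreadPoolExecutor(max_workers=1) as pool:
        # Kick off bitmask filling on a worker thread ...
        fill_future = pool.submit(
            lambda: [
                m.fill_next_token_bitmask(token_bitmask, i)
                for i, m in enumerate(matchers)
                if not m.is_terminated()
            ])
        # ... while the forward pass runs on the main thread.
        logits = run_forward_pass()
        # Join before the bitmask is applied to the logits.
        fill_future.result()
    return logits


if __name__ == "__main__":
    print(decode_step([_StubMatcher(), _StubMatcher()], [0, 0]))

The key design point is the join: the bitmask must be complete before it is applied to
the logits, so the overlap only hides latency up to the duration of the forward pass.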