From 4ee464a73243df8864f0b1d1c385c4498c10fb36 Mon Sep 17 00:00:00 2001
From: Aaron Pham
Date: Mon, 2 Dec 2024 22:15:30 -0500
Subject: [PATCH] chore: add notes for performance

Signed-off-by: Aaron Pham
---
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 043d33ed96665..e93a0bd7b268c 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -233,6 +233,8 @@ def __call__(self, input_ids: list[int],

         for i, matcher in enumerate(self.matchers):
             if not matcher.is_terminated():
+                # @ubospica: ideally, fill_next_token_bitmask should be parallelized with model decoding
+                # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303
                 matcher.fill_next_token_bitmask(self.token_bitmask, i)

         # token_bitmask is a CPU tensor for use with accept_token and
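
For context on the note added above, the sketch below shows one way the CPU-side `fill_next_token_bitmask` work could be overlapped with the GPU forward pass. This is not vLLM's decode loop: `model`, `input_ids`, `matchers`, and `token_bitmask` are placeholder names, greedy selection stands in for real sampling, and the overlap relies only on CUDA kernels being launched asynchronously.

```python
import torch
import xgrammar as xgr


def guided_decode_step(model, input_ids, matchers, token_bitmask):
    """One decoding step that overlaps bitmask filling with the GPU forward pass.

    Because CUDA kernels are queued asynchronously, the CPU is free to fill the
    grammar bitmasks while the forward pass is still running on the GPU.
    """
    # Queue the forward pass on the GPU; this call returns before the kernels
    # finish, so the CPU can keep working. (Placeholder call, shape [batch, vocab].)
    logits = model(input_ids)

    # Fill the CPU-side bitmask for every matcher that is still active.
    # This runs concurrently with the GPU work queued above.
    for i, matcher in enumerate(matchers):
        if not matcher.is_terminated():
            matcher.fill_next_token_bitmask(token_bitmask, i)

    # Move the bitmask to the logits' device, mask disallowed tokens in place,
    # then pick the next token (greedy here for brevity).
    xgr.apply_token_bitmask_inplace(logits, token_bitmask.to(logits.device))
    return torch.argmax(logits, dim=-1)
```

The linked review comment (r1864278303) discusses doing this overlap inside vLLM's own scheduler rather than at the logits-processor level; the sketch only illustrates the general idea.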