diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7b94e1b52a5fd..b4055edd05873 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -332,8 +332,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: # Send token-by-token response for each request.n delta_text = output.text[len(previous_texts[i]):] previous_texts[i] = output.text - completion_tokens = len(output.token_ids) - previous_num_tokens[i] = completion_tokens + previous_num_tokens[i] = len(output.token_ids) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), @@ -351,8 +350,8 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: prompt_tokens = len(res.prompt_token_ids) final_usage = UsageInfo( prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + previous_num_tokens[i], ) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=[], finish_reason=output.finish_reason)