From b45c158f63827e0392dd9d9625bca5647d4204cf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 19 Nov 2024 19:52:42 +0000 Subject: [PATCH] updated --- benchmarks/benchmark_throughput.py | 4 +++- .../quantization/compressed_tensors/compressed_tensors.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 262b8652e49ff..83c686740784e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -302,7 +302,9 @@ def main(args: argparse.Namespace): for request in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + f"{total_output_tokens / elapsed_time:.2f} output tokens/s, " + f"{total_num_tokens=} | {total_output_tokens=}" + ) # Output JSON results if specified if args.output_json: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 22d0488379537..713bc8e789365 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -359,8 +359,7 @@ def get_scheme( """ scheme = CompressedTensors24( model_compressor=self.model_compressor, - layer_name=layer_name - ) + layer_name=layer_name) # scheme = CompressedTensorsW8A8Fp8( # strategy=QuantizationStrategy.CHANNEL, # is_static_input_scheme=False)