diff --git a/vllm_profile/profile_all.sh b/vllm_profile/profile_all.sh
deleted file mode 100755
index df55f25a69b8e..0000000000000
--- a/vllm_profile/profile_all.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# This runs the entire script, generating two CSVs: one for 1 output token, 1 for 11 output tokens.
-# takes roughly 35 hours
-
-DECODE_TOKENS=(1 11)
-BATCH_SIZES=(1 2 4 8 16 32 64 128)
-CONTEXT_SIZES=(8 16 32 64 128 256 512 1024 2048 4096 8182)
-SM_COUNTS=(16 20 24 28 32 36 40 44 48 52 56 60 64 68 72 76 80 84 88 92 96 100 104 108 112 116 120 124 128 132)
-
-# this will overwrite existing CSVs
-echo "batch size,context size,SM count,milliseconds" > measured_times_decode1.csv
-echo "batch size,context size,SM count,milliseconds" > measured_times_decode11.csv
-
-for decode_tokens in "${DECODE_TOKENS[@]}"; do
-    for batch_size in "${BATCH_SIZES[@]}"; do
-        for context_size in "${CONTEXT_SIZES[@]}"; do
-            for sm_count in "${SM_COUNTS[@]}"; do
-                python profile_green_ctx.py "$decode_tokens" "$batch_size" "$context_size" "$sm_count"
-            done
-        done
-    done
-done
diff --git a/vllm_profile/profile_green_ctx.py b/vllm_profile/profile_green_ctx.py
deleted file mode 100644
index 1053ac8fa22b4..0000000000000
--- a/vllm_profile/profile_green_ctx.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import argparse
-import json
-import os
-import sys
-
-from vllm import LLM, SamplingParams
-from green_ctx import init, make_shard
-
-
-def get_profile_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("decode_tokens", type=int)
-    parser.add_argument("batch_size", type=int)
-    parser.add_argument("context_size", type=int)
-    parser.add_argument("sm_count", type=int)
-
-    args = parser.parse_args()
-    return args.decode_tokens, args.batch_size, args.context_size, args.sm_count
-
-
-def get_kernel_time(trace_json):
-    with open(trace_json, 'r') as f:
-        data = json.load(f)
-    events = data['traceEvents']
-
-    total_kernel_time = 0  # microseconds
-    for event in events:
-        if 'cat' in event and event['cat'] == 'kernel':
-            total_kernel_time += event['dur']
-
-    total_kernel_time /= 1000
-    print(f'Kernel time for {trace_json}:', total_kernel_time, 'ms')
-    return total_kernel_time
-
-
-if __name__ == '__main__':
-    # init green context cuda stuff
-    init()
-
-    # get command line args
-    decode_tokens, batch_size, context_size, sm_count = get_profile_args()
-    print(f'\n\nProfile for decode_tokens={decode_tokens}, batch_size={batch_size} context_size={context_size}, sm_count={sm_count}\n\n')
-
-    with open(f'measured_times_decode{decode_tokens}.csv', 'r') as f:
-        content = f.read()
-        if f'{batch_size},{context_size},{sm_count},' in content:
-            print('skipped\n\n')
-            sys.exit()
-
-    # set env variables for storing trace json files
-    trace_dir = f"/workspace/vllm/vllm_profile/D{decode_tokens}_B{batch_size}_L{context_size}"
-    os.makedirs(trace_dir, exist_ok=True)
-    os.environ["VLLM_TORCH_PROFILER_DIR"] = trace_dir
-    os.environ["SM_COUNT"] = str(sm_count)
-
-    # init vLLM stuff
-    llm = LLM(
-        model="meta-llama/Meta-Llama-3-8B-Instruct",
-        # load_format='dummy',
-        enforce_eager=True,
-    )
-    prompts = ["hi" * (context_size - 1)]
-    sampling_params = SamplingParams(min_tokens=decode_tokens, max_tokens=decode_tokens)
-
-    # create green context and use for inference
-    green_ctx = make_shard(sm_count)
-    with green_ctx.with_context():
-        llm.start_profile()
-        outputs = llm.generate(prompts, sampling_params)
-        llm.stop_profile()
-
-    # get total kernel time from trace json and write to csv
-    time = get_kernel_time(os.path.join(trace_dir, f'sm{sm_count}.json'))
-    with open(f'measured_times_decode{decode_tokens}.csv', 'a') as f:
-        f.write(f'{batch_size},{context_size},{sm_count},{time}\n')