diff --git a/benchmarking/debug.sh b/benchmarking/debug.sh
index abcc40d773..be2ace94c7 100755
--- a/benchmarking/debug.sh
+++ b/benchmarking/debug.sh
@@ -23,28 +23,30 @@ make -j install
 export LEGION_BACKTRACE=1
 export FF_DEBG_NO_WEIGHTS=1
 
-gdb -ex run --args ./inference/incr_decoding/incr_decoding \
-    -ll:cpu $NCPUS -ll:gpu $NGPUS -ll:util $NCPUS \
-    -ll:fsize 20000 -ll:zsize 10000 \
-    -llm-model $MODEL_NAME --verbose \
-    -prompt $PROMPT \
-    -tensor-parallelism-degree $NGPUS \
-    -log-file ../inference/output/test.out \
-    -output-file ../inference/output/test.json \
-    --max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000
+export CUDA_VISIBLE_DEVICES=1
 
-#--verbose -lg:prof 1 -lg:prof_logfile prof_%.gz \
-
-# ./inference/peft/peft \
-#     -ll:cpu 4 -ll:gpu $NGPUS -ll:util 2 \
-#     -ll:fsize 10000 -ll:zsize 10000 \
-#     --fusion \
-#     -llm-model $MODEL_NAME \
-#     -enable-peft -peft-model $PEFT_MODEL_NAME \
-#     -prompt /usr/FlexFlow/inference/prompt/peft.json \
-#     -finetuning-dataset /usr/FlexFlow/inference/prompt/peft_dataset.json \
+# gdb -ex run --args ./inference/incr_decoding/incr_decoding \
+#     -ll:cpu $NCPUS -ll:gpu $NGPUS -ll:util $NCPUS \
+#     -ll:fsize 20000 -ll:zsize 10000 \
+#     -llm-model $MODEL_NAME --verbose \
+#     -prompt $PROMPT \
 #     -tensor-parallelism-degree $NGPUS \
+#     -log-file ../inference/output/test.out \
 #     -output-file ../inference/output/test.json \
 #     --max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000
- # -lg:prof 1 -lg:prof_logfile prof_%.gz --verbose --inference-debugging \
\ No newline at end of file
+#--verbose -lg:prof 1 -lg:prof_logfile prof_%.gz \
+
+./inference/peft/peft \
+    -ll:cpu 4 -ll:gpu $NGPUS -ll:util 2 \
+    -ll:fsize 20000 -ll:zsize 10000 \
+    --fusion \
+    -llm-model $MODEL_NAME \
+    -enable-peft -peft-model $PEFT_MODEL_NAME \
+    -prompt /usr/FlexFlow/inference/prompt/peft.json \
+    -finetuning-dataset /usr/FlexFlow/inference/prompt/peft_dataset.json \
+    -tensor-parallelism-degree $NGPUS \
+    -output-file ../inference/output/test.json \
+    --max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000
+
+# -lg:prof 1 -lg:prof_logfile prof_%.gz --verbose --inference-debugging \
\ No newline at end of file
diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc
index f4348fd743..dd7843ca92 100644
--- a/inference/peft/peft.cc
+++ b/inference/peft/peft.cc
@@ -178,6 +178,8 @@ void FlexFlow::top_level_task(Task const *task,
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
          ffconfig.numNodes * ffconfig.workersPerNode);
+
+  ffconfig.enable_peft_finetuning = enable_peft_finetuning;
 
   std::string config_filepath = join_path(
       {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"});
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index cbb5bf638f..37c0dc6490 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -126,12 +126,23 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m,
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
     int total_tokens = bc->requestsInfo[i].first_token_depth_in_request +
                        bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_length;
+    int max_peft_tokens = BatchConfig::max_sequence_length();
     // Copy query to m->query_activation_buffer if we need to compute
     // PEFT backward
     if (bc->requestsInfo[i].finetuning_request) {
       size_t activation_size_needed =
          sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize;
+      if (activation_size_needed != m->allocated_peft_buffer_size1) {
+        std::cout << "activation_size_needed: " << activation_size_needed
+                  << std::endl;
+        std::cout << "m->allocated_peft_buffer_size1: " << m->allocated_peft_buffer_size1
+                  << std::endl;
+        std::cout << "max_peft_tokens: " << max_peft_tokens << std::endl;
+        std::cout << "m->num_q_heads: " << m->num_q_heads << std::endl;
+        std::cout << "m->qProjSize: " << m->qProjSize << std::endl;
+        std::cout << "BatchConfig::max_sequence_length()" << BatchConfig::max_sequence_length() << std::endl;
+        std::cout << "sizeof(DT)" << sizeof(DT) << std::endl;
+      }
       assert(activation_size_needed == m->allocated_peft_buffer_size1);
       int parallelism = m->hidden_size * num_tokens;
       store_query_cache<<
   in_dim = rms->data_dim;
   batch_size = rms->effective_batch_size;
+  enable_peft_finetuning = rms->enable_peft_finetuning;
 
   num_elements = in_dim * batch_size;
   DataType data_type = rms->weights[0]->data_type;
@@ -218,6 +219,18 @@ void inference_kernel_wrapper(RMSNormMeta *m,
       assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID);
       assert(!bc->requestsInfo[i].finetuning_backward_phase);
       int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+      if (m->allocated_peft_buffer_size != data_type_size(m->input_type[0]) *
+                                               BatchConfig::max_sequence_length() * in_dim) {
+        std::cout << "allocated_peft_buffer_size = " << m->allocated_peft_buffer_size
+                  << ", expected = " << data_type_size(m->input_type[0]) *
+                                            BatchConfig::max_sequence_length() * in_dim
+                  << std::endl;
+        std::cout << "in_dim = " << in_dim << std::endl;
+        std::cout << "max_sequence_length = " << BatchConfig::max_sequence_length()
+                  << std::endl;
+        std::cout << "data_type_size = " << data_type_size(m->input_type[0])
+                  << std::endl;
+      }
       assert(m->allocated_peft_buffer_size ==
              data_type_size(m->input_type[0]) *
                  BatchConfig::max_sequence_length() * in_dim);
diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu
index 4353a9e05f..85a4e19b6e 100644
--- a/src/ops/sigmoid_silu_multi.cu
+++ b/src/ops/sigmoid_silu_multi.cu
@@ -121,10 +121,11 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
       int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1;
       int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
       assert(num_peft_tokens == bc->num_finetuning_tokens());
+      int max_peft_tokens = BatchConfig::max_sequence_length();
       int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
       size_t input_tensor_size =
           data_type_size(m->input_type[0]) * num_peft_tokens * in_dim;
-      assert(m->allocated_peft_buffer_size == 2 * input_tensor_size);
+      assert(m->allocated_peft_buffer_size == 2 * (data_type_size(m->input_type[0]) * max_peft_tokens * in_dim));
       // copy input activation
       if (m->input_type[0] == DT_FLOAT) {
         checkCUDA(