Skip to content

Commit

Permalink
[SpecInfer] Reduce single request per batch overhead (flexflow#1155)
Browse files Browse the repository at this point in the history
* Initial commit.

* Format

* Update batch_config setup.
  • Loading branch information
zwang86 authored Sep 29, 2023
1 parent 1d5e0c5 commit ee6090e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
3 changes: 1 addition & 2 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ class BatchConfig {
void print() const;
virtual InferenceMode get_mode() const;
static BatchConfig const *from_future(BatchConfigFuture const &future);
- static int const MAX_NUM_REQUESTS = 4;
+ static int const MAX_NUM_REQUESTS = 7;
static int const MAX_NUM_TOKENS = 64;
- static int const MAX_PROMPT_LENGTH = 62;
static int const MAX_SEQ_LENGTH = 256;

// These are set by update
Expand Down
14 changes: 14 additions & 0 deletions src/runtime/request_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
<< std::endl;
}
new_bc.num_tokens_to_commit++;
+ request.llm_cache_size++;
}
}
}
Expand Down Expand Up @@ -1255,6 +1256,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
"Exceeding the space available in the TreeVerify batch");
break;
}

+ if (new_bc.num_tokens + request.llm_cache_size >= request.initial_len) {
+ // launch the request into running phase after loading all prompt
+ request.status = Request::RUNNING;
+ new_bc.request_running[i] = true;
+
+ std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: "
+ << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl;
+
+ dfs_tree_inputs[guid] =
+ std::vector<std::pair<BatchConfig::TokenId, int>>{std::make_pair(
+ request.tokens.back(), request.tokens.size() - 1)};
+ }
} else { // launch the request into running phase after loading all prompt
if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) {
request.status = Request::RUNNING;
Expand Down

0 comments on commit ee6090e

Please sign in to comment.