
Commit

update
goliaro committed Nov 27, 2024
1 parent ee48def commit 450c98f
Showing 10 changed files with 318 additions and 144 deletions.
33 changes: 28 additions & 5 deletions benchmarking/debug.sh
@@ -6,21 +6,44 @@ set -e
cd "${BASH_SOURCE[0]%/*}/../build"

# MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
# PROMPT="../benchmarking/test.json"
PROMPT="/usr/FlexFlow/inference/prompt/peft.json"
MODEL_NAME="JackFram/llama-160m"
PEFT_MODEL_NAME="goliaro/llama-160m-lora"
NGPUS=1
NCPUS=4

reset
make -j install

# python ../inference/utils/download_hf_model.py $MODEL_NAME
# python ../inference/utils/download_peft_model.py $PEFT_MODEL_NAME

python ../inference/utils/download_hf_model.py $MODEL_NAME

export LEGION_BACKTRACE=1
export FF_DEBG_NO_WEIGHTS=1

./inference/incr_decoding/incr_decoding \
-ll:cpu 16 -ll:gpu $NGPUS -ll:util 16 \
gdb -ex run --args ./inference/incr_decoding/incr_decoding \
-ll:cpu $NCPUS -ll:gpu $NGPUS -ll:util $NCPUS \
-ll:fsize 20000 -ll:zsize 10000 \
--fusion \
--verbose -lg:prof 1 -lg:prof_logfile prof_%.gz \
-llm-model $MODEL_NAME \
-prompt ../benchmarking/test.json \
-prompt $PROMPT \
-tensor-parallelism-degree $NGPUS \
-log-file ../inference/output/test.out \
-output-file ../inference/output/test.json \
--max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000

# ./inference/peft/peft \
# -ll:cpu 4 -ll:gpu $NGPUS -ll:util 2 \
# -ll:fsize 10000 -ll:zsize 10000 \
# --fusion \
# -llm-model $MODEL_NAME \
# -enable-peft -peft-model $PEFT_MODEL_NAME \
# -prompt /usr/FlexFlow/inference/prompt/peft.json \
# -finetuning-dataset /usr/FlexFlow/inference/prompt/peft_dataset.json \
# -tensor-parallelism-degree $NGPUS \
# -output-file ../inference/output/test.json \
# --max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000

# -lg:prof 1 -lg:prof_logfile prof_%.gz --verbose --inference-debugging \
1 change: 1 addition & 0 deletions include/flexflow/batch_config.h
@@ -177,6 +177,7 @@ struct InferenceResult {
static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
float finetuning_loss;
friend std::ostream &operator<<(std::ostream &os, InferenceResult const &result);
};

class BeamSearchBatchConfig : public BatchConfig {
4 changes: 2 additions & 2 deletions include/flexflow/request_manager.h
@@ -112,8 +112,7 @@ struct Request {
std::vector<struct BeamTree> beam_trees;

Request() = default;
Request(Request const &other);
void load_token_ids();
static Request from_other(Request const &other);

friend std::ostream &operator<<(std::ostream &os, Request const &req);
};
@@ -152,6 +151,7 @@

bool load_request_token_ids(Request &request);

void set_verbose(bool verbose);
void set_max_requests_per_batch(int max_num_requests);
int get_max_requests_per_batch();
void set_max_tokens_per_batch(int max_num_tokens);
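
A minimal usage sketch of the reworked API above, based only on these declarations (the surrounding setup is assumed, not part of the commit): the removed copy constructor is replaced by the static Request::from_other factory, and verbosity is now toggled on the RequestManager singleton.

#include "flexflow/request_manager.h"
using namespace FlexFlow;

void copy_request_and_enable_logging(Request const &original, bool verbose) {
  // Copies are now made explicitly instead of via the (removed) copy constructor.
  Request copy = Request::from_other(original);

  // Verbosity is configured once on the singleton request manager.
  RequestManager *rm = RequestManager::get_request_manager();
  rm->set_verbose(verbose);
}
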
30 changes: 25 additions & 5 deletions include/flexflow/utils/file_loader.h
@@ -21,6 +21,8 @@

using namespace std;
using namespace FlexFlow;
using namespace Legion;


class FileDataLoader {
public:
@@ -36,16 +38,31 @@ class FileDataLoader {
BatchConfig::TokenId *generate_requests(int num, int length);

template <typename DT>
void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx);
void load_single_weight_tensor(FFModel *ff,
Layer *l,
int weight_idx,
size_t volume,
size_t num_replicas,
DT *weight,
Domain weight_domain);

void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx);
void load_quantization_weight(FFModel *ff,
Layer *l,
int weight_idx,
size_t volume,
size_t num_replicas,
char *weight,
DataType data_type,
Domain weight_domain);

static void
load_weight_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime);
void load_weights_parallel(FFModel *ff,
Legion::Context ctx,
Legion::Runtime *runtime);

void load_positions(FFModel *ff,
Tensor pt,
@@ -66,12 +83,15 @@ struct WeightLoadTaskArgs {
FileDataLoader *loader;
Layer *layer;
int weight_idx;
size_t volume, num_replicas;
DataType data_type;
WeightLoadTaskArgs(FFModel *_ff,
FileDataLoader *_loader,
Layer *_l,
int _idx,
size_t _volume,
size_t _num_replicas,
DataType _data_type)
: ff(_ff), loader(_loader), layer(_l), weight_idx(_idx),
data_type(_data_type) {}
: ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume),
num_replicas(_num_replicas), data_type(_data_type) {}
};
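
A sketch of how the widened WeightLoadTaskArgs might be populated, assuming the caller has already resolved the weight tensor's Legion Domain; the function and variable names below are illustrative, not taken from the commit.

#include "flexflow/utils/file_loader.h"

void build_weight_load_args(FFModel *ff, FileDataLoader *loader, Layer *layer,
                            Legion::Domain const &weight_domain) {
  // The volume and replica count now travel with the task arguments instead of
  // being recomputed inside load_weight_task.
  size_t volume = weight_domain.get_volume(); // total number of elements
  size_t num_replicas = 1;                    // assumption: no replication
  WeightLoadTaskArgs args(ff, loader, layer, /*weight_idx=*/0,
                          volume, num_replicas, DT_FLOAT);
  // ... pass `args` as the argument of a load_weight_task launch ...
}
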
141 changes: 82 additions & 59 deletions inference/incr_decoding/incr_decoding.cc
@@ -225,6 +225,7 @@ void FlexFlow::top_level_task(Task const *task,

GenerationConfig generationConfig(do_sample, temperature, topp);
RequestManager *rm = RequestManager::get_request_manager();
rm->set_verbose(verbose);
rm->set_max_requests_per_batch(max_requests_per_batch);
rm->set_max_tokens_per_batch(max_tokens_per_batch);
rm->set_max_sequence_length(max_sequence_length);
@@ -271,74 +272,96 @@ void FlexFlow::top_level_task(Task const *task,
}

rm->start_background_server(&model);
// {
// using json = nlohmann::json;
// std::ifstream file_handle(file_paths.prompt_file_path);
// assert(file_handle.good() && "Prompt file does not exist.");
// nlohmann::ordered_json prompt_json =
// nlohmann::ordered_json::parse(file_handle,
// /*parser_callback_t */ nullptr,
// /*allow_exceptions */ true,
// /*ignore_comments */ true);
// file_handle.close();
// auto &metadata = prompt_json["metadata"];
// int num_warmup_requests = metadata["num_warmup_requests"];
// int num_regular_requests = 0, total_requests = 0;
// std::vector<Request> warmup_requests, requests;
// for (auto &entry : prompt_json["entries"]) {
// int prompt_length = entry["prompt_length"];
// int response_length = entry["response_length"];
// std::string text = entry["prompt"];
// bool is_warmup_request = total_requests < num_warmup_requests;

// Request inference_req;
// inference_req.prompt = text;
// inference_req.add_special_tokens = false;
// inference_req.max_new_tokens = response_length;

// if (is_warmup_request) {
// warmup_requests.push_back(inference_req);
// } else {
// printf("Prompt[%d]: %s\n", total_requests, text.c_str());
// requests.push_back(inference_req);
// num_regular_requests++;
// }

// total_requests++;
// }
// std::vector<GenerationResult> warmup_result =
// model.generate(warmup_requests);
// std::vector<GenerationResult> result = model.generate(requests);

// assert(warmup_result.size() == warmup_requests.size());
// assert(result.size() == requests.size());
// assert(result.size() + warmup_result.size() == total_requests);
// int i = 0;
// for (auto &entry : prompt_json["entries"]) {
// if (i < num_warmup_requests) {
// i++;
// continue;
// }
// int index = i - num_warmup_requests;
// entry["original_response"] = entry["response"];
// entry["original_response_length"] = entry["response_length"];
// std::string ff_out = result[index].output_text;
// int tot_length = result[index].output_text.length();
// entry["response"] = ff_out;
// entry["response_length"] = result[index].output_tokens.size();
// i++;
// }

// // Write the modified JSON to a file
// std::ofstream output_file(file_paths.output_file_path);
// if (output_file.is_open()) {
// output_file << prompt_json.dump(2);
// output_file.close();
// std::cout << "Modified JSON has been saved to "
// << file_paths.output_file_path << std::endl;
// } else {
// std::cerr << "Unable to open file for writing." << std::endl;
// }
// }
int total_num_requests = 0;
{
using json = nlohmann::json;
std::ifstream file_handle(file_paths.prompt_file_path);
assert(file_handle.good() && "Prompt file does not exist.");
nlohmann::ordered_json prompt_json =
nlohmann::ordered_json::parse(file_handle,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);
file_handle.close();
auto &metadata = prompt_json["metadata"];
int num_warmup_requests = metadata["num_warmup_requests"];
int num_regular_requests = 0, total_requests = 0;
std::vector<Request> warmup_requests, requests;
for (auto &entry : prompt_json["entries"]) {
int prompt_length = entry["prompt_length"];
int response_length = entry["response_length"];
std::string text = entry["prompt"];
bool is_warmup_request = total_requests < num_warmup_requests;
json prompt_json = json::parse(file_handle,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);

std::vector<Request> requests;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.add_special_tokens = false;
inference_req.max_new_tokens = response_length;

if (is_warmup_request) {
warmup_requests.push_back(inference_req);
} else {
printf("Prompt[%d]: %s\n", total_requests, text.c_str());
requests.push_back(inference_req);
num_regular_requests++;
}

total_requests++;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
}
std::vector<GenerationResult> warmup_result =
model.generate(warmup_requests);
std::vector<GenerationResult> result = model.generate(requests);

assert(warmup_result.size() == warmup_requests.size());
assert(result.size() == requests.size());
assert(result.size() + warmup_result.size() == total_requests);
int i = 0;
for (auto &entry : prompt_json["entries"]) {
if (i < num_warmup_requests) {
i++;
continue;
}
int index = i - num_warmup_requests;
entry["original_response"] = entry["response"];
entry["original_response_length"] = entry["response_length"];
std::string ff_out = result[index].output_text;
int tot_length = result[index].output_text.length();
entry["response"] = ff_out;
entry["response_length"] = result[index].output_tokens.size();
i++;
}

// Write the modified JSON to a file
std::ofstream output_file(file_paths.output_file_path);
if (output_file.is_open()) {
output_file << prompt_json.dump(2);
output_file.close();
std::cout << "Modified JSON has been saved to "
<< file_paths.output_file_path << std::endl;
} else {
std::cerr << "Unable to open file for writing." << std::endl;
}
}

// terminate the request manager by stopping the background thread
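
The rewritten block above drops the warmup/metadata handling (a copy of it is left commented out) and instead parses the prompt file as a flat JSON array of strings, giving every request a hard-coded max_length of 128 rather than a per-entry response_length. A minimal sketch of a prompt file in the shape the new loop expects (file name and contents are illustrative only):

#include <fstream>
#include <nlohmann/json.hpp>

int main() {
  // A flat array of prompt strings is all the new parsing loop reads.
  nlohmann::json prompts = nlohmann::json::array({
      "Three tips for staying healthy are:",
      "What is the capital of France?"});
  std::ofstream out("test.json"); // path is illustrative
  out << prompts.dump(2) << std::endl;
  return 0;
}
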
5 changes: 5 additions & 0 deletions inference/peft/peft.cc
@@ -194,6 +194,10 @@ void FlexFlow::top_level_task(Task const *task,
<< std::endl;
assert(false);
}
if (!enable_peft) {
std::cerr << "Running PEFT script with PEFT not enabled" << std::endl;
assert(false);
}
if (enable_peft && peft_model_name.empty()) {
std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl;
assert(false);
@@ -272,6 +276,7 @@ void FlexFlow::top_level_task(Task const *task,

GenerationConfig generationConfig(do_sample, temperature, topp);
RequestManager *rm = RequestManager::get_request_manager();
rm->set_verbose(verbose);
rm->set_max_requests_per_batch(
max_requests_per_batch +
(int)enable_peft_finetuning); // add one slot for finetuning if needed
1 change: 1 addition & 0 deletions python/flexflow/serve/serve.py
@@ -395,6 +395,7 @@ def download_and_convert_peft_model(hf_peft_model_id: str):
weights_path = self.__get_resource_path(
hf_peft_model_id.lower(), CachedResourceType.WEIGHTS
)
print(f"Opening {adapter_path}...")
with safe_open(adapter_path, framework="pt", device="cpu") as f:
for tensor_name in f.keys():
tensor = f.get_tensor(tensor_name)
14 changes: 14 additions & 0 deletions src/runtime/batch_config.cc
@@ -167,6 +167,20 @@ int BatchConfig::max_spec_tree_token_num() {
return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
}

// print InferenceResult
std::ostream &operator<<(std::ostream &os, InferenceResult const &result) {
os << "InferenceResult {";
os << "MAX_NUM_TOKENS: " << InferenceResult::MAX_NUM_TOKENS << ", ";
os << "token_ids: [";
for (int i = 0; i < 16; i++) {
os << result.token_ids[i] << ", ";
}
os << "], ";
os << "finetuning_loss: " << result.finetuning_loss;
os << "}";
return os;
}

std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode()
<< ") @@@@@@@@@@@@@@" << std::endl;
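
A short usage sketch of the new stream operator, assuming an InferenceResult that has already been filled in by the runtime (the surrounding function is illustrative):

#include <iostream>
#include "flexflow/batch_config.h"
using namespace FlexFlow;

void dump_inference_result(InferenceResult const &result, bool verbose) {
  if (verbose) {
    // Prints MAX_NUM_TOKENS, the leading token ids, and finetuning_loss.
    std::cout << result << std::endl;
  }
}

Note that, as implemented, the operator prints only the first 16 entries of token_ids (the loop bound is hard-coded), not all MAX_NUM_TOKENS of them.
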