Add a background server for RequestManager #1223

Merged: 34 commits merged into inference from background_worker on Jan 14, 2024

Commits (34)

c33ec8d
add a background server for RequestManager
jiazhihao Nov 2, 2023
9ec4cdb
.
jiazhihao Nov 4, 2023
8260fd8
make incr_decoding work
jiazhihao Nov 4, 2023
9bbc806
make spec_infer work
jiazhihao Nov 5, 2023
3b6f7a9
format
jiazhihao Nov 5, 2023
5ebc914
update python inference
jiazhihao Nov 5, 2023
e1d606f
resolve merge conflict
jiazhihao Nov 5, 2023
be42e20
fix python issues
jiazhihao Nov 5, 2023
400d5bd
bug fix
jiazhihao Nov 5, 2023
2a17173
Merge branch 'inference' into background_worker
goliaro Nov 6, 2023
56f9f2b
Merge branch 'inference' into background_worker
jiazhihao Nov 10, 2023
0713433
add a Legion future to capture the termination of the background server
jiazhihao Nov 10, 2023
499fab8
Merge branch 'inference' into background_worker
jiazhihao Nov 15, 2023
d908b1a
Merge branch 'inference' into background_worker
zwang86 Nov 17, 2023
938a2d6
Merge branch 'inference' into background_worker
zwang86 Nov 28, 2023
7125f95
Merge branch 'inference' into background_worker
zwang86 Dec 1, 2023
91c7e94
Merge branch 'inference' into background_worker
zwang86 Dec 11, 2023
6cdd948
Merge branch 'inference' into background_worker
zwang86 Dec 13, 2023
8485edd
Merge branch 'inference' into background_worker
zwang86 Jan 5, 2024
c497ec2
Add thread safety for background server.
zwang86 Jan 5, 2024
99cc9ac
Simplify backend server design.
zwang86 Jan 5, 2024
4b4d1a9
resolve conflict.
zwang86 Jan 5, 2024
70212f6
Merge branch 'inference' into background_worker
zwang86 Jan 12, 2024
a58aa6d
Add server task timeout.
zwang86 Jan 12, 2024
1725c81
Merge branch 'inference' of https://github.com/flexflow/FlexFlow into…
jiazhihao Jan 12, 2024
4dd98bb
Merge branch 'inference' of https://github.com/flexflow/FlexFlow into…
jiazhihao Jan 12, 2024
0bce49a
register callbacks to terminate background worker at exit or termination
jiazhihao Jan 12, 2024
058308c
[Python] enable decoding multiple requests
jiazhihao Jan 13, 2024
37feea4
update README.md and default configuration
jiazhihao Jan 13, 2024
240c532
[Python] no need to use the llm context environment to start/stop the…
jiazhihao Jan 13, 2024
0b3289c
require at least four cpu cores
jiazhihao Jan 13, 2024
1f1bffc
[Python] add back explict start_server()/stop_server().
zwang86 Jan 13, 2024
8bfaf6a
fix
xinhaoc Jan 13, 2024
8db2650
fix python chatgpt.json
xinhaoc Jan 14, 2024

Changes from all commits

24 changes: 19 additions & 5 deletions .github/README.md
@@ -79,7 +79,12 @@ ssms=[]
ssm = ff.SSM("JackFram/llama-68m")
ssms.append(ssm)
```
Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs.
Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. You can also use the following arguments to specify serving configuration when compiling LLMs and SSMs:

* max\_requests\_per\_batch: the maximum number of requests to serve in a batch (default: 16)
* max\_seq\_length: the maximum number of tokens in a request (default: 256)
* max\_tokens\_per\_batch: the maximum number of tokens to process in a batch (default: 128)

```python
# Create the sampling configs
generation_config = ff.GenerationConfig(
@@ -91,11 +96,16 @@ for ssm in ssms:
ssm.compile(generation_config)

# Compile the LLM for inference and load the weights into memory
llm.compile(generation_config, ssms=ssms)
llm.compile(generation_config,
max_requests_per_batch = 16,
max_seq_length = 256,
max_tokens_per_batch = 128,
ssms=ssms)
```
Finally, we call `llm.generate` to generate the output, which is returned as a list of `GenerationResult` objects that include the output tokens and text.
```python
result = llm.generate("Here are some travel tips for Tokyo:\n")
with llm:
result = llm.generate("Here are some travel tips for Tokyo:\n")
```

### Incremental decoding
@@ -124,10 +134,14 @@ generation_config = ff.GenerationConfig(
)

# Compile the LLM for inference and load the weights into memory
llm.compile(generation_config)
llm.compile(generation_config,
max_requests_per_batch = 16,
max_seq_length = 256,
max_tokens_per_batch = 128)

# Generation begins!
result = llm.generate("Here are some travel tips for Tokyo:\n")
with llm:
result = llm.generate("Here are some travel tips for Tokyo:\n")
```

</details>
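The README hunks above wrap `llm.generate` in a `with llm:` block, which starts the RequestManager's background server on entry and shuts it down on exit; commit 1f1bffc also adds back explicit `start_server()`/`stop_server()` calls. Below is a minimal Python sketch of the two (presumably equivalent) styles, reusing the `ff` names from the README; the import path, the `ff.LLM` constructor, and the model path are assumptions, not taken from the diff:

```python
import flexflow.serve as ff  # import path assumed

# Build and compile the LLM as in the README above; the model path is only a placeholder
llm = ff.LLM("JackFram/llama-68m")
llm.compile(ff.GenerationConfig(),
            max_requests_per_batch=16,
            max_seq_length=256,
            max_tokens_per_batch=128)

# Style 1: context manager -- the background server starts on enter and stops on exit
with llm:
    result = llm.generate("Here are some travel tips for Tokyo:\n")

# Style 2 (use one style per script): explicit lifecycle, as re-added in commit 1f1bffc
llm.start_server()
result = llm.generate("Here are some travel tips for Tokyo:\n")
llm.stop_server()
```
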
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -264,14 +264,14 @@ if(NOT BUILD_LEGION_ONLY)
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/include/*.h)

list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)
#list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)

file(GLOB_RECURSE FLEXFLOW_SRC
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/src/*.cc)

list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)
#list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)

set(FLEXFLOW_CPP_DRV_SRC
${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
31 changes: 21 additions & 10 deletions include/flexflow/flexflow_c.h
@@ -611,13 +611,13 @@ flexflow_perf_metrics_t

void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

flexflow_generation_result_t
flexflow_model_generate(flexflow_model_t handle_,
char const *input_text,
int max_num_chars,
char *output_text,
int max_seq_length,
int *output_length_and_tokens);
void flexflow_model_generate(flexflow_model_t handle_,
int num_requests,
char const **input_text,
int max_num_chars,
char **output_text,
int max_seq_length,
int **output_length_and_tokens);

void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);

@@ -988,6 +988,12 @@ void flexflow_request_manager_register_output_filepath(
int flexflow_request_manager_register_ssm_model(
flexflow_request_manager_t handle_, flexflow_model_t model_handle_);

void flexflow_request_manager_start_background_server(
flexflow_request_manager_t handle_, flexflow_model_t model_handle_);

void flexflow_request_manager_terminate_background_server(
flexflow_request_manager_t handle_);

// -----------------------------------------------------------------------
// InferenceManager
// -----------------------------------------------------------------------
@@ -1004,6 +1010,11 @@ void flexflow_inference_manager_compile_model_and_allocate_buffer(
void flexflow_inference_manager_init_operators_inference(
flexflow_inference_manager_t handle_, flexflow_model_t model_handle);

void flexflow_inference_manager_register_model_weights_loader(
flexflow_inference_manager_t handle_,
flexflow_model_t model_handle,
flexflow_file_data_loader_t loader_handle);

// -----------------------------------------------------------------------
// FileDataLoader
// -----------------------------------------------------------------------
@@ -1014,13 +1025,13 @@ flexflow_file_data_loader_t
int num_kv_heads,
int hidden_dim,
int qkv_inner_dim,
int tensor_parallelism_degree);
int tensor_parallelism_degree,
bool use_full_precision);

void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);

void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_,
bool use_full_precision);
flexflow_model_t model_handle_);

#ifdef __cplusplus
}
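The binding change above replaces the single-request `flexflow_model_generate` with a batched form taking `num_requests` and arrays of input/output buffers, and adds C entry points for starting and terminating the RequestManager's background server. At the Python layer this is what enables decoding several prompts in one call (commit 058308c). A rough usage sketch follows; the result field names are assumed rather than taken from the diff:

```python
# Batched generation sketch; `llm` is a compiled ff.LLM as in the example above.
prompts = [
    "Here are some travel tips for Tokyo:\n",
    "Here are some travel tips for Kyoto:\n",
]

with llm:  # the background server runs for the duration of the block
    results = llm.generate(prompts)  # expected: one GenerationResult per prompt

for r in results:
    print(r.output_text)  # field name assumed; each result also carries the output tokens
```
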
5 changes: 3 additions & 2 deletions include/flexflow/model.h
@@ -247,6 +247,7 @@ enum TaskIDs {
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
RM_BACKGROUND_SERVING_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
CUSTOM_GPU_TASK_ID_1,
@@ -806,8 +807,8 @@ class FFModel {
// ========================================
// Inference APIs
// ========================================
GenerationResult generate(std::vector<std::string> &prompts,
int max_seq_length);
std::vector<GenerationResult> generate(std::vector<std::string> &prompts,
int max_seq_length);

Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
76 changes: 53 additions & 23 deletions include/flexflow/request_manager.h
@@ -18,6 +18,8 @@
#include "flexflow/batch_config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/file_loader.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>

@@ -30,25 +32,29 @@ using tokenizers::Tokenizer;

class InferenceManager {
public:
InferenceManager(FFConfig const &config);
InferenceManager();
static InferenceManager *get_inference_manager();
void compile_model_and_allocate_buffer(FFModel *model);
void init_operators_inference(FFModel *model);
Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc);
Legion::FutureMap
inference(FFModel *model, int index, BatchConfigFuture const &bc);
void load_input_tokens_from_batch_config(BatchConfigFuture const &bc,
void load_input_tokens_from_batch_config(FFModel *model,
BatchConfigFuture const &bc,
ParallelTensor const input,
FFHandler *handlers);
void load_positions(BatchConfigFuture const &bc,
void load_positions(FFModel *model,
BatchConfigFuture const &bc,
ParallelTensor position_input,
int offset);
void load_inference_metadata_batch_config(BatchConfigFuture const &bc,
void register_model_weights_loader(FFModel *, FileDataLoader *);
void load_inference_metadata_batch_config(FFModel *model,
BatchConfigFuture const &bc,
FFHandler *handlers);

public:
FFConfig ff_config;
std::unordered_map<ParallelTensor, std::vector<ParallelTensor>> tensor_buffer;
std::unordered_map<FFModel *, FileDataLoader *> model_weights_loaders;
int num_devices;
};

@@ -91,9 +97,15 @@ struct BeamTree {

class RequestManager {
public:
enum Status {
INITIALIZED = 1001,
SERVING = 1002,
TERMINATED = 1003,
};
using RequestGuid = BatchConfig::RequestGuid;
using TokenId = BatchConfig::TokenId;

static const RequestGuid INVALID_GUID = 0;
RequestManager();
static RequestManager *get_request_manager();
size_t get_num_processed_requests();
@@ -125,42 +137,54 @@
int initLength,
int non_tree_size);

FFModel *get_model(int model_id);
FFModel *get_ssm_model(int model_id);

GenerationResult generate_incr_decoding(FFModel *model,
std::vector<std::string> &prompts,
int max_seq_length);
GenerationResult generate_spec_infer(FFModel *model,
std::vector<std::string> &prompts,
int max_seq_length);
void serve_incr_decoding(FFModel *model);
void serve_spec_infer(FFModel *model);
GenerationResult get_generation_result(RequestGuid const &guid);
RequestGuid register_new_request(std::string const &prompt,
int max_sequence_length);
RequestGuid register_new_request(std::vector<TokenId> const &prompt,
int max_sequence_length);
// Methods to start and terminate request manager's background task
void start_background_server(FFModel *model);
bool is_background_server_terminated();
void terminate_background_server();
static void terminate_background_server_at_exit();
// Methods to check and mark request completion
bool is_request_completed(RequestGuid const &guid);
void trigger_request_completion_future(RequestGuid const &guid);
// Methods for preparing next batches
BatchConfig prepare_next_batch(BatchConfig const &bc,
InferenceResult const &result);
BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc,
InferenceResultFuture const &result);
InferenceResultFuture const &result,
Legion::Context ctx,
Legion::Runtime *runtime);
BeamSearchBatchConfig
prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
BeamSearchBatchConfigFuture
prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
BeamInferenceResultFuture const &result);
BeamInferenceResultFuture const &result,
Legion::Context ctx,
Legion::Runtime *runtime);
BeamSearchBatchConfig
prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
InferenceResult const &result,
int model_id);
BeamSearchBatchConfigFuture
prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
InferenceResultFuture const &result,
int model_id);
int model_id,
Legion::Context ctx,
Legion::Runtime *runtime);
TreeVerifyBatchConfig prepare_next_batch_verify(
std::vector<BeamSearchBatchConfig> const &old_batches);
TreeVerifyBatchConfigFuture prepare_next_batch_verify(
std::vector<BeamSearchBatchConfigFuture> const &old_batches);
std::vector<BeamSearchBatchConfigFuture> const &old_batches,
Legion::Context ctx,
Legion::Runtime *runtime);

void store_beam_metadata(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
@@ -187,7 +211,11 @@
&inputSerializedTree,
std::vector<std::pair<BatchConfig::TokenId, int>> const
&outputSerializedTree);

static void background_serving_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
load_tokens_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
@@ -233,9 +261,11 @@
int max_requests_per_batch;
int max_tokens_per_batch;
int max_sequence_length;
Status request_manager_status;

// tree width in each speculative step, if not specified 1
std::vector<int> spec_infer_tree_width;

// private fields
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
@@ -247,12 +277,9 @@
std::unordered_map<RequestGuid, Request> all_requests;
std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
std::mutex request_queue_mutex;
std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
std::mutex request_to_promise_mutex;
RequestGuid next_available_guid;
// Legion futures for inc_decoding and spec_infer
BatchConfigFuture last_bcf;
InferenceResultFuture last_irf;
TreeVerifyBatchConfigFuture last_tree_bcf;
InferenceResultFuture last_tree_irf;

// TODO: Move this two vector to request struct
std::unordered_map<RequestGuid,
@@ -262,11 +289,14 @@
committed_tokens;

// Multi-model support
std::vector<FFModel *> models;
std::vector<FFModel *> ssm_models;

// Performance profiling
size_t num_processed_requests;

// Background server handler
Legion::Future background_server_handler;

private:
struct ProfileInfo {
int llm_decoding_steps;
11 changes: 5 additions & 6 deletions inference/file_loader.h → include/flexflow/utils/file_loader.h
@@ -30,18 +30,16 @@ class FileDataLoader {
int _num_kv_heads,
size_t _hidden_dim,
size_t _qkv_inner_dim,
int _tensor_parallelism_degree);
int _tensor_parallelism_degree,
bool _use_full_precision);

BatchConfig::TokenId *generate_requests(int num, int length);

template <typename DT>
void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx);

void load_quantization_weight(FFModel *ff,
Layer *l,
int weight_idx,
bool use_full_precision);
void load_weights(FFModel *ff, bool use_full_precision);
void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx);
void load_weights(FFModel *ff);

void load_positions(FFModel *ff,
Tensor pt,
@@ -54,4 +52,5 @@
size_t hidden_dim, qkv_inner_dim;
std::string prompts_filepath;
std::string weights_folder;
bool use_full_precision;
};
1 change: 0 additions & 1 deletion inference/incr_decoding/CMakeLists.txt
@@ -7,7 +7,6 @@ set(project_target incr_decoding)
set(CPU_SRC
${FLEXFLOW_CPP_DRV_SRC}
incr_decoding.cc
../file_loader.cc
../models/llama.cc
../models/opt.cc
../models/falcon.cc
8 changes: 7 additions & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -24,6 +24,7 @@

#include <nlohmann/json.hpp>

using namespace FlexFlow;
using namespace Legion;
using json = nlohmann::json;

@@ -250,6 +251,8 @@ void FlexFlow::top_level_task(Task const *task,
assert(false && "unknow model type");
}

rm->start_background_server(&model);

int total_num_requests = 0;
{
using json = nlohmann::json;
@@ -266,10 +269,13 @@
total_num_requests++;
prompts.push_back(text);
}
GenerationResult result =
std::vector<GenerationResult> result =
model.generate(prompts, 128 /*max_sequence_length*/);
}

// terminate the request manager by stopping the background thread
rm->terminate_background_server();

// Execution fence
{
Future future = runtime->issue_execution_fence(ctx);