
[Core] Efficient CPU prefix caching for the prefill step #10888

Closed · wants to merge 11 commits
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -187,6 +187,7 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/kv_store/kv_store.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/pos_encoding_kernels.cu"
13 changes: 13 additions & 0 deletions csrc/cache.h
@@ -8,6 +8,19 @@
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

void kv_store_copy_incomplete_blocks(
    torch::Tensor& src, torch::Tensor& dst, const int64_t layer_id,
    const torch::Tensor& incomplete_block_mapping);
void kv_store_copy_blocks2CPU(torch::Tensor& src, torch::Tensor& dst,
                              const int64_t layer_id,
                              const torch::Tensor& block_mapping);

void kv_store_copy_blocks2GPU(
    torch::Tensor& src, std::vector<torch::Tensor> const& dst,
    const int64_t num_layers, const torch::Tensor& block_mapping,
    const torch::Tensor& block_offsets, const torch::Tensor& req_ids,
    std::vector<long> const& events, const bool is_batch_layer);

// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
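Reviewer aside: a minimal caller sketch for the new entry points. It assumes the [num_pairs, 2] (src, dst) row convention that the existing swap_blocks uses for block_mapping, and the CPU-store layout documented in csrc/cache_kernels.cu below. All names and sizes here are illustrative, not part of the diff.

// --- Illustrative sketch, not part of the diff ---
// Allocate a pinned CPU store with the documented layout
// [num_blocks, 2, num_layers, block_size, num_kv_heads*head_size],
// then offload two full GPU blocks of one layer into it.
// Declarations from csrc/cache.h are assumed visible.
#include <torch/torch.h>

torch::Tensor make_cpu_store(int64_t num_cpu_blocks, int64_t num_layers,
                             int64_t block_size, int64_t num_kv_heads,
                             int64_t head_size) {
  return torch::empty(
      {num_cpu_blocks, 2, num_layers, block_size, num_kv_heads * head_size},
      torch::TensorOptions().dtype(torch::kFloat16).pinned_memory(true));
}

void offload_layer(torch::Tensor& gpu_kv,     // [2, num_blocks, block_size,
                                              //  num_kv_heads, head_size]
                   torch::Tensor& cpu_store,  // layout as above
                   int64_t layer_id) {
  // Assumed (src, dst) row convention, as in swap_blocks: copy GPU blocks
  // 0 and 1 into CPU-store blocks 4 and 5 for this layer.
  auto block_mapping = torch::tensor({{0, 4}, {1, 5}}, torch::kInt64);
  kv_store_copy_blocks2CPU(gpu_kv, cpu_store, layer_id, block_mapping);
}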
43 changes: 43 additions & 0 deletions csrc/cache_kernels.cu
@@ -1,6 +1,7 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <iostream>

#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -21,6 +22,14 @@
typedef __hip_bfloat16 __nv_bfloat16;
#endif

#include "kv_store/kv_store.hpp"

namespace {

// File-local KV store instance shared by the wrapper functions below.
KVStore kv_store;

}  // namespace

void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping) {
  torch::Device src_device = src.device();
@@ -62,6 +71,40 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
  }
}

// src layout: [2, num_blocks, block_size, num_kv_heads, head_size]
// dst layout: [num_blocks, 2, num_layers, block_size, num_kv_heads*head_size]
void kv_store_copy_incomplete_blocks(
    torch::Tensor& src, torch::Tensor& dst, const int64_t layer_id,
    const torch::Tensor& incomplete_block_mapping) {
  kv_store.CopyIncompleteBlocks(src, dst, layer_id, incomplete_block_mapping);
}

// src layout: [2, num_blocks, block_size, num_kv_heads, head_size]
// dst layout: [num_blocks, 2, num_layers, block_size, num_kv_heads*head_size]
void kv_store_copy_blocks2CPU(torch::Tensor& src, torch::Tensor& dst,
                              const int64_t layer_id,
                              const torch::Tensor& block_mapping) {
  kv_store.CopyBlocks2CPU(src, dst, layer_id, block_mapping);
}

// src layout: [num_blocks, 2, num_layers, block_size, num_kv_heads*head_size]
// kv_caches layout: [layers, [2, num_blocks, block_size, num_kv_heads,
// head_size]]
void kv_store_copy_blocks2GPU(
    torch::Tensor& src, std::vector<torch::Tensor> const& kv_caches,
    const int64_t num_layers, const torch::Tensor& block_mapping,
    const torch::Tensor& block_offsets, const torch::Tensor& req_ids,
    std::vector<long> const& events, const bool is_batch_layer) {
  if (is_batch_layer) {
    // Batched path: restore all layers for every request in one call.
    const int64_t num_requests = req_ids.size(0);
    kv_store.CopyBlocks2GPUBatch(src, kv_caches, num_layers, block_mapping,
                                 block_offsets, num_requests, events);
  } else {
    // Per-layer path: restore each layer's blocks separately.
    kv_store.CopyLayerBlocks2GPU(src, kv_caches, num_layers, block_mapping,
                                 block_offsets, req_ids, events);
  }
}

namespace vllm {

// Grid: (num_layers, num_pairs)
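Reviewer aside: a sketch of driving the restore path from the caller's side, under the same assumptions as the sketch above. Only kv_store_copy_blocks2GPU's signature comes from the diff; the tensor names and the opaque per-request event handles (passed as long) are assumptions.

// --- Illustrative sketch, not part of the diff ---
// Restores cached prefix blocks for a batch of requests, choosing between
// the batched and per-layer paths dispatched on is_batch_layer above.
#include <torch/torch.h>
#include <vector>

void restore_prefix(torch::Tensor& cpu_store,
                    std::vector<torch::Tensor> const& kv_caches,  // per layer
                    torch::Tensor const& block_mapping,
                    torch::Tensor const& block_offsets,
                    torch::Tensor const& req_ids,
                    std::vector<long> const& events,
                    bool batch_all_layers) {
  const int64_t num_layers = static_cast<int64_t>(kv_caches.size());
  // is_batch_layer == true copies every layer in one batched transfer;
  // false takes the layer-by-layer path (see the dispatch above).
  kv_store_copy_blocks2GPU(cpu_store, kv_caches, num_layers, block_mapping,
                           block_offsets, req_ids, events,
                           /*is_batch_layer=*/batch_all_layers);
}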