feat: sync whisper.cpp
jhen0409 committed Dec 9, 2023
1 parent 913954c commit c9ec9f7
Showing 16 changed files with 2,755 additions and 861 deletions.
cpp/ggml-alloc.c (43 additions & 8 deletions)
@@ -137,7 +137,7 @@ void wsp_ggml_tallocr_alloc(wsp_ggml_tallocr_t alloc, struct wsp_ggml_tensor * t
 
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    size_t cur_max = (char*)addr - (char*)alloc->base + size;
     if (cur_max > alloc->max_size) {
         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
@@ -168,10 +168,6 @@ static void wsp_ggml_tallocr_free_tensor(wsp_ggml_tallocr_t alloc, struct wsp_gg
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
-    if (!alloc->measure) {
-        wsp_ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef WSP_GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
 #endif
@@ -237,7 +233,7 @@ void wsp_ggml_tallocr_reset(wsp_ggml_tallocr_t alloc) {
 }
 
 wsp_ggml_tallocr_t wsp_ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct wsp_ggml_backend_buffer * buffer = wsp_ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+    struct wsp_ggml_backend_buffer * buffer = wsp_ggml_backend_cpu_buffer_from_ptr(data, size);
 
     wsp_ggml_tallocr_t alloc = (wsp_ggml_tallocr_t)malloc(sizeof(struct wsp_ggml_tallocr));
 
@@ -449,7 +445,6 @@ static wsp_ggml_tallocr_t node_tallocr(wsp_ggml_gallocr_t galloc, struct wsp_ggm
 static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view, bool update_backend) {
     wsp_ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
     WSP_GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(wsp_ggml_gallocr_t galloc, struct wsp_ggml_tensor * view,
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the wsp_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(wsp_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    assert(wsp_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
         wsp_ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t wsp_ggml_allocr_max_size(wsp_ggml_allocr_t alloc) {
 size_t wsp_ggml_allocr_alloc_graph(wsp_ggml_allocr_t alloc, struct wsp_ggml_cgraph * graph) {
     return wsp_ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct wsp_ggml_context * ctx, wsp_ggml_backend_buffer_type_t buft) {
+    WSP_GGML_ASSERT(wsp_ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = wsp_ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct wsp_ggml_tensor * t = wsp_ggml_get_first_tensor(ctx); t != NULL; t = wsp_ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += WSP_GGML_PAD(wsp_ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_buft_alloc_buffer(buft, nbytes);
+    wsp_ggml_tallocr_t tallocr = wsp_ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct wsp_ggml_tensor * t = wsp_ggml_get_first_tensor(ctx); t != NULL; t = wsp_ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                wsp_ggml_tallocr_alloc(tallocr, t);
+            } else {
+                wsp_ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    wsp_ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_ctx_tensors(struct wsp_ggml_context * ctx, wsp_ggml_backend_t backend) {
+    return wsp_ggml_backend_alloc_ctx_tensors_from_buft(ctx, wsp_ggml_backend_get_default_buffer_type(backend));
+}
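
For reference, the new helper is meant to be used with a no_alloc context: tensor metadata is created first, then every tensor is placed in a single backend buffer in one pass. Below is a minimal usage sketch, not part of this commit; it assumes the CPU backend and the wsp_-prefixed headers shipped in this repo's cpp/ directory, and the sizes and shapes are illustrative only.

// Sketch: allocating all tensors of a context in one backend buffer.
// Error handling is omitted for brevity.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

void example_alloc_ctx_tensors(void) {
    // no_alloc = true: the context holds only tensor metadata, no data buffers
    struct wsp_ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);

    // define tensors; no data is allocated yet
    wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1024);
    wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 64, 64);

    // place every tensor in a single buffer of the backend's default buffer type
    wsp_ggml_backend_t backend = wsp_ggml_backend_cpu_init();
    wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors(ctx, backend);

    // ... use the tensors ...

    wsp_ggml_backend_buffer_free(buf);
    wsp_ggml_backend_free(backend);
    wsp_ggml_free(ctx);
}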
cpp/ggml-alloc.h (7 additions & 0 deletions)
@@ -8,6 +8,7 @@ extern "C" {
 
 struct wsp_ggml_backend;
 struct wsp_ggml_backend_buffer;
+struct wsp_ggml_backend_buffer_type;
 
 //
 // Legacy API
@@ -80,6 +81,12 @@ WSP_GGML_API void wsp_ggml_gallocr_alloc_graph_n(
     struct wsp_ggml_hash_set hash_set,
     wsp_ggml_tallocr_t * hash_node_talloc);
 
+
+// Utils
+// Create a buffer and allocate all the tensors in a wsp_ggml_context
+WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_backend_alloc_ctx_tensors_from_buft(struct wsp_ggml_context * ctx, struct wsp_ggml_backend_buffer_type * buft);
+WSP_GGML_API struct wsp_ggml_backend_buffer * wsp_ggml_backend_alloc_ctx_tensors(struct wsp_ggml_context * ctx, struct wsp_ggml_backend * backend);
+
 #ifdef __cplusplus
 }
 #endif
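
The _from_buft variant takes the buffer type explicitly instead of deriving it from a backend, which matters when a backend exposes more than one buffer type. A short sketch of hypothetical usage; it assumes the CPU buffer type getter wsp_ggml_backend_cpu_buffer_type() from ggml-backend.h and the ctx from the sketch above:

// Sketch: allocate the context's tensors from an explicitly chosen buffer type.
wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_cpu_buffer_type();
wsp_ggml_backend_buffer_t buf = wsp_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);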
cpp/ggml-backend-impl.h (46 additions & 21 deletions)
@@ -12,31 +12,50 @@ extern "C" {
     // Backend buffer
     //
 
+    // buffer type
+    typedef void * wsp_ggml_backend_buffer_type_context_t;
+
+    struct wsp_ggml_backend_buffer_type_i {
+        wsp_ggml_backend_buffer_t (*alloc_buffer)    (wsp_ggml_backend_buffer_type_t buft, size_t size);
+        size_t                    (*get_alignment)   (wsp_ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                    (*get_alloc_size)  (wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                      (*supports_backend)(wsp_ggml_backend_buffer_type_t buft, wsp_ggml_backend_t backend); // check if the buffer type is usable by the backend
+    };
+
+    struct wsp_ggml_backend_buffer_type {
+        struct wsp_ggml_backend_buffer_type_i  iface;
+        wsp_ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
     typedef void * wsp_ggml_backend_buffer_context_t;
 
     struct wsp_ggml_backend_buffer_i {
-        void   (*free_buffer)   (wsp_ggml_backend_buffer_t buffer);
-        void * (*get_base)      (wsp_ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor); // pre-free callback
+        void   (*free_buffer)(wsp_ggml_backend_buffer_t buffer);
+        //void (*reset)      (wsp_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void * (*get_base)   (wsp_ggml_backend_buffer_t buffer);
+        void   (*init_tensor)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor);
+        void   (*set_tensor) (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor) (wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
+        void (*cpy_tensor_from)(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
+        void (*cpy_tensor_to)  (wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
     };
 
     struct wsp_ggml_backend_buffer {
-        struct wsp_ggml_backend_buffer_i iface;
-
-        wsp_ggml_backend_t backend;
+        struct wsp_ggml_backend_buffer_i  iface;
+        wsp_ggml_backend_buffer_type_t    buft;
         wsp_ggml_backend_buffer_context_t context;
-
         size_t size;
     };
 
-    WSP_GGML_API wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
-            struct wsp_ggml_backend * backend,
+    wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
+            wsp_ggml_backend_buffer_type_t buft,
             struct wsp_ggml_backend_buffer_i iface,
                    wsp_ggml_backend_buffer_context_t context,
                    size_t size);
 
+
     //
     // Backend
     //
@@ -49,20 +68,17 @@ extern "C" {
     void (*free)(wsp_ggml_backend_t backend);
 
     // buffer allocation
-    wsp_ggml_backend_buffer_t (*alloc_buffer)(wsp_ggml_backend_t backend, size_t size);
+    wsp_ggml_backend_buffer_type_t (*get_default_buffer_type)(wsp_ggml_backend_t backend);
 
-    // get buffer alignment
-    size_t (*get_alignment)(wsp_ggml_backend_t backend);
-
-    // tensor data access
-    // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+    // (optional) asynchroneous tensor data access
     void (*set_tensor_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     void (*get_tensor_async)(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size);
-    void (*synchronize) (wsp_ggml_backend_t backend);
 
-    // (optional) copy tensor between different backends, allow for single-copy tranfers
-    void (*cpy_tensor_from)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
-    void (*cpy_tensor_to)  (wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
+    // (optional) asynchroneous tensor copy
+    void (*cpy_tensor_from_async)(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
+    void (*cpy_tensor_to_async)  (wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst);
+
+    void (*synchronize) (wsp_ggml_backend_t backend);
 
     // compute graph with a plan
     wsp_ggml_backend_graph_plan_t (*graph_plan_create) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
Expand All @@ -82,6 +98,15 @@ extern "C" {
wsp_ggml_backend_context_t context;
};


//
// Backend registry
//

typedef wsp_ggml_backend_t (*wsp_ggml_backend_init_fn)(const char * params, void * user_data);

void wsp_ggml_backend_register(const char * name, wsp_ggml_backend_init_fn init_fn, wsp_ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
#endif
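
The registry block is new in this sync: a backend hands the registry an init callback and a default buffer type under a name, so callers can enumerate and construct backends generically. A minimal sketch of how a registration could look; my_backend_init, the backend name, and the CPU stand-ins are illustrative assumptions, not part of this commit:

// Hypothetical init callback matching wsp_ggml_backend_init_fn.
// "params" is a backend-defined string (e.g. a device index); user_data is opaque.
static wsp_ggml_backend_t my_backend_init(const char * params, void * user_data) {
    (void) params;
    (void) user_data;
    return wsp_ggml_backend_cpu_init(); // stand-in: a real backend would construct itself here
}

// Run once at startup; afterwards the backend can be looked up by name.
static void my_backend_register(void) {
    wsp_ggml_backend_register("MyBackend", my_backend_init, wsp_ggml_backend_cpu_buffer_type(), NULL);
}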