Skip to content

Commit

Permalink
working lstm
Browse files Browse the repository at this point in the history
  • Loading branch information
PABannier committed Oct 19, 2024
1 parent d1118e9 commit 13d6ed1
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 65 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ encodec
*.th
.vscode/

build/
build/

*.wav
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(ENCODEC_STANDALONE ON)
Expand Down
178 changes: 115 additions & 63 deletions encodec.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml/src/ggml-impl.h"

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
Expand Down Expand Up @@ -32,6 +33,7 @@
#include "quantizer.h"

#define ENCODEC_FILE_MAGIC 'ggml'
#define ENCODEC_MAX_NODES 80000

typedef enum {
// Run the end-to-end encoder-decoder pipeline
Expand Down Expand Up @@ -96,9 +98,28 @@ struct encodec_model {
std::map<std::string, struct ggml_tensor *> tensors;
};

// Custom deleter for a heap-allocated ggml_cgraph produced by
// encodec_ggml_cgraph_create. Frees every array owned by the graph and then
// the graph struct itself. free(NULL) is a defined no-op, so individual
// member null checks are unnecessary; only the cgraph pointer is guarded.
struct encodec_ggml_cgraph_deleter {
    void operator()(struct ggml_cgraph * cgraph) {
        if (!cgraph) {
            return;
        }
        free(cgraph->nodes);
        free(cgraph->leafs);
        free(cgraph->visited_hash_set.keys);
        // the bitset is allocated alongside the keys in
        // encodec_ggml_cgraph_create; it was previously leaked here
        free(cgraph->visited_hash_set.used);
        free(cgraph->grads);
        free(cgraph);
    }
};

struct encodec_context {
encodec_model model;

// computational graph stored on the heap to avoid stack overflows
// the computational graph grows with the sequence length (because of the LSTM)
// which requires a lot of nodes
std::unique_ptr<struct ggml_cgraph, encodec_ggml_cgraph_deleter> gf;

// buffer for model evaluation
ggml_backend_buffer_t buf_compute;

Expand Down Expand Up @@ -201,7 +222,6 @@ bool encodec_load_model_weights(std::ifstream &infile, encodec_model &model, int
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
Expand Down Expand Up @@ -473,24 +493,65 @@ bool encodec_load_model_weights(std::ifstream &infile, encodec_model &model, int
model.n_loaded++;
}

printf("%s: model size = %8.2f MB\n", __func__, total_size / 1024.0 / 1024.0);
printf("%s: model size = %.2f MB\n", __func__, total_size / 1024.0 / 1024.0);
}

infile.close();

return true;
}

struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx,
const float * inp_audio,
const int n_samples,
const encodec_run_mode_t mode) {
// Heap-allocate a ggml_cgraph with capacity for `size` nodes and `size`
// leafs, bypassing the fixed-size ggml_context arena. The LSTM graph grows
// with sequence length, so it can need far more nodes than a default-sized
// context holds. The returned graph must be released with
// encodec_ggml_cgraph_deleter (see encodec_context::gf).
static struct ggml_cgraph * encodec_ggml_cgraph_create(size_t size) {
    struct ggml_cgraph * cgraph = (struct ggml_cgraph *)calloc(1, sizeof(struct ggml_cgraph));
    cgraph->size    = size;
    cgraph->n_nodes = 0;
    cgraph->n_leafs = 0;
    // calloc(count, elem_size) form lets the allocator check the
    // multiplication for overflow (identical allocation otherwise)
    cgraph->nodes = (struct ggml_tensor **)calloc(size, sizeof(struct ggml_tensor *));
    cgraph->leafs = (struct ggml_tensor **)calloc(size, sizeof(struct ggml_tensor *));

    // candidate hash-table sizes: the next prime after each power of two
    static const size_t primes[] = {
        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
        2053, 4099, 8209, 16411, 32771, 65537, 131101,
        262147, 524309, 1048583, 2097169, 4194319, 8388617,
        16777259, 33554467, 67108879, 134217757, 268435459,
        536870923, 1073741827, 2147483659
    };
    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);

    // binary-search the smallest prime that is >= 2*size, so the visited
    // hash set stays at most half full to keep collisions low
    size_t l = 0;
    size_t r = n_primes;
    while (l < r) {
        size_t m = (l + r)/2;
        if (primes[m] < size * 2) {
            l = m + 1;
        } else {
            r = m;
        }
    }
    // fallback past the table: 2*size + 1 keeps the load factor <= 0.5
    // (odd, though not necessarily prime)
    size_t hash_size = l < n_primes ? primes[l] : (size * 2 + 1);

    cgraph->visited_hash_set.size = hash_size;
    cgraph->visited_hash_set.keys = (struct ggml_tensor **)calloc(hash_size, sizeof(struct ggml_tensor *));
    cgraph->visited_hash_set.used = (ggml_bitset_t *)calloc(ggml_bitset_size(hash_size), sizeof(ggml_bitset_t));
    cgraph->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;

    return cgraph;
}

void encodec_build_graph(struct encodec_context *ectx,
const float * inp_audio,
const int n_samples,
const encodec_run_mode_t mode) {
assert(mode == encodec_run_mode_t::FULL || mode == encodec_run_mode_t::ENCODE);

const auto & model = ectx->model;
const auto & hparams = model.hparams;
const auto & allocr = ectx->allocr;

auto & gf = ectx->gf;

const int *ratios = hparams.ratios;
const int kernel_size = hparams.kernel_size;
const int res_kernel_sz = hparams.residual_kernel_size;
Expand All @@ -504,7 +565,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx,

// since we are using ggml-alloc, this buffer only needs enough space to hold the
// ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static size_t buf_size = ggml_tensor_overhead() * ENCODEC_MAX_NODES + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params ggml_params = {
Expand All @@ -515,7 +576,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx,

struct ggml_context *ctx0 = ggml_init(ggml_params);

struct ggml_cgraph *gf = ggml_new_graph(ctx0);
gf = std::unique_ptr<struct ggml_cgraph, encodec_ggml_cgraph_deleter>(encodec_ggml_cgraph_create(ENCODEC_MAX_NODES));

struct ggml_tensor *inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_samples);
ggml_set_name(inp, "inp");
Expand All @@ -541,19 +602,18 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx,
case encodec_run_mode_t::FULL: {
ggml_set_name(decoded, "decoded");
ggml_set_output(decoded);
ggml_build_forward_expand(gf, decoded);
ggml_build_forward_expand(gf.get(), decoded);
} break;
case encodec_run_mode_t::ENCODE: {
ggml_set_name(codes, "codes");
ggml_set_output(codes);
ggml_build_forward_expand(gf, codes);
ggml_build_forward_expand(gf.get(), codes);
} break;
case encodec_run_mode_t::DECODE: {
return NULL;
assert(false);
} break;
default: {
fprintf(stderr, "%s: unknown run mode\n", __func__);
return NULL;
} break;
}

Expand All @@ -562,18 +622,18 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx,
ectx->encoded = encoded;
ectx->codes = codes;
ectx->decoded = decoded;

return gf;
}

struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int32_t *codes,
const int n_codes, const encodec_run_mode_t mode) {
void encodec_build_graph(struct encodec_context *ectx, const int32_t *codes,
const int n_codes, const encodec_run_mode_t mode) {
assert(mode == encodec_run_mode_t::DECODE);

const auto & model = ectx->model;
const auto & hparams = model.hparams;
const auto & allocr = ectx->allocr;

auto & gf = ectx->gf;

const int n_bins = hparams.n_bins;
const int sr = hparams.sr;
const int bandwidth = hparams.bandwidth;
Expand All @@ -589,12 +649,12 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3

if (n_codes % n_q != 0) {
fprintf(stderr, "%s: invalid number of codes\n", __func__);
return NULL;
assert(false);
}

const int N = n_codes / n_q;

static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static size_t buf_size = ggml_tensor_overhead() * ENCODEC_MAX_NODES + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params ggml_params = {
Expand All @@ -605,7 +665,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3

struct ggml_context *ctx0 = ggml_init(ggml_params);

struct ggml_cgraph *gf = ggml_new_graph(ctx0);
gf = std::unique_ptr<struct ggml_cgraph, encodec_ggml_cgraph_deleter>(encodec_ggml_cgraph_create(ENCODEC_MAX_NODES));

struct ggml_tensor *inp_codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_q);
ggml_set_name(inp_codes, "inp_codes");
Expand All @@ -626,20 +686,18 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3
case encodec_run_mode_t::DECODE: {
ggml_set_name(decoded, "decoded");
ggml_set_output(decoded);
ggml_build_forward_expand(gf, decoded);
ggml_build_forward_expand(gf.get(), decoded);
} break;
default: {
fprintf(stderr, "%s: unknown run mode\n", __func__);
return NULL;
assert(false);
} break;
}

ggml_free(ctx0);

ectx->codes = inp_codes;
ectx->decoded = decoded;

return gf;
}

static void encodec_zero_tensor(struct ggml_cgraph *gf, const char *name) {
Expand All @@ -652,39 +710,36 @@ bool encodec_eval_internal(struct encodec_context *ectx, const float * raw_audio
const encodec_run_mode_t mode) {
auto & model = ectx->model;
auto & allocr = ectx->allocr;
auto & gf = ectx->gf;

struct ggml_cgraph *gf = encodec_build_graph(ectx, raw_audio, n_samples, mode);
encodec_build_graph(ectx, raw_audio, n_samples, mode);

// allocate the graph tensors
ggml_gallocr_alloc_graph(allocr, gf);
ggml_gallocr_alloc_graph(allocr, gf.get());

// set the graph inputs
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp");
struct ggml_tensor * inp = ggml_graph_get_tensor(gf.get(), "inp");
ggml_backend_tensor_set(inp, raw_audio, 0, n_samples * ggml_element_size(inp));

// make sure accumulation tensor are zeroed
encodec_zero_tensor(gf, "enc_l0_ht");
encodec_zero_tensor(gf, "enc_l1_ht");
encodec_zero_tensor(gf, "enc_l0_ct");
encodec_zero_tensor(gf, "enc_l1_ct");
encodec_zero_tensor(gf.get(), "enc_l0_ht");
encodec_zero_tensor(gf.get(), "enc_l1_ht");
encodec_zero_tensor(gf.get(), "enc_l0_ct");
encodec_zero_tensor(gf.get(), "enc_l1_ct");

encodec_zero_tensor(gf, "dec_l0_ht");
encodec_zero_tensor(gf, "dec_l1_ht");
encodec_zero_tensor(gf, "dec_l0_ct");
encodec_zero_tensor(gf, "dec_l1_ct");
encodec_zero_tensor(gf.get(), "dec_l0_ht");
encodec_zero_tensor(gf.get(), "dec_l1_ht");
encodec_zero_tensor(gf.get(), "dec_l0_ct");
encodec_zero_tensor(gf.get(), "dec_l1_ct");

encodec_zero_tensor(gf, "quantized_out");
encodec_zero_tensor(gf.get(), "quantized_out");

// run the computation
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(model.backend)) {
ggml_backend_metal_set_n_cb(model.backend, n_threads);
}
#endif
ggml_backend_graph_compute(model.backend, gf);

ggml_backend_graph_compute(model.backend, gf.get());

return true;
}
Expand All @@ -694,39 +749,36 @@ bool encodec_eval_internal(struct encodec_context *ectx, const int32_t *codes,
const encodec_run_mode_t mode) {
auto & model = ectx->model;
auto & allocr = ectx->allocr;
auto & gf = ectx->gf;

struct ggml_cgraph *gf = encodec_build_graph(ectx, codes, n_codes, mode);
encodec_build_graph(ectx, codes, n_codes, mode);

// allocate the graph tensors
ggml_gallocr_alloc_graph(allocr, gf);
ggml_gallocr_alloc_graph(allocr, gf.get());

// set the graph inputs
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp_codes");
struct ggml_tensor * inp = ggml_graph_get_tensor(gf.get(), "inp_codes");
ggml_backend_tensor_set(inp, codes, 0, n_codes * ggml_element_size(inp));

// make sure accumulation tensor are zeroed
encodec_zero_tensor(gf, "enc_l0_ht");
encodec_zero_tensor(gf, "enc_l1_ht");
encodec_zero_tensor(gf, "enc_l0_ct");
encodec_zero_tensor(gf, "enc_l1_ct");
encodec_zero_tensor(gf.get(), "enc_l0_ht");
encodec_zero_tensor(gf.get(), "enc_l1_ht");
encodec_zero_tensor(gf.get(), "enc_l0_ct");
encodec_zero_tensor(gf.get(), "enc_l1_ct");

encodec_zero_tensor(gf, "dec_l0_ht");
encodec_zero_tensor(gf, "dec_l1_ht");
encodec_zero_tensor(gf, "dec_l0_ct");
encodec_zero_tensor(gf, "dec_l1_ct");
encodec_zero_tensor(gf.get(), "dec_l0_ht");
encodec_zero_tensor(gf.get(), "dec_l1_ht");
encodec_zero_tensor(gf.get(), "dec_l0_ct");
encodec_zero_tensor(gf.get(), "dec_l1_ct");

encodec_zero_tensor(gf, "quantized_out");
encodec_zero_tensor(gf.get(), "quantized_out");

// run the computation
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(model.backend)) {
ggml_backend_metal_set_n_cb(model.backend, n_threads);
}
#endif
ggml_backend_graph_compute(model.backend, gf);

ggml_backend_graph_compute(model.backend, gf.get());

return true;
}
Expand All @@ -742,10 +794,10 @@ bool encodec_eval(struct encodec_context *ectx, const float *raw_audio,
ectx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ectx->model.backend));

// create the graph for memory usage estimation
struct ggml_cgraph *gf = encodec_build_graph(ectx, raw_audio, n_samples, mode);
encodec_build_graph(ectx, raw_audio, n_samples, mode);

// pre-allocate the compute buffer
ggml_gallocr_reserve(ectx->allocr, gf);
ggml_gallocr_reserve(ectx->allocr, ectx->gf.get());
size_t mem_size = ggml_gallocr_get_buffer_size(ectx->allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size / 1024.0 / 1024.0);
}
Expand All @@ -772,10 +824,10 @@ bool encodec_eval(struct encodec_context *ectx, const int32_t *codes,
ectx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ectx->model.backend));

// create the graph for memory usage estimation
struct ggml_cgraph *gf = encodec_build_graph(ectx, codes, n_codes, mode);
encodec_build_graph(ectx, codes, n_codes, mode);

// pre-allocate the compute buffer
ggml_gallocr_reserve(ectx->allocr, gf);
ggml_gallocr_reserve(ectx->allocr, ectx->gf.get());
size_t mem_size = ggml_gallocr_get_buffer_size(ectx->allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size / 1024.0 / 1024.0);
}
Expand Down
2 changes: 1 addition & 1 deletion lstm.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ struct ggml_tensor *forward_pass_lstm_unilayer(struct ggml_context *ctx0,

struct ggml_tensor *current = ggml_cont(ctx0, ggml_transpose(ctx0, inp));

for (int t = 0; t < 2; t++) {
for (int t = 0; t < seq_length; t++) {
struct ggml_tensor *x_t = ggml_view_1d(ctx0, current, input_dim, t * current->nb[1]);

struct ggml_tensor *inp_gates = ggml_mul_mat(ctx0, weight_ih, x_t);
Expand Down

0 comments on commit 13d6ed1

Please sign in to comment.