From 13d6ed1b38eb671f66cb600d7bf4cffb4111c3c1 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sun, 20 Oct 2024 00:43:27 +0200 Subject: [PATCH] working lstm --- .gitignore | 4 +- CMakeLists.txt | 3 + encodec.cpp | 178 ++++++++++++++++++++++++++++++++----------------- lstm.h | 2 +- 4 files changed, 122 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 2738ef2..9bd2f1d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ encodec *.th .vscode/ -build/ \ No newline at end of file +build/ + +*.wav diff --git a/CMakeLists.txt b/CMakeLists.txt index 693921c..df1ab72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,9 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(ENCODEC_STANDALONE ON) diff --git a/encodec.cpp b/encodec.cpp index e770f12..3b8504f 100644 --- a/encodec.cpp +++ b/encodec.cpp @@ -1,6 +1,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" +#include "ggml/src/ggml-impl.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" @@ -32,6 +33,7 @@ #include "quantizer.h" #define ENCODEC_FILE_MAGIC 'ggml' +#define ENCODEC_MAX_NODES 80000 typedef enum { // Run the end-to-end encoder-decoder pipeline @@ -96,9 +98,28 @@ struct encodec_model { std::map tensors; }; +struct encodec_ggml_cgraph_deleter { + void operator()(struct ggml_cgraph * cgraph) { + if (cgraph->nodes) + free(cgraph->nodes); + if (cgraph->leafs) + free(cgraph->leafs); + if (cgraph->visited_hash_set.keys) + free(cgraph->visited_hash_set.keys); + if (cgraph->grads) + free(cgraph->grads); + free(cgraph); + } +}; + struct encodec_context { encodec_model model; + // computational graph stored on the heap to avoid stack overflows + // the computational graph grows with the sequence length (because of the LSTM) + // which requires a lot of nodes + std::unique_ptr gf; + // buffer for model evaluation ggml_backend_buffer_t buf_compute; @@ -201,7 +222,6 @@ bool encodec_load_model_weights(std::ifstream &infile, encodec_model &model, int #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { fprintf(stderr, "%s: using Metal backend\n", __func__); - ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); model.backend = ggml_backend_metal_init(); if (!model.backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); @@ -473,7 +493,7 @@ bool encodec_load_model_weights(std::ifstream &infile, encodec_model &model, int model.n_loaded++; } - printf("%s: model size = %8.2f MB\n", __func__, total_size / 1024.0 / 1024.0); + printf("%s: model size = %.2f MB\n", __func__, total_size / 1024.0 / 1024.0); } infile.close(); @@ -481,16 +501,57 @@ bool encodec_load_model_weights(std::ifstream &infile, encodec_model &model, int return true; } -struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, - const float * inp_audio, - const int n_samples, - const encodec_run_mode_t mode) { +static struct ggml_cgraph * encodec_ggml_cgraph_create(size_t size) { + struct ggml_cgraph * cgraph = (struct ggml_cgraph *)calloc(1, sizeof(struct ggml_cgraph)); + cgraph->size = size; + cgraph->n_nodes = 0; + cgraph->n_leafs = 0; + cgraph->nodes = (struct ggml_tensor **)calloc(1, size * sizeof(struct ggml_tensor *)); + cgraph->leafs = (struct ggml_tensor **)calloc(1, size * sizeof(struct ggml_tensor *)); + + // next primes after powers of two + static const size_t primes[] = { + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659 + }; + static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); + + // find the smallest prime that is larger or equal to size + size_t l = 0; + size_t r = n_primes; + while (l < r) { + size_t m = (l + r)/2; + if (primes[m] < size * 2) { + l = m + 1; + } else { + r = m; + } + } + size_t hash_size = l < n_primes ? primes[l] : (size * 2 + 1); + + cgraph->visited_hash_set.size = hash_size; + cgraph->visited_hash_set.keys = (struct ggml_tensor **)calloc(1, hash_size * sizeof(struct ggml_tensor *)); + cgraph->visited_hash_set.used = (ggml_bitset_t *)calloc(1, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t)); + cgraph->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT; + + return cgraph; +} + +void encodec_build_graph(struct encodec_context *ectx, + const float * inp_audio, + const int n_samples, + const encodec_run_mode_t mode) { assert(mode == encodec_run_mode_t::FULL || mode == encodec_run_mode_t::ENCODE); const auto & model = ectx->model; const auto & hparams = model.hparams; const auto & allocr = ectx->allocr; + auto & gf = ectx->gf; + const int *ratios = hparams.ratios; const int kernel_size = hparams.kernel_size; const int res_kernel_sz = hparams.residual_kernel_size; @@ -504,7 +565,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, // since we are using ggml-alloc, this buffer only needs enough space to hold the // ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static size_t buf_size = ggml_tensor_overhead() * ENCODEC_MAX_NODES + ggml_graph_overhead(); static std::vector buf(buf_size); struct ggml_init_params ggml_params = { @@ -515,7 +576,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, struct ggml_context *ctx0 = ggml_init(ggml_params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); + gf = std::unique_ptr(encodec_ggml_cgraph_create(ENCODEC_MAX_NODES)); struct ggml_tensor *inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_samples); ggml_set_name(inp, "inp"); @@ -541,19 +602,18 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, case encodec_run_mode_t::FULL: { ggml_set_name(decoded, "decoded"); ggml_set_output(decoded); - ggml_build_forward_expand(gf, decoded); + ggml_build_forward_expand(gf.get(), decoded); } break; case encodec_run_mode_t::ENCODE: { ggml_set_name(codes, "codes"); ggml_set_output(codes); - ggml_build_forward_expand(gf, codes); + ggml_build_forward_expand(gf.get(), codes); } break; case encodec_run_mode_t::DECODE: { - return NULL; + assert(false); } break; default: { fprintf(stderr, "%s: unknown run mode\n", __func__); - return NULL; } break; } @@ -562,18 +622,18 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, ectx->encoded = encoded; ectx->codes = codes; ectx->decoded = decoded; - - return gf; } -struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int32_t *codes, - const int n_codes, const encodec_run_mode_t mode) { +void encodec_build_graph(struct encodec_context *ectx, const int32_t *codes, + const int n_codes, const encodec_run_mode_t mode) { assert(mode == encodec_run_mode_t::DECODE); const auto & model = ectx->model; const auto & hparams = model.hparams; const auto & allocr = ectx->allocr; + auto & gf = ectx->gf; + const int n_bins = hparams.n_bins; const int sr = hparams.sr; const int bandwidth = hparams.bandwidth; @@ -589,12 +649,12 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3 if (n_codes % n_q != 0) { fprintf(stderr, "%s: invalid number of codes\n", __func__); - return NULL; + assert(false); } const int N = n_codes / n_q; - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static size_t buf_size = ggml_tensor_overhead() * ENCODEC_MAX_NODES + ggml_graph_overhead(); static std::vector buf(buf_size); struct ggml_init_params ggml_params = { @@ -605,7 +665,7 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3 struct ggml_context *ctx0 = ggml_init(ggml_params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); + gf = std::unique_ptr(encodec_ggml_cgraph_create(ENCODEC_MAX_NODES)); struct ggml_tensor *inp_codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_q); ggml_set_name(inp_codes, "inp_codes"); @@ -626,11 +686,11 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3 case encodec_run_mode_t::DECODE: { ggml_set_name(decoded, "decoded"); ggml_set_output(decoded); - ggml_build_forward_expand(gf, decoded); + ggml_build_forward_expand(gf.get(), decoded); } break; default: { fprintf(stderr, "%s: unknown run mode\n", __func__); - return NULL; + assert(false); } break; } @@ -638,8 +698,6 @@ struct ggml_cgraph *encodec_build_graph(struct encodec_context *ectx, const int3 ectx->codes = inp_codes; ectx->decoded = decoded; - - return gf; } static void encodec_zero_tensor(struct ggml_cgraph *gf, const char *name) { @@ -652,39 +710,36 @@ bool encodec_eval_internal(struct encodec_context *ectx, const float * raw_audio const encodec_run_mode_t mode) { auto & model = ectx->model; auto & allocr = ectx->allocr; + auto & gf = ectx->gf; - struct ggml_cgraph *gf = encodec_build_graph(ectx, raw_audio, n_samples, mode); + encodec_build_graph(ectx, raw_audio, n_samples, mode); // allocate the graph tensors - ggml_gallocr_alloc_graph(allocr, gf); + ggml_gallocr_alloc_graph(allocr, gf.get()); // set the graph inputs - struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp"); + struct ggml_tensor * inp = ggml_graph_get_tensor(gf.get(), "inp"); ggml_backend_tensor_set(inp, raw_audio, 0, n_samples * ggml_element_size(inp)); // make sure accumulation tensor are zeroed - encodec_zero_tensor(gf, "enc_l0_ht"); - encodec_zero_tensor(gf, "enc_l1_ht"); - encodec_zero_tensor(gf, "enc_l0_ct"); - encodec_zero_tensor(gf, "enc_l1_ct"); + encodec_zero_tensor(gf.get(), "enc_l0_ht"); + encodec_zero_tensor(gf.get(), "enc_l1_ht"); + encodec_zero_tensor(gf.get(), "enc_l0_ct"); + encodec_zero_tensor(gf.get(), "enc_l1_ct"); - encodec_zero_tensor(gf, "dec_l0_ht"); - encodec_zero_tensor(gf, "dec_l1_ht"); - encodec_zero_tensor(gf, "dec_l0_ct"); - encodec_zero_tensor(gf, "dec_l1_ct"); + encodec_zero_tensor(gf.get(), "dec_l0_ht"); + encodec_zero_tensor(gf.get(), "dec_l1_ht"); + encodec_zero_tensor(gf.get(), "dec_l0_ct"); + encodec_zero_tensor(gf.get(), "dec_l1_ct"); - encodec_zero_tensor(gf, "quantized_out"); + encodec_zero_tensor(gf.get(), "quantized_out"); // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); - } -#endif - ggml_backend_graph_compute(model.backend, gf); + + ggml_backend_graph_compute(model.backend, gf.get()); return true; } @@ -694,39 +749,36 @@ bool encodec_eval_internal(struct encodec_context *ectx, const int32_t *codes, const encodec_run_mode_t mode) { auto & model = ectx->model; auto & allocr = ectx->allocr; + auto & gf = ectx->gf; - struct ggml_cgraph *gf = encodec_build_graph(ectx, codes, n_codes, mode); + encodec_build_graph(ectx, codes, n_codes, mode); // allocate the graph tensors - ggml_gallocr_alloc_graph(allocr, gf); + ggml_gallocr_alloc_graph(allocr, gf.get()); // set the graph inputs - struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp_codes"); + struct ggml_tensor * inp = ggml_graph_get_tensor(gf.get(), "inp_codes"); ggml_backend_tensor_set(inp, codes, 0, n_codes * ggml_element_size(inp)); // make sure accumulation tensor are zeroed - encodec_zero_tensor(gf, "enc_l0_ht"); - encodec_zero_tensor(gf, "enc_l1_ht"); - encodec_zero_tensor(gf, "enc_l0_ct"); - encodec_zero_tensor(gf, "enc_l1_ct"); + encodec_zero_tensor(gf.get(), "enc_l0_ht"); + encodec_zero_tensor(gf.get(), "enc_l1_ht"); + encodec_zero_tensor(gf.get(), "enc_l0_ct"); + encodec_zero_tensor(gf.get(), "enc_l1_ct"); - encodec_zero_tensor(gf, "dec_l0_ht"); - encodec_zero_tensor(gf, "dec_l1_ht"); - encodec_zero_tensor(gf, "dec_l0_ct"); - encodec_zero_tensor(gf, "dec_l1_ct"); + encodec_zero_tensor(gf.get(), "dec_l0_ht"); + encodec_zero_tensor(gf.get(), "dec_l1_ht"); + encodec_zero_tensor(gf.get(), "dec_l0_ct"); + encodec_zero_tensor(gf.get(), "dec_l1_ct"); - encodec_zero_tensor(gf, "quantized_out"); + encodec_zero_tensor(gf.get(), "quantized_out"); // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); - } -#endif - ggml_backend_graph_compute(model.backend, gf); + + ggml_backend_graph_compute(model.backend, gf.get()); return true; } @@ -742,10 +794,10 @@ bool encodec_eval(struct encodec_context *ectx, const float *raw_audio, ectx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ectx->model.backend)); // create the graph for memory usage estimation - struct ggml_cgraph *gf = encodec_build_graph(ectx, raw_audio, n_samples, mode); + encodec_build_graph(ectx, raw_audio, n_samples, mode); // pre-allocate the compute buffer - ggml_gallocr_reserve(ectx->allocr, gf); + ggml_gallocr_reserve(ectx->allocr, ectx->gf.get()); size_t mem_size = ggml_gallocr_get_buffer_size(ectx->allocr, 0); fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size / 1024.0 / 1024.0); } @@ -772,10 +824,10 @@ bool encodec_eval(struct encodec_context *ectx, const int32_t *codes, ectx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ectx->model.backend)); // create the graph for memory usage estimation - struct ggml_cgraph *gf = encodec_build_graph(ectx, codes, n_codes, mode); + encodec_build_graph(ectx, codes, n_codes, mode); // pre-allocate the compute buffer - ggml_gallocr_reserve(ectx->allocr, gf); + ggml_gallocr_reserve(ectx->allocr, ectx->gf.get()); size_t mem_size = ggml_gallocr_get_buffer_size(ectx->allocr, 0); fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size / 1024.0 / 1024.0); } diff --git a/lstm.h b/lstm.h index 1de84cb..ac23ed1 100644 --- a/lstm.h +++ b/lstm.h @@ -49,7 +49,7 @@ struct ggml_tensor *forward_pass_lstm_unilayer(struct ggml_context *ctx0, struct ggml_tensor *current = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - for (int t = 0; t < 2; t++) { + for (int t = 0; t < seq_length; t++) { struct ggml_tensor *x_t = ggml_view_1d(ctx0, current, input_dim, t * current->nb[1]); struct ggml_tensor *inp_gates = ggml_mul_mat(ctx0, weight_ih, x_t);