From 8da72442a65fe2a3dd81642a4b0c772bf51185c0 Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Sun, 1 Oct 2023 23:05:55 -0500 Subject: [PATCH] feat: sync llama.cpp (#22) * feat: sync llama.cpp * feat: update rn-llama.hpp * fix: build of API changes * feat: sync * feat: add lora_scaled param * fix(android): lora params --- .../main/java/com/rnllama/LlamaContext.java | 3 + android/src/main/jni.cpp | 11 +- cpp/build-info.h | 4 +- cpp/common.cpp | 203 +- cpp/common.h | 31 +- cpp/ggml-alloc.c | 10 +- cpp/ggml-alloc.h | 1 + cpp/ggml-metal.h | 4 + cpp/ggml-metal.m | 247 +- cpp/ggml-metal.metal | 159 +- cpp/ggml.c | 2438 +++++++++++------ cpp/ggml.h | 151 +- cpp/llama.cpp | 1669 ++++++----- cpp/llama.h | 423 ++- cpp/log.h | 74 +- cpp/rn-llama.hpp | 43 +- docs/API/README.md | 14 +- docs/API/classes/LlamaContext.md | 20 +- docs/API/classes/SchemaGrammarConverter.md | 12 +- example/ios/Podfile.lock | 4 +- example/src/App.tsx | 2 +- ios/RNLlamaContext.mm | 6 +- llama.cpp | 2 +- scripts/llama.cpp.patch | 8 +- src/NativeRNLlama.ts | 1 + 25 files changed, 3640 insertions(+), 1900 deletions(-) diff --git a/android/src/main/java/com/rnllama/LlamaContext.java b/android/src/main/java/com/rnllama/LlamaContext.java index 8448b0f3..6725438e 100644 --- a/android/src/main/java/com/rnllama/LlamaContext.java +++ b/android/src/main/java/com/rnllama/LlamaContext.java @@ -56,6 +56,8 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true, // String lora, params.hasKey("lora") ? params.getString("lora") : "", + // float lora_scaled, + params.hasKey("lora_scaled") ? (float) params.getDouble("lora_scaled") : 1.0f, // String lora_base, params.hasKey("lora_base") ? params.getString("lora_base") : "", // float rope_freq_base, @@ -221,6 +223,7 @@ protected static native long initContext( boolean use_mmap, boolean memory_f16, String lora, + float lora_scaled, String lora_base, float rope_freq_base, float rope_freq_scale diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index 01597474..5bf51bbd 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -131,6 +131,7 @@ Java_com_rnllama_LlamaContext_initContext( jboolean use_mmap, jboolean memory_f16, jstring lora_str, + jfloat lora_scaled, jstring lora_base_str, jfloat rope_freq_base, jfloat rope_freq_scale @@ -160,10 +161,12 @@ Java_com_rnllama_LlamaContext_initContext( defaultParams.memory_f16 = memory_f16; const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr); - defaultParams.lora_adapter = lora_chars; - const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr); - defaultParams.lora_base = lora_base_chars; + if (!lora_chars) { + defaultParams.lora_adapter.push_back({lora_chars, lora_scaled}); + defaultParams.lora_base = lora_base_chars; + defaultParams.use_mmap = false; + } defaultParams.rope_freq_base = rope_freq_base; defaultParams.rope_freq_scale = rope_freq_scale; @@ -281,7 +284,7 @@ Java_com_rnllama_LlamaContext_doCompletion( llama->params.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY; } - const int n_vocab = llama_n_vocab(llama->ctx); + const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx)); jsize logit_bias_len = env->GetArrayLength(logit_bias); for (jsize i = 0; i < logit_bias_len; i++) { diff --git a/cpp/build-info.h b/cpp/build-info.h index fb859464..6cb80787 100644 --- a/cpp/build-info.h +++ b/cpp/build-info.h @@ -1,8 +1,8 @@ #ifndef BUILD_INFO_H #define BUILD_INFO_H -#define BUILD_NUMBER 1255 
-#define BUILD_COMMIT "7ddf185" +#define BUILD_NUMBER 1299 +#define BUILD_COMMIT "f5ef5cf" #define BUILD_COMPILER "" #define BUILD_TARGET "unknown" diff --git a/cpp/common.cpp b/cpp/common.cpp index a5f020ee..47d1a343 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -78,7 +78,7 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -static void process_escapes(std::string& input) { +void process_escapes(std::string& input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -129,6 +129,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } + } else if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -317,6 +326,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_chunks = std::stoi(argv[i]); + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_sequences = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -340,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter = argv[i]; + params.lora_adapter.push_back({argv[i], 1.0f}); + params.use_mmap = false; + } else if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -360,6 +393,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.multiline_input = true; } else if (arg == "--simple-io") { params.simple_io = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "--mlock") { @@ -425,19 +460,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mul_mat_q = false; #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); -#endif // LM_GGML_USE_CUBLAS - } else if (arg == "--low-vram" || arg == "-lv") { -#ifdef LM_GGML_USE_CUBLAS - params.low_vram = true; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
It is not possible to set lower vram usage.\n"); #endif // LM_GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { params.numa = true; - } else if (arg == "--export") { - params.export_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -456,8 +483,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--perplexity") { - params.perplexity = true; + } else if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; } else if (arg == "--ppl-stride") { if (++i >= argc) { invalid_param = true; @@ -606,7 +633,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -621,7 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); @@ -647,20 +676,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. 
(default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); - printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); + printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); - printf(" --perplexity compute perplexity over each ctx window of the prompt\n"); + printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); + printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -678,17 +710,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ts SPLIT --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); #ifdef LM_GGML_USE_CUBLAS printf(" -nommq, --no-mul-mat-q\n"); printf(" use " LM_GGML_CUBLAS_NAME " instead of custom mul_mat_q " LM_GGML_CUDA_NAME " kernels.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // LM_GGML_USE_CUBLAS #endif - printf(" --export export the computation graph to 'llama.ggml'\n"); printf(" --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); @@ -699,6 +730,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf("\n"); } +std::string get_system_info(const gpt_params & params) { + std::ostringstream os; + + os << "system_info: n_threads = " << params.n_threads; + if (params.n_threads_batch != -1) { + os << " (n_threads_batch = " << params.n_threads_batch << ")"; + } + os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); + + return os.str(); +} + std::string gpt_random_prompt(std::mt19937 & rng) { const int r = rng() % 10; switch (r) { @@ -712,60 +755,74 @@ std::string gpt_random_prompt(std::mt19937 & rng) { case 7: return "He"; case 8: return "She"; case 9: return "They"; - default: return "To"; } - return "The"; + LM_GGML_UNREACHABLE(); } // // Model utils // -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { - auto lparams = llama_context_default_params(); +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { + auto mparams = llama_model_default_params(); - lparams.n_ctx = params.n_ctx; - lparams.n_batch = params.n_batch; if (params.n_gpu_layers != -1) { - lparams.n_gpu_layers = params.n_gpu_layers; + mparams.n_gpu_layers = params.n_gpu_layers; } - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - - return lparams; + mparams.main_gpu = params.main_gpu; + mparams.tensor_split = params.tensor_split; + mparams.use_mmap = params.use_mmap; + mparams.use_mlock = params.use_mlock; + + return mparams; +} + +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { + auto cparams = llama_context_default_params(); + + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + + return cparams; } std::tuple llama_init_from_gpt_params(gpt_params & params) { - auto lparams = llama_context_params_from_gpt_params(params); + auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); } - llama_context * lctx = llama_new_context_with_model(model, lparams); + auto cparams = llama_context_params_from_gpt_params(params); + + llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - if (!params.lora_adapter.empty()) { + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, - params.lora_adapter.c_str(), - params.lora_base.empty() ? NULL : params.lora_base.c_str(), + lora_adapter.c_str(), + lora_scale, + ((i > 0) || params.lora_base.empty()) + ? NULL + : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); @@ -782,8 +839,9 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); - const std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; - llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads); + std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); } @@ -795,16 +853,23 @@ std::tuple llama_init_from_gpt_par // std::vector llama_tokenize( - struct llama_context * ctx, + const struct llama_context * ctx, + const std::string & text, + bool add_bos) { + return llama_tokenize(llama_get_model(ctx), text, add_bos); +} + +std::vector llama_tokenize( + const struct llama_model * model, const std::string & text, bool add_bos) { // upper limit for the number of tokens int n_tokens = text.length() + add_bos; std::vector result(n_tokens); - n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); LM_GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -814,10 +879,10 @@ std::vector llama_tokenize( std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { 
std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); LM_GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -872,7 +937,7 @@ llama_token llama_sample_token( std::vector & candidates, int idx) { const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; @@ -890,7 +955,7 @@ llama_token llama_sample_token( llama_token id = 0; - float * logits = llama_get_logits(ctx) + idx * n_vocab; + float * logits = llama_get_logits_ith(ctx, idx); // Apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { @@ -941,11 +1006,11 @@ llama_token llama_sample_token( if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling @@ -953,7 +1018,7 @@ llama_token llama_sample_token( llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); llama_sample_typical (ctx, &cur_p, typical_p, 1); llama_sample_top_p (ctx, &cur_p, top_p, 1); - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); { const int n_top = 10; @@ -1158,7 +1223,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l #endif // NDEBUG fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx)); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); #ifdef __OPTIMIZE__ fprintf(stream, "optimize: true\n"); @@ -1182,7 +1247,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. 
Can still be specified for input.\n"); fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); @@ -1211,9 +1275,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, " %d: %f", lb.first, lb.second); } - fprintf(stream, "lora: %s\n", params.lora_adapter.c_str()); + fprintf(stream, "lora:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) != 1.0f) { + continue; + } + fprintf(stream, " - %s\n", std::get<0>(la).c_str()); + } + fprintf(stream, "lora_scaled:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) == 1.0f) { + continue; + } + fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); + } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); - fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); @@ -1256,6 +1332,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", params.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); diff --git a/cpp/common.h b/cpp/common.h index f9dfd4a2..0e2d3fa6 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -3,7 +3,6 @@ #pragma once #include "llama.h" -#include "build-info.h" #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" @@ -37,20 +36,23 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_beams = 0; // if non-zero then use beam search of given width. 
- float rope_freq_base = 10000.0f; // RoPE base frequency - float rope_freq_scale = 1.0f; // RoPE frequency scaling factor + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size @@ -84,8 +86,8 @@ struct gpt_params { std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files - std::string lora_adapter = ""; // lora adapter path - std::string lora_base = ""; // base model path for the lora adapter + std::vector> lora_adapter; // lora adapter path with user defined scale + std::string lora_base = ""; // base model path for the lora adapter int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -94,7 +96,6 @@ struct gpt_params { bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool low_vram = false; // if true, reduce VRAM usage at the cost of performance bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided @@ -108,16 +109,16 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles + bool cont_batching = false; // insert new sequences for decoding on-the-fly bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens bool instruct = false; // instruction mode (used for Alpaca models) bool penalize_nl = true; // consider newlines as a repeatable token - bool perplexity = false; // compute perplexity over the prompt + bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems - bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation }; @@ -125,13 +126,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +std::string get_system_info(const gpt_params & params); + std::string gpt_random_prompt(std::mt19937 & rng); +void process_escapes(std::string& input); + // // Model utils // std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); // @@ -141,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( - struct llama_context * ctx, + const struct llama_context * ctx, + const std::string & text, + bool add_bos); + +std::vector llama_tokenize( + const struct 
llama_model * model, const std::string & text, bool add_bos); @@ -182,7 +193,7 @@ std::string llama_detokenize_bpe( // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL // - grammar: grammar to use for sampling, ignore if NULL // - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits(ctx) + idx * n_vocab +// - idx: sample from llama_get_logits_ith(ctx, idx) // // returns: // - token: sampled token diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 470afb7d..95e5a431 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -77,7 +77,7 @@ struct free_block { size_t size; }; -#define MAX_FREE_BLOCKS 128 +#define MAX_FREE_BLOCKS 256 struct lm_ggml_allocr { void * data; @@ -187,6 +187,7 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * } tensor->data = addr; + AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); #ifdef LM_GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); @@ -218,7 +219,8 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ size_t size = lm_ggml_allocr_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); #ifdef LM_GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); @@ -631,3 +633,7 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n( size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) { return lm_ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); } + +size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) { + return alloc->max_size; +} diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h index e79c26b7..93f6ccec 100644 --- a/cpp/ggml-alloc.h +++ b/cpp/ggml-alloc.h @@ -19,6 +19,7 @@ LM_GGML_API bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc); LM_GGML_API void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc); LM_GGML_API void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor); LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph); +LM_GGML_API size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc); #ifdef __cplusplus diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index f6a1676d..f94efc53 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -19,6 +19,8 @@ #pragma once +#include "ggml.h" + #include #include @@ -33,6 +35,8 @@ struct lm_ggml_cgraph; extern "C" { #endif +void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data); + struct lm_ggml_metal_context; // number of command buffers to use diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index 91ecf969..1a504a52 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -11,11 +11,14 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -// TODO: temporary - reuse llama.cpp logging #ifdef LM_GGML_METAL_NDEBUG -#define metal_printf(...) +#define LM_GGML_METAL_LOG_INFO(...) +#define LM_GGML_METAL_LOG_WARN(...) +#define LM_GGML_METAL_LOG_ERROR(...) 
#else -#define metal_printf(...) fprintf(stderr, __VA_ARGS__) +#define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -100,7 +103,8 @@ LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_DECL_KERNEL(rope); + LM_GGML_METAL_DECL_KERNEL(rope_f32); + LM_GGML_METAL_DECL_KERNEL(rope_f16); LM_GGML_METAL_DECL_KERNEL(alibi_f32); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DECL_KERNEL(cpy_f32_f32); @@ -120,8 +124,37 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end +lm_ggml_log_callback lm_ggml_metal_log_callback = NULL; +void * lm_ggml_metal_log_user_data = NULL; + +void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data) { + lm_ggml_metal_log_callback = log_callback; + lm_ggml_metal_log_user_data = user_data; +} + +static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, ...){ + if (lm_ggml_metal_log_callback != NULL) { + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + lm_ggml_metal_log_callback(level, buffer, lm_ggml_metal_log_user_data); + } else { + char* buffer2 = malloc(len+1); + vsnprintf(buffer2, len+1, format, args); + buffer2[len] = 0; + lm_ggml_metal_log_callback(level, buffer2, lm_ggml_metal_log_user_data); + free(buffer2); + } + va_end(args); + } +} + + + struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { - metal_printf("%s: allocating\n", __func__); + LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; NSString * s; @@ -131,14 +164,14 @@ @implementation GGMLMetalClass NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + LM_GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context)); @@ -165,7 +198,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -179,11 +212,11 @@ @implementation GGMLMetalClass //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); + LM_GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LM_GGML_METAL_LOG_ERROR("%s: 
error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -195,7 +228,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + LM_GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -207,11 +240,11 @@ @implementation GGMLMetalClass #define LM_GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + LM_GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -261,7 +294,8 @@ @implementation GGMLMetalClass LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_ADD_KERNEL(rope); + LM_GGML_METAL_ADD_KERNEL(rope_f32); + LM_GGML_METAL_ADD_KERNEL(rope_f16); LM_GGML_METAL_ADD_KERNEL(alibi_f32); LM_GGML_METAL_ADD_KERNEL(cpy_f32_f16); LM_GGML_METAL_ADD_KERNEL(cpy_f32_f32); @@ -270,13 +304,13 @@ @implementation GGMLMetalClass #undef LM_GGML_METAL_ADD_KERNEL } - metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); #if TARGET_OS_OSX - metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); + LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -284,7 +318,7 @@ @implementation GGMLMetalClass } void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { - metal_printf("%s: deallocating\n", __func__); + LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); #define LM_GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -335,7 +369,8 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); LM_GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_DEL_KERNEL(rope); + LM_GGML_METAL_DEL_KERNEL(rope_f32); + LM_GGML_METAL_DEL_KERNEL(rope_f16); LM_GGML_METAL_DEL_KERNEL(alibi_f32); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f16); LM_GGML_METAL_DEL_KERNEL(cpy_f32_f32); @@ -360,7 +395,7 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - metal_printf("%s: error: posix_memalign failed\n", __func__); + LM_GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -388,7 +423,7 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id lm_ggml_metal_get_buffer(struct lm_ggml_metal_context * ctx, struct lm_ggml_tensor * t, size_t * offs) { - //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //LM_GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = lm_ggml_nbytes(t); @@ -400,13 +435,13 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //LM_GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - metal_printf("%s: error: buffer is nil\n", __func__); + LM_GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); return nil; } @@ -418,7 +453,7 @@ bool lm_ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= LM_GGML_METAL_MAX_BUFFERS) { - metal_printf("%s: too many buffers\n", __func__); + LM_GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); return false; } @@ -428,7 +463,7 @@ bool lm_ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - metal_printf("%s: error: buffer '%s' 
overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + LM_GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -449,11 +484,11 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -473,13 +508,13 @@ bool lm_ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - metal_printf("\n"); + LM_GGML_METAL_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -487,17 +522,17 @@ bool lm_ggml_metal_add_buffer( } #if TARGET_OS_OSX - metal_printf(", (%8.2f / %8.2f)", + LM_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); + LM_GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - metal_printf("\n"); + LM_GGML_METAL_LOG_INFO("\n"); } #else - metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + LM_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ -610,7 +645,7 @@ void lm_ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > LM_GGML_MAX_CONCUR) { - metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); + LM_GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -664,7 +699,7 @@ void lm_ggml_metal_graph_compute( continue; } - //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, lm_ggml_op_name(gf->nodes[i]->op)); + //LM_GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, lm_ggml_op_name(gf->nodes[i]->op)); struct lm_ggml_tensor * src0 = gf->nodes[i]->src[0]; struct lm_ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -708,17 +743,17 @@ void lm_ggml_metal_graph_compute( id id_src1 = src1 ? 
lm_ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? lm_ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //metal_printf("%s: op - %s\n", __func__, lm_ggml_op_name(dst->op)); + //LM_GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, lm_ggml_op_name(dst->op)); //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, lm_ggml_type_name(src0t), ne00, ne01, ne02, + // LM_GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, lm_ggml_type_name(src0t), ne00, ne01, ne02, // lm_ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, lm_ggml_type_name(src1t), ne10, ne11, ne12, + // LM_GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, lm_ggml_type_name(src1t), ne10, ne11, ne12, // lm_ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, lm_ggml_type_name(dstt), ne0, ne1, ne2, + // LM_GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, lm_ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -736,25 +771,59 @@ void lm_ggml_metal_graph_compute( LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(src1)); - // utilize float4 - LM_GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; + bool bcast_row = false; - if (lm_ggml_nelements(src1) == ne10) { + int64_t nb = ne00; + + if (lm_ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { // src1 is a row LM_GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; [encoder setComputePipelineState:ctx->pipeline_add_row]; + + bcast_row = true; } else { [encoder setComputePipelineState:ctx->pipeline_add]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = lm_ggml_nelements(dst)/4; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + if (bcast_row) { + const int64_t n = lm_ggml_nelements(dst)/4; + + 
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } break; case LM_GGML_OP_MUL: { @@ -830,13 +899,13 @@ void lm_ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, lm_ggml_op_name(dst->op)); + LM_GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, lm_ggml_op_name(dst->op)); LM_GGML_ASSERT(false); } } break; case LM_GGML_OP_SOFT_MAX: { - const int nth = 32; + const int nth = MIN(32, ne00); if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; @@ -889,7 +958,7 @@ void lm_ggml_metal_graph_compute( src1t == LM_GGML_TYPE_F32 && [ctx->device supportsFamily:MTLGPUFamilyApple7] && ne00%32 == 0 && - ne11 > 1) { + ne11 > 2) { switch (src0->type) { case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; @@ -1019,7 +1088,7 @@ void lm_ggml_metal_graph_compute( } break; default: { - metal_printf("Asserting on type %d\n",(int)src0t); + LM_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); LM_GGML_ASSERT(false && "not implemented"); } }; @@ -1100,7 +1169,7 @@ void lm_ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 512; + const int nth = MIN(512, ne00); [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1119,7 +1188,7 @@ void lm_ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 256; + const int nth = MIN(256, ne00); [encoder setComputePipelineState:ctx->pipeline_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1137,6 +1206,8 @@ void lm_ggml_metal_graph_compute( { LM_GGML_ASSERT((src0t == LM_GGML_TYPE_F32)); + const int nth = MIN(1024, ne00); + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; @@ -1170,12 +1241,14 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; - const int nth = 32; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case LM_GGML_OP_ROPE: { + LM_GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1185,38 +1258,44 @@ void lm_ggml_metal_graph_compute( memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - [encoder setComputePipelineState:ctx->pipeline_rope]; + switch (src0->type) { + case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; + case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + default: LM_GGML_ASSERT(false); + }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) 
atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case LM_GGML_OP_DUP: case LM_GGML_OP_CPY: case LM_GGML_OP_CONT: { - const int nth = 32; + const int nth = MIN(1024, ne00); switch (src0t) { case LM_GGML_TYPE_F32: @@ -1261,7 +1340,7 @@ void lm_ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, lm_ggml_op_name(dst->op)); + LM_GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, lm_ggml_op_name(dst->op)); LM_GGML_ASSERT(false); } } @@ -1286,7 +1365,7 @@ void lm_ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != 
MTLCommandBufferStatusCompleted) { - metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); + LM_GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); LM_GGML_ASSERT(false); } } diff --git a/cpp/ggml-metal.metal b/cpp/ggml-metal.metal index 7f1c3d9e..5e1af6a0 100644 --- a/cpp/ggml-metal.metal +++ b/cpp/ggml-metal.metal @@ -24,12 +24,59 @@ typedef struct { int8_t qs[QK8_0]; // quants } block_q8_0; +// general-purpose kernel for addition of two tensors +// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +// cons: not very efficient kernel void kernel_add( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig]; + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + + src0_ptr += ntg.x*nb00; + src1_ptr += ntg.x*nb10; + dst_ptr += ntg.x*nb0; + } } // assumption: src1 is a row @@ -38,7 +85,7 @@ kernel void kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } @@ -806,30 +853,61 @@ kernel void kernel_alibi_f32( } } +typedef void (rope_t)( + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]); + +template kernel void 
kernel_rope( - device const void * src0, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int & n_past, - constant int & n_dims, - constant int & mode, - constant float & freq_base, - constant float & freq_scale, + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -839,7 +917,9 @@ kernel void kernel_rope( const bool is_neox = mode & 2; - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + device const int32_t * pos = src1; + + const int64_t p = pos[i2]; const float theta_0 = freq_scale * (float)p; const float inv_ndims = -1.f/n_dims; @@ -851,11 +931,11 @@ kernel void kernel_rope( const float cos_theta = cos(theta); const float sin_theta = sin(theta); - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = src[0]; - const float x1 = src[1]; + const T x0 = src[0]; + const T x1 = src[1]; dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[1] = x0*sin_theta + x1*cos_theta; @@ -870,8 +950,8 @@ kernel void kernel_rope( const int64_t i0 = ib*n_dims + ic/2; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -883,6 +963,9 @@ kernel void kernel_rope( } } +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, @@ -1273,8 +1356,8 @@ kernel void kernel_mul_mat_q3_K_f32( float yl[32]; - const uint16_t kmask1 = 0x3030; - const uint16_t kmask2 = 0x0f0f; + //const uint16_t kmask1 = 0x3030; + //const uint16_t kmask2 = 0x0f0f; const int tid = tiisg/4; const int ix = tiisg%4; diff --git a/cpp/ggml.c b/cpp/ggml.c index 
5f4b2d4c..1fa4e055 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -89,7 +89,9 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo static int pthread_join(pthread_t thread, void * unused) { (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); + int ret = (int) WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; } static int sched_yield (void) { @@ -134,6 +136,7 @@ typedef void * thread_ret_t; #define LM_GGML_SOFT_MAX_UNROLL 4 #define LM_GGML_VEC_DOT_UNROLL 2 +#define LM_GGML_VEC_MAD_UNROLL 32 // // logging @@ -242,18 +245,18 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { // #define LM_GGML_TENSOR_UNARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #define LM_GGML_TENSOR_BINARY_OP_LOCALS \ - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ - LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #if defined(LM_GGML_USE_ACCELERATE) #include @@ -1863,7 +1866,7 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { #define LM_GGML_F16x8_ADD vaddq_f16 #define LM_GGML_F16x8_MUL vmulq_f16 #define LM_GGML_F16x8_REDUCE(res, x) \ - { \ + do { \ int offset = LM_GGML_F16_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f16(x[i], x[offset+i]); \ @@ -1879,7 +1882,7 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ res = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } + } while (0) #define LM_GGML_F16_VEC LM_GGML_F16x8 #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x8_ZERO @@ -1940,7 +1943,7 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { #define LM_GGML_F32x8_ADD _mm256_add_ps #define LM_GGML_F32x8_MUL _mm256_mul_ps #define LM_GGML_F32x8_REDUCE(res, x) \ -{ \ +do { \ int offset = LM_GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \ @@ -1957,7 +1960,7 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { _mm256_extractf128_ps(x[0], 1)); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} +} while (0) // TODO: is this optimal ? 
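Editor's note: the REDUCE macros above switch from a bare { ... } block to do { ... } while (0). A minimal standalone sketch of why the do/while form composes safely with if/else; the macro names BAD_REDUCE and GOOD_REDUCE are illustrative only, not from the patch.

    /* Sketch: a multi-statement macro wrapped in do { ... } while (0)
     * expands to a single statement, so a trailing ';' and an 'else'
     * branch keep parsing as intended. */
    #include <stdio.h>

    #define BAD_REDUCE(res, a, b)  { res = (a) + (b); }
    #define GOOD_REDUCE(res, a, b) do { res = (a) + (b); } while (0)

    int main(void) {
        int r = 0;
        if (r == 0)
            GOOD_REDUCE(r, 2, 3);   /* one statement; the 'else' below is still legal */
        else
            r = -1;
        /* With BAD_REDUCE here instead, the expansion plus the trailing ';'
         * would leave an empty statement before 'else' and fail to compile. */
        printf("%d\n", r);
        return 0;
    }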
#define LM_GGML_F32_VEC LM_GGML_F32x8 @@ -3707,6 +3710,58 @@ inline static void lm_ggml_vec_mad_f32(const int n, float * restrict y, const fl #endif } +// xs and vs are byte strides of x and v +inline static void lm_ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[LM_GGML_VEC_MAD_UNROLL]; + const float * restrict v[LM_GGML_VEC_MAD_UNROLL]; + + for (int i = 0; i < LM_GGML_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(LM_GGML_SIMD) + const int np = (n & ~(LM_GGML_F32_STEP - 1)); + + LM_GGML_F32_VEC vx[LM_GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = LM_GGML_F32_VEC_SET1(v[k][0]); + } + + LM_GGML_F32_VEC ax[LM_GGML_VEC_MAD_UNROLL][LM_GGML_F32_ARR]; + LM_GGML_F32_VEC ay[LM_GGML_F32_ARR]; + + for (int i = 0; i < np; i += LM_GGML_F32_STEP) { + for (int j = 0; j < LM_GGML_F32_ARR; j++) { + ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR); + + for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = LM_GGML_F32_VEC_LOAD(x[k] + i + j*LM_GGML_F32_EPR); + ay[j] = LM_GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + LM_GGML_F32_VEC_STORE(y + i + j*LM_GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < LM_GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + //inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void lm_ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(LM_GGML_USE_ACCELERATE) @@ -4392,10 +4447,9 @@ static inline bool lm_ggml_can_mul_mat(const struct lm_ggml_tensor * t0, const s static inline bool lm_ggml_can_out_prod(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) { static_assert(LM_GGML_MAX_DIMS == 4, "LM_GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[1] == t1->ne[1]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) { @@ -5065,43 +5119,78 @@ struct lm_ggml_tensor * lm_ggml_set_f32(struct lm_ggml_tensor * tensor, float va return tensor; } +void lm_ggml_unravel_index(const struct lm_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { + const int64_t ne2 = tensor->ne[2]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + + const int64_t i3_ = (i/(ne2*ne1*ne0)); + const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0); + const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0; + const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0); + + if (i0) { + * i0 = i0_; + } + if (i1) { + * i1 = i1_; + } + if (i2) { + * i2 = i2_; + } + if (i3) { + * i3 = i3_; + } +} + int32_t lm_ggml_get_i32_1d(const struct lm_ggml_tensor * tensor, int i) { + if (!lm_ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + lm_ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return lm_ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case 
LM_GGML_TYPE_I8: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_I16: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_I32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_F16: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]); - } break; + } case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { LM_GGML_ASSERT(false); - } break; + } } return 0.0f; } void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t value) { + if (!lm_ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + lm_ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + lm_ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case LM_GGML_TYPE_I8: { @@ -5135,43 +5224,104 @@ void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t val } } +int32_t lm_ggml_get_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case LM_GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case LM_GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case LM_GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case LM_GGML_TYPE_F16: + return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]); + case LM_GGML_TYPE_F32: + return ((float *) data)[0]; + default: + LM_GGML_ASSERT(false); + } + + return 0.0f; +} + +void lm_ggml_set_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case LM_GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_F16: + { + ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value); + } break; + case LM_GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + float lm_ggml_get_f32_1d(const struct lm_ggml_tensor * tensor, int i) { + if (!lm_ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + lm_ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return lm_ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case LM_GGML_TYPE_I8: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_I16: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_I32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case LM_GGML_TYPE_F16: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t)); return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]); - } break; + } case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { LM_GGML_ASSERT(false); - } break; + } } 
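Editor's note: the contiguity checks added above route non-contiguous 1-d element access through lm_ggml_unravel_index and the new *_nd getters/setters. A standalone sketch of the same index decomposition (plain C, illustration only, not the library code):

    /* Decompose a flat element index i into (i0,i1,i2,i3) for a tensor with
     * extents ne[4]; the *_nd accessors then apply the byte strides nb[4]. */
    #include <stdint.h>
    #include <stdio.h>

    static void unravel(int64_t i, const int64_t ne[4], int64_t idx[4]) {
        idx[3] =  i / (ne[2]*ne[1]*ne[0]);
        idx[2] = (i - idx[3]*ne[2]*ne[1]*ne[0]) / (ne[1]*ne[0]);
        idx[1] = (i - idx[3]*ne[2]*ne[1]*ne[0] - idx[2]*ne[1]*ne[0]) / ne[0];
        idx[0] =  i - idx[3]*ne[2]*ne[1]*ne[0] - idx[2]*ne[1]*ne[0] - idx[1]*ne[0];
    }

    int main(void) {
        const int64_t ne[4] = {4, 3, 2, 1};   /* a 4x3x2x1 tensor */
        int64_t idx[4];
        unravel(10, ne, idx);                 /* element 10 */
        printf("%lld %lld %lld %lld\n", (long long)idx[0], (long long)idx[1],
               (long long)idx[2], (long long)idx[3]);   /* prints: 2 2 0 0 */
        return 0;
    }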
return 0.0f; } void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value) { + if (!lm_ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + lm_ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + lm_ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case LM_GGML_TYPE_I8: { @@ -5205,6 +5355,56 @@ void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value } } +float lm_ggml_get_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case LM_GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case LM_GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case LM_GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case LM_GGML_TYPE_F16: + return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]); + case LM_GGML_TYPE_F32: + return ((float *) data)[0]; + default: + LM_GGML_ASSERT(false); + } + + return 0.0f; +} + +void lm_ggml_set_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case LM_GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case LM_GGML_TYPE_F16: + { + ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value); + } break; + case LM_GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + void * lm_ggml_get_data(const struct lm_ggml_tensor * tensor) { return tensor->data; } @@ -5347,6 +5547,44 @@ struct lm_ggml_tensor * lm_ggml_add_inplace( return lm_ggml_add_impl(ctx, a, b, true); } +// lm_ggml_add_cast + +static struct lm_ggml_tensor * lm_ggml_add_cast_impl( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + enum lm_ggml_type type) { + // TODO: support less-strict constraint + // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); + LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); + LM_GGML_ASSERT(lm_ggml_is_quantized(a->type)); // currently only supported for quantized input + + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: support backward pass for broadcasting + LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b)); + is_node = true; + } + + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, type, a->n_dims, a->ne); + + result->op = LM_GGML_OP_ADD; + result->grad = is_node ? lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct lm_ggml_tensor * lm_ggml_add_cast( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + enum lm_ggml_type type) { + return lm_ggml_add_cast_impl(ctx, a, b, type); +} + // lm_ggml_add1 static struct lm_ggml_tensor * lm_ggml_add1_impl( @@ -5783,7 +6021,6 @@ struct lm_ggml_tensor * lm_ggml_repeat( result->op = LM_GGML_OP_REPEAT; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -5811,7 +6048,6 @@ struct lm_ggml_tensor * lm_ggml_repeat_back( result->op = LM_GGML_OP_REPEAT_BACK; result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6186,8 +6422,9 @@ struct lm_ggml_tensor * lm_ggml_out_prod( is_node = true; } - const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] + const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = LM_GGML_OP_OUT_PROD; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; @@ -6406,6 +6643,54 @@ struct lm_ggml_tensor * lm_ggml_cont_inplace( return lm_ggml_cont_impl(ctx, a, true); } + +// make contiguous, with new shape +LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_1d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0) { + return lm_ggml_cont_4d(ctx, a, ne0, 1, 1, 1); +} + +LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + return lm_ggml_cont_4d(ctx, a, ne0, ne1, 1, 1); +} + +LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_3d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + return lm_ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1); +} + +struct lm_ggml_tensor * lm_ggml_cont_4d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + LM_GGML_ASSERT(lm_ggml_nelements(a) == (ne0*ne1*ne2*ne3)); + + bool is_node = false; + + struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + lm_ggml_format_name(result, "%s (cont)", a->name); + + result->op = LM_GGML_OP_CONT; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + + // lm_ggml_reshape struct lm_ggml_tensor * lm_ggml_reshape( @@ -6413,7 +6698,7 @@ struct lm_ggml_tensor * lm_ggml_reshape( struct lm_ggml_tensor * a, struct lm_ggml_tensor * b) { LM_GGML_ASSERT(lm_ggml_is_contiguous(a)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(b)); + // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. LM_GGML_ASSERT(lm_ggml_nelements(a) == lm_ggml_nelements(b)); bool is_node = false; @@ -6786,7 +7071,6 @@ struct lm_ggml_tensor * lm_ggml_get_rows_back( result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -6968,7 +7252,7 @@ struct lm_ggml_tensor * lm_ggml_soft_max_back_inplace( static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -6977,7 +7261,10 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { - LM_GGML_ASSERT(n_past >= 0); + LM_GGML_ASSERT(lm_ggml_is_vector(b)); + LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32); + LM_GGML_ASSERT(a->ne[2] == b->ne[0]); + bool is_node = false; if (a->grad) { @@ -6986,7 +7273,7 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_tensor * result = inplace ? 
lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -6996,6 +7283,7 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( result->op = LM_GGML_OP_ROPE; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -7003,55 +7291,55 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_tensor * lm_ggml_rope( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); } struct lm_ggml_tensor * lm_ggml_rope_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); } struct lm_ggml_tensor * lm_ggml_rope_custom( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return lm_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); } struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return lm_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); } struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, float base, bool down) { - return lm_ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); } // lm_ggml_rope_back @@ -7059,7 +7347,7 @@ struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -7067,7 +7355,10 @@ struct lm_ggml_tensor * lm_ggml_rope_back( float freq_scale, float xpos_base, bool xpos_down) { - LM_GGML_ASSERT(n_past >= 0); + LM_GGML_ASSERT(lm_ggml_is_vector(b)); + LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32); + LM_GGML_ASSERT(a->ne[2] == b->ne[0]); + LM_GGML_ASSERT((mode & 4) == 0 && "lm_ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; @@ -7078,7 +7369,7 @@ struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); 
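Editor's note: with this change the rope variants take an I32 position tensor (src1, one entry per row along dim 2 of a) instead of an n_past integer. A hedged caller-side sketch of the new pattern; ctx, cur, n_past, N and n_dims are assumed to exist in the surrounding code and the snippet relies on the ggml API as it appears in this patch.

    /* Build an explicit position vector and pass it as src1 to rope. */
    struct lm_ggml_tensor * pos = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, N);
    for (int i = 0; i < N; ++i) {
        lm_ggml_set_i32_1d(pos, i, n_past + i);   /* absolute position of token i */
    }
    // old: lm_ggml_rope(ctx, cur, n_past, n_dims, /*mode*/ 0, /*n_ctx*/ 0);
    struct lm_ggml_tensor * cur_roped = lm_ggml_rope(ctx, cur, pos, n_dims, /*mode*/ 0, /*n_ctx*/ 0);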
memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -7088,6 +7379,7 @@ struct lm_ggml_tensor * lm_ggml_rope_back( result->op = LM_GGML_OP_ROPE_BACK; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -7484,27 +7776,30 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_back( // d shape [D,N,ne2,ne3] // q shape [D,N,ne2,ne3] - // k shape [D,M,ne2,ne3] - // v shape [M,D,ne2,ne3] + // k shape [D,M,kvne2,ne3] + // v shape [M,D,kvne2,ne3] - const int64_t D = q->ne[0]; - const int64_t N = q->ne[1]; - const int64_t M = k->ne[1]; - const int64_t ne2 = q->ne[2]; - const int64_t ne3 = q->ne[3]; + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + const int64_t kvne2 = k->ne[2]; LM_GGML_ASSERT(k->ne[0] == D); LM_GGML_ASSERT(v->ne[0] == M); LM_GGML_ASSERT(v->ne[1] == D); LM_GGML_ASSERT(d->ne[0] == D); LM_GGML_ASSERT(d->ne[1] == N); - LM_GGML_ASSERT(k->ne[2] == ne2); + LM_GGML_ASSERT(k->ne[2] == kvne2); LM_GGML_ASSERT(k->ne[3] == ne3); - LM_GGML_ASSERT(v->ne[2] == ne2); + LM_GGML_ASSERT(v->ne[2] == kvne2); LM_GGML_ASSERT(v->ne[3] == ne3); LM_GGML_ASSERT(d->ne[2] == ne2); LM_GGML_ASSERT(d->ne[3] == ne3); + LM_GGML_ASSERT(ne2 % kvne2 == 0); + bool is_node = false; if (q->grad || k->grad || v->grad) { @@ -7514,14 +7809,23 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_back( } // store gradients of q, k and v as continuous tensors concatenated in result. - // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] - // gradq->data = result->data - // gradk->data = result->data + nb0*D*N*ne2*ne3 - // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 // note: v and gradv are actually transposed, i.e. v->ne[0] != D. - int64_t ne[4] = {D,M+N+M,ne2,ne3}; + const int64_t elem_q = lm_ggml_nelements(q); + const int64_t elem_k = lm_ggml_nelements(k); + const int64_t elem_v = lm_ggml_nelements(v); - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + enum lm_ggml_type result_type = LM_GGML_TYPE_F32; + LM_GGML_ASSERT(lm_ggml_blck_size(result_type) == 1); + const size_t tsize = lm_ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + LM_GGML_PAD(elem_q * tsize, LM_GGML_MEM_ALIGN); + const size_t offs_v = offs_k + LM_GGML_PAD(elem_k * tsize, LM_GGML_MEM_ALIGN); + const size_t end = offs_v + LM_GGML_PAD(elem_v * tsize, LM_GGML_MEM_ALIGN); + + const size_t nelements = (end + tsize - 1)/tsize; + + struct lm_ggml_tensor * result = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nelements); int32_t masked_i = masked ? 
1 : 0; lm_ggml_set_op_params(result, &masked_i, sizeof(masked_i)); @@ -8214,7 +8518,7 @@ static void lm_ggml_compute_forward_dup_f16( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8485,7 +8789,7 @@ static void lm_ggml_compute_forward_dup_f32( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8766,7 +9070,7 @@ static void lm_ggml_compute_forward_add_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -8798,8 +9102,6 @@ static void lm_ggml_compute_forward_add_f32( #else lm_ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif - // } - // } } } else { // src1 is not contiguous @@ -8841,7 +9143,7 @@ static void lm_ggml_compute_forward_add_f16_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); @@ -8895,7 +9197,7 @@ static void lm_ggml_compute_forward_add_f16_f16( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); @@ -8946,14 +9248,15 @@ static void lm_ggml_compute_forward_add_q_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; const enum lm_ggml_type type = src0->type; + const enum lm_ggml_type dtype = dst->type; lm_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - lm_ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + lm_ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float; // we don't support permuted src0 or src1 LM_GGML_ASSERT(nb00 == lm_ggml_type_size(type)); @@ -8965,7 +9268,6 @@ static void lm_ggml_compute_forward_add_q_f32( LM_GGML_ASSERT(nb2 <= nb3); LM_GGML_ASSERT(lm_ggml_is_quantized(src0->type)); - LM_GGML_ASSERT(dst->type == src0->type); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); // rows per thread @@ -9003,7 +9305,11 @@ static void lm_ggml_compute_forward_add_q_f32( // add src1 lm_ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst - quantize_row_q(wdata, dst_row, ne00); + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } } } @@ -9068,7 +9374,7 @@ static void lm_ggml_compute_forward_add1_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9123,7 +9429,7 @@ static void lm_ggml_compute_forward_add1_f16_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); @@ -9173,7 +9479,7 @@ static void lm_ggml_compute_forward_add1_f16_f16( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); @@ -9223,7 +9529,7 @@ 
static void lm_ggml_compute_forward_add1_q_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const enum lm_ggml_type type = src0->type; lm_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; @@ -9351,8 +9657,8 @@ static void lm_ggml_compute_forward_acc_f32( const int nr = lm_ggml_nrows(src1); const int nc = src1->ne[0]; - LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) // src0 and dst as viewed during acc const size_t nb0 = lm_ggml_element_size(src0); @@ -9441,7 +9747,7 @@ static void lm_ggml_compute_forward_sub_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9531,7 +9837,7 @@ static void lm_ggml_compute_forward_mul_f32( const int64_t nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9622,7 +9928,7 @@ static void lm_ggml_compute_forward_div_f32( const int nr = lm_ggml_nrows(src0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS LM_GGML_ASSERT( nb0 == sizeof(float)); LM_GGML_ASSERT(nb00 == sizeof(float)); @@ -9831,8 +10137,8 @@ static void lm_ggml_compute_forward_sum_f32( assert(lm_ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) lm_ggml_float sum = 0; lm_ggml_float row_sum = 0; @@ -9863,8 +10169,8 @@ static void lm_ggml_compute_forward_sum_f16( assert(src0->nb[0] == sizeof(lm_ggml_fp16_t)); - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) float sum = 0; float row_sum = 0; @@ -9917,7 +10223,7 @@ static void lm_ggml_compute_forward_sum_rows_f32( LM_GGML_ASSERT(src0->nb[0] == sizeof(float)); LM_GGML_ASSERT(dst->nb[0] == sizeof(float)); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS LM_GGML_ASSERT(ne0 == 1); LM_GGML_ASSERT(ne1 == ne01); @@ -9967,7 +10273,7 @@ static void lm_ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS assert(ne0 == 1); assert(ne1 == ne01); @@ -10067,7 +10373,7 @@ static void lm_ggml_compute_forward_repeat_f32( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in lm_ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -10099,11 +10405,61 @@ static void lm_ggml_compute_forward_repeat_f32( } } +static void lm_ggml_compute_forward_repeat_f16( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(params->ith == 0); + LM_GGML_ASSERT(lm_ggml_can_repeat(src0, dst)); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + LM_GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in lm_ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const 
int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + LM_GGML_ASSERT(nb0 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + lm_ggml_fp16_t * y = (lm_ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + lm_ggml_fp16_t * x = (lm_ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // lm_ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + static void lm_ggml_compute_forward_repeat( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_repeat_f16(params, src0, dst); + } break; case LM_GGML_TYPE_F32: { lm_ggml_compute_forward_repeat_f32(params, src0, dst); @@ -10128,7 +10484,7 @@ static void lm_ggml_compute_forward_repeat_back_f32( return; } - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in lm_ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -10206,7 +10562,7 @@ static void lm_ggml_compute_forward_concat_f32( const int ith = params->ith; - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS // TODO: support for transposed / permuted tensors LM_GGML_ASSERT(nb0 == sizeof(float)); @@ -10808,7 +11164,7 @@ static void lm_ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -10877,7 +11233,7 @@ static void lm_ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -10942,7 +11298,7 @@ static void lm_ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -11117,7 +11473,7 @@ static void lm_ggml_compute_forward_group_norm_f32( const int ith = params->ith; const int nth = params->nth; - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const float eps = 1e-6f; // TODO: make this a parameter @@ -11228,7 +11584,7 @@ static void lm_ggml_compute_forward_mul_mat( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -11443,10 +11799,10 @@ static void lm_ggml_compute_forward_out_prod_f32( const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - int64_t t0 = lm_ggml_perf_time_us(); - UNUSED(t0); + // int64_t t0 = lm_ggml_perf_time_us(); + // UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -11485,6 +11841,146 @@ static void lm_ggml_compute_forward_out_prod_f32( return; } + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + 
// for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // block-tiling attempt + const int64_t blck_0 = MAX(LM_GGML_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; + + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + +#if LM_GGML_VEC_MAD_UNROLL > 2 + const int64_t bne01_unroll = bne01 - (bne01 % LM_GGML_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += LM_GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + lm_ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + lm_ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#else + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + lm_ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#endif + } + } + } + + + //int64_t t1 = lm_ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void lm_ggml_compute_forward_out_prod_q_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + // int64_t t0 = lm_ggml_perf_time_us(); + // UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum lm_ggml_type type = src0->type; + lm_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + LM_GGML_ASSERT(ne02 == ne12); + LM_GGML_ASSERT(ne03 == ne13); + LM_GGML_ASSERT(ne2 == 
ne12); + LM_GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + LM_GGML_ASSERT(nb00 == lm_ggml_type_size(type)); + + // dst dim0 cannot be transposed or permuted + LM_GGML_ASSERT(nb0 == sizeof(float)); + // LM_GGML_ASSERT(nb0 <= nb1); + // LM_GGML_ASSERT(nb1 <= nb2); + // LM_GGML_ASSERT(nb2 <= nb3); + + LM_GGML_ASSERT(ne0 == ne00); + LM_GGML_ASSERT(ne1 == ne10); + LM_GGML_ASSERT(ne2 == ne02); + LM_GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(LM_GGML_USE_CUBLAS) lm_ggml_cuda_out_prod + // TODO: #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) || defined(LM_GGML_USE_CLBLAST) + + if (params->type == LM_GGML_TASK_INIT) { + lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + // parallelize by last three dimensions // total rows in dst @@ -11504,6 +12000,8 @@ static void lm_ggml_compute_forward_out_prod_f32( // for i0: // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + for (int64_t ir = ir0; ir < ir1; ++ir) { // dst indices const int64_t i3 = ir/(ne2*ne1); @@ -11524,10 +12022,8 @@ static void lm_ggml_compute_forward_out_prod_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - lm_ggml_vec_mad_f32(ne0, d, s0, *s1); - // for (int64_t i0 = 0; i0 < ne0; ++i0) { - // d[i0] += s0[i0] * s1[i1]; - // } + dequantize_row_q(s0, wdata, ne0); + lm_ggml_vec_mad_f32(ne0, d, wdata, *s1); } } @@ -11556,10 +12052,13 @@ static void lm_ggml_compute_forward_out_prod( case LM_GGML_TYPE_Q5_0: case LM_GGML_TYPE_Q5_1: case LM_GGML_TYPE_Q8_0: - case LM_GGML_TYPE_Q8_1: + case LM_GGML_TYPE_Q2_K: + case LM_GGML_TYPE_Q3_K: + case LM_GGML_TYPE_Q4_K: + case LM_GGML_TYPE_Q5_K: + case LM_GGML_TYPE_Q6_K: { - LM_GGML_ASSERT(false); // todo - // lm_ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + lm_ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; case LM_GGML_TYPE_F16: { @@ -11677,8 +12176,8 @@ static void lm_ggml_compute_forward_set_f32( const int nr = lm_ggml_nrows(src1); const int nc = src1->ne[0]; - LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + LM_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) // src0 and dst as viewed during set const size_t nb0 = lm_ggml_element_size(src0); @@ -11947,14 +12446,15 @@ static void lm_ggml_compute_forward_get_rows_back_f32_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * opt0, struct lm_ggml_tensor * dst) { LM_GGML_ASSERT(params->ith == 0); - LM_GGML_ASSERT(lm_ggml_are_same_shape(opt0, dst)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(opt0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); - lm_ggml_compute_forward_dup_same_cont(params, opt0, dst); + // lm_ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == LM_GGML_TASK_INIT) { + memset(dst->data, 0, lm_ggml_nbytes(dst)); + } if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; @@ -11980,11 +12480,8 @@ static void lm_ggml_compute_forward_get_rows_back_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct 
lm_ggml_tensor * src1, - const struct lm_ggml_tensor * opt0, struct lm_ggml_tensor * dst) { LM_GGML_ASSERT(params->ith == 0); - LM_GGML_ASSERT(lm_ggml_are_same_shape(opt0, dst)); - LM_GGML_ASSERT(lm_ggml_is_contiguous(opt0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); // lm_ggml_compute_forward_dup_same_cont(params, opt0, dst); @@ -12018,16 +12515,15 @@ static void lm_ggml_compute_forward_get_rows_back( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - const struct lm_ggml_tensor * opt0, struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + lm_ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + lm_ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); } break; default: { @@ -12068,7 +12564,7 @@ static void lm_ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted matrices - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS LM_GGML_ASSERT(ne00 == ne0); LM_GGML_ASSERT(ne00 == ne1); @@ -12456,13 +12952,11 @@ static void lm_ggml_compute_forward_alibi_f16( return; } - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - assert(n_past >= 0); - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past const int ne2 = src0->ne[2]; // n_head -> this is k @@ -12477,7 +12971,7 @@ static void lm_ggml_compute_forward_alibi_f16( //const int nb3 = src0->nb[3]; LM_GGML_ASSERT(nb0 == sizeof(lm_ggml_fp16_t)); - LM_GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + //LM_GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; LM_GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) @@ -12623,8 +13117,8 @@ static void lm_ggml_compute_forward_clamp( static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -12634,9 +13128,9 @@ static void lm_ggml_compute_forward_rope_f32( // these two only relevant for xPos RoPE: float xpos_base; - bool xpos_down; + bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -12645,9 +13139,7 @@ static void lm_ggml_compute_forward_rope_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12677,9 +13169,11 @@ static void lm_ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 
0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12716,7 +13210,7 @@ static void lm_ggml_compute_forward_rope_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12761,8 +13255,8 @@ static void lm_ggml_compute_forward_rope_f32( static void lm_ggml_compute_forward_rope_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } @@ -12770,16 +13264,14 @@ static void lm_ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - assert(n_past >= 0); - - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12809,9 +13301,11 @@ static void lm_ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12890,15 +13384,16 @@ static void lm_ggml_compute_forward_rope_f16( static void lm_ggml_compute_forward_rope( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_f16(params, src0, dst); + lm_ggml_compute_forward_rope_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_f32(params, src0, dst); + lm_ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; default: { @@ -12912,6 +13407,7 @@ static void lm_ggml_compute_forward_rope( static void lm_ggml_compute_forward_rope_back_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -12929,7 +13425,7 @@ static void lm_ggml_compute_forward_rope_back_f32( float xpos_base; bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); @@ -12938,9 +13434,7 @@ static void lm_ggml_compute_forward_rope_back_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12966,9 +13460,11 @@ static void lm_ggml_compute_forward_rope_back_f32( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12980,7 +13476,7 @@ static void lm_ggml_compute_forward_rope_back_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -13023,6 +13519,7 @@ static void lm_ggml_compute_forward_rope_back_f32( static void lm_ggml_compute_forward_rope_back_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { @@ -13033,13 +13530,11 @@ static void lm_ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - assert(n_past >= 0); - - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13065,9 +13560,11 @@ static void lm_ggml_compute_forward_rope_back_f16( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -13119,15 +13616,16 @@ static void lm_ggml_compute_forward_rope_back_f16( static void lm_ggml_compute_forward_rope_back( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_back_f16(params, src0, dst); + lm_ggml_compute_forward_rope_back_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_back_f32(params, src0, dst); + lm_ggml_compute_forward_rope_back_f32(params, src0, src1, dst); } break; default: { @@ -13150,7 +13648,7 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13241,7 +13739,7 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13353,7 +13851,7 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13444,7 +13942,7 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13562,7 +14060,7 @@ static void lm_ggml_compute_forward_conv_1d( lm_ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); } else { LM_GGML_ASSERT(false); // only stride 1 and 2 supported - }; + } } // lm_ggml_compute_forward_conv_2d @@ -13579,7 +14077,7 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = lm_ggml_perf_time_us(); 
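Editor's note: both rope paths above now read the absolute position p from the src1 tensor instead of deriving it from n_past. A minimal self-contained C reference of the per-row rotation they apply (illustrative values only):

    /* Rotate consecutive pairs of a row by theta = freq_scale * p * freq_base^(-2*i/n_dims). */
    #include <math.h>
    #include <stdio.h>

    static void rope_row(float * x, int n_dims, int p, float freq_base, float freq_scale) {
        float theta = freq_scale * (float) p;
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float c = cosf(theta), s = sinf(theta);
            const float x0 = x[i0], x1 = x[i0 + 1];
            x[i0]     = x0 * c - x1 * s;
            x[i0 + 1] = x0 * s + x1 * c;
            theta *= theta_scale;
        }
    }

    int main(void) {
        float row[4] = {1.0f, 0.0f, 1.0f, 0.0f};
        rope_row(row, 4, /*p=*/3, 10000.0f, 1.0f);
        printf("%f %f %f %f\n", row[0], row[1], row[2], row[3]);
        return 0;
    }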
UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13699,7 +14197,7 @@ static void lm_ggml_compute_forward_conv_transpose_2d( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13958,7 +14456,7 @@ static void lm_ggml_compute_forward_upscale_f32( const int ith = params->ith; - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const int scale_factor = dst->op_params[0]; @@ -14010,14 +14508,14 @@ static void lm_ggml_compute_forward_flash_attn_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14087,10 +14585,11 @@ static void lm_ggml_compute_forward_flash_attn_f32( S[i] = -INFINITY; } - for (int64_t ic = 0; ic < nek1; ++ic) { + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14103,20 +14602,18 @@ static void lm_ggml_compute_forward_flash_attn_f32( } // scale - lm_ggml_vec_scale_f32(nek1, S, scale); + lm_ggml_vec_scale_f32(masked_begin, S, scale); - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; } // softmax + // exclude known -INF S[..] 
values from max and loop + // dont forget to set their SW values to zero { float max = -INFINITY; - lm_ggml_vec_max_f32(M, &max, S); + lm_ggml_vec_max_f32(masked_begin, &max, S); lm_ggml_float sum = 0.0; { @@ -14130,10 +14627,15 @@ static void lm_ggml_compute_forward_flash_attn_f32( lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } float * SS = S + i; for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { + if (i + j >= masked_begin) { + break; + } else if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { #ifndef LM_GGML_FLASH_ATTN_EXP_FP16 @@ -14158,10 +14660,10 @@ static void lm_ggml_compute_forward_flash_attn_f32( assert(sum > 0.0); sum = 1.0/sum; - lm_ggml_vec_scale_f32(M, S, sum); + lm_ggml_vec_scale_f32(masked_begin, S, sum); #ifndef NDEBUG - for (int i = 0; i < M; ++i) { + for (int i = 0; i < masked_begin; ++i) { assert(!isnan(S[i])); assert(!isinf(S[i])); } @@ -14174,9 +14676,13 @@ static void lm_ggml_compute_forward_flash_attn_f32( const int i2 = iq2; const int i3 = iq3; - lm_ggml_vec_dot_f32(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + lm_ggml_vec_dot_f32(masked_begin, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S); } } @@ -14192,14 +14698,14 @@ static void lm_ggml_compute_forward_flash_attn_f16( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14273,7 +14779,7 @@ static void lm_ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14288,7 +14794,7 @@ static void lm_ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ic += LM_GGML_VEC_DOT_UNROLL) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14313,6 +14819,8 @@ static void lm_ggml_compute_forward_flash_attn_f16( } // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. + // dont forget to set their S values to zero { float max = -INFINITY; lm_ggml_vec_max_f32(M, &max, S); @@ -14369,6 +14877,7 @@ static void lm_ggml_compute_forward_flash_attn_f16( S16[i] = LM_GGML_FP32_TO_FP16(S[i]); } + // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). 
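
The flash-attention hunks above replace the per-element -INF masking with a single cutoff index, masked_begin = P + iq1 + 1 in the causal case, so the scale, the softmax max/sum and the final dot against V only run over the prefix that can be non-zero. Below is a self-contained sketch of such a truncated softmax; masked_softmax_sketch is a hypothetical helper that assumes masked_begin >= 1 and is separate from the actual lm_ggml vector routines.

#include <math.h>
#include <stddef.h>
#include <stdio.h>

// softmax over the first masked_begin scores; everything past the cutoff is
// known to be masked (-INF) and is simply written as zero
static void masked_softmax_sketch(float * S, size_t M, size_t masked_begin) {
    float max = -INFINITY;
    for (size_t i = 0; i < masked_begin; ++i) {
        if (S[i] > max) max = S[i];
    }
    double sum = 0.0;
    for (size_t i = 0; i < masked_begin; ++i) {
        S[i] = expf(S[i] - max);
        sum += S[i];
    }
    for (size_t i = 0; i < masked_begin; ++i) {
        S[i] /= (float) sum;
    }
    for (size_t i = masked_begin; i < M; ++i) {
        S[i] = 0.0f;   // masked scores contribute nothing to the V dot product
    }
}

int main(void) {
    float S[4] = { 0.5f, 1.5f, 9.0f, 9.0f };   // last two are "future" positions
    masked_softmax_sketch(S, 4, 2);
    printf("%.3f %.3f %.3f %.3f\n", S[0], S[1], S[2], S[3]);
    return 0;
}
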
if (LM_GGML_VEC_DOT_UNROLL == 1 || (nev1 % LM_GGML_VEC_DOT_UNROLL != 0)) { for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices @@ -14376,9 +14885,13 @@ static void lm_ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - lm_ggml_vec_dot_f16(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (lm_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + lm_ggml_vec_dot_f16(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (lm_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } else { @@ -14388,9 +14901,13 @@ static void lm_ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - lm_ggml_vec_dot_f16_unroll(nek1, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + lm_ggml_vec_dot_f16_unroll(nev0, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } @@ -14433,18 +14950,18 @@ static void lm_ggml_compute_forward_flash_ff_f16( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_LOCALS(int64_t, nea, a, ne); - LM_GGML_TENSOR_LOCALS(size_t, nba, a, nb); - LM_GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); - LM_GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, nea, a, ne) + LM_GGML_TENSOR_LOCALS(size_t, nba, a, nb) + LM_GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) + LM_GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14592,16 +15109,16 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); - LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - LM_GGML_TENSOR_LOCALS(int64_t, ned, d, ne); - LM_GGML_TENSOR_LOCALS(size_t, nbd, d, nb); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + LM_GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + LM_GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + LM_GGML_TENSOR_LOCALS(int64_t, ned, d, ne) + LM_GGML_TENSOR_LOCALS(size_t, nbd, d, nb) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + LM_GGML_TENSOR_LOCALS(size_t, nb, dst, nb) 
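
The new ik2 = iq2 % nek2 and iv2 = iq2 % nev2 indices let K/V tensors with fewer heads than Q be broadcast across the query heads; the backward pass expresses the same relationship as nrep = neq2/nek2 repetitions per K/V head. A toy illustration of that mapping, with made-up head counts:

#include <stdio.h>

int main(void) {
    const int neq2 = 8;                 // query heads (hypothetical sizes)
    const int nek2 = 2;                 // key/value heads
    const int nrep = neq2 / nek2;       // how often each K/V head is reused

    for (int iq2 = 0; iq2 < neq2; ++iq2) {
        printf("q head %d -> kv head %d\n", iq2, iq2 % nek2);
    }
    printf("each kv head serves %d query heads\n", nrep);
    return 0;
}
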
const int ith = params->ith; const int nth = params->nth; @@ -14649,10 +15166,37 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( return; } - // parallelize by q rows using lm_ggml_vec_dot_f32 + const int64_t elem_q = lm_ggml_nelements(q); + const int64_t elem_k = lm_ggml_nelements(k); - // total rows in q - const int nr = neq2*neq3; + enum lm_ggml_type result_type = dst->type; + LM_GGML_ASSERT(lm_ggml_blck_size(result_type) == 1); + const size_t tsize = lm_ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + LM_GGML_PAD(elem_q * tsize, LM_GGML_MEM_ALIGN); + const size_t offs_v = offs_k + LM_GGML_PAD(elem_k * tsize, LM_GGML_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using lm_ggml_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14665,268 +15209,243 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + for (int ir = ir0; ir < ir1; ++ir) { // q indices - const int iq3 = ir/(neq2); - const int iq2 = ir - iq3*neq2; - for ( int iq1 = 0; iq1 < neq1; ++iq1) { + const int ik3 = ir/(nek2); + const int ik2 = ir - ik3*nek2; + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - // S indices - const int i1 = ik1; + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } - lm_ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + const int64_t masked_begin = masked ? 
(P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik1 = ic; - // scale - lm_ggml_vec_scale_f32(nek1, S, scale); + // S indices + const int i1 = ik1; - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } + lm_ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } - } - // softmax - { - float max = -INFINITY; - lm_ggml_vec_max_f32(M, &max, S); + // scale + lm_ggml_vec_scale_f32(masked_begin, S, scale); - lm_ggml_float sum = 0.0; + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SM values to zero { + float max = -INFINITY; + lm_ggml_vec_max_f32(masked_begin, &max, S); + + lm_ggml_float sum = 0.0; + { #ifdef LM_GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - lm_ggml_vec_sum_f32(Mup, &sum, SM); + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + lm_ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[LM_GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { - float * SR = S + i; - float * SW = SM + i; + uint16_t scvt[LM_GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + lm_ggml_float sump[LM_GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { - if (SR[j] == -INFINITY) { - SW[j] = 0.0f; - } else { + for (int i = 0; i < Mup; i += LM_GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < LM_GGML_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { #ifndef LM_GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SR[j] - max); + const float val = expf(SR[j] - max); #else - lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SR[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); #endif - sump[j] += (lm_ggml_float)val; - SW[j] = val; + sump[j] += (lm_ggml_float)val; + SW[j] = val; + } } } - } - for (int i = 0; i < LM_GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + for (int i = 0; i < LM_GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } #endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - lm_ggml_vec_scale_f32(M, SM, sum); - - } - - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for iq2,iq3: - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf [D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - 
dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i], S4) - // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // in post-order: - // - // S1 = qcur @ kcur.T - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur @ kcur.T * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - } - - // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur - // S = d[:D,iq1,iq2,iq3] @ vcur - // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] - lm_ggml_vec_set_f32(M, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + } - lm_ggml_vec_mad_f32(M, - S, - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); - } + assert(sum > 0.0); - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - lm_ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - lm_ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - lm_ggml_vec_mul_f32 (M, S, S, SM); + sum = 1.0/sum; + lm_ggml_vec_scale_f32(masked_begin, SM, sum); - // S = diag_mask_zero(S, P) * scale - if (masked) { - // for (int64_t i = P + iq1 + 1; i < M; i++) { - // S[i] = 0; - // } - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = 0; - } } - } - lm_ggml_vec_scale_f32(M, S, scale); - - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; - void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; - - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; - - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; - - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; - - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] - // - //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) - //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - lm_ggml_vec_mad_f32(D, - (float *) ((char *) grad_q + (i1*nbgq1 + 
i2*nbgq2 + i3*nbgq3)), - (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), - S[ic]); - } + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + // exclude known future zero S[..] 
values from operation + lm_ggml_vec_set_f32(masked_begin, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + lm_ggml_vec_mad_f32(masked_begin, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } - // lm_ggml_vec_set_f32(D, - // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - // 0); - lm_ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), - S[ic]); - } + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + lm_ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + lm_ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + lm_ggml_vec_mul_f32 (masked_begin, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + // already done by above lm_ggml_vec_set_f32 + + // exclude known zero S[..] values from operation + lm_ggml_vec_scale_f32(masked_begin, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + lm_ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + lm_ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } - // lm_ggml_vec_set_f32(M, - // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - // 0); - lm_ggml_vec_mad_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // exclude known zero SM[..] 
values from mad + for (int64_t ic = 0; ic < D; ++ic) { + lm_ggml_vec_mad_f32(masked_begin, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } } } } @@ -14962,8 +15481,8 @@ static void lm_ggml_compute_forward_win_part_f32( return; } - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; @@ -15024,8 +15543,8 @@ static void lm_ggml_compute_forward_win_unpart_f32( return; } - LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + LM_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + LM_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t w = ((const int32_t *)(dst->op_params))[0]; @@ -15142,7 +15661,7 @@ static void lm_ggml_compute_forward_get_rel_pos_f16( // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - LM_GGML_TENSOR_UNARY_OP_LOCALS; + LM_GGML_TENSOR_UNARY_OP_LOCALS const int64_t w = ne1; @@ -15840,7 +16359,7 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_GET_ROWS_BACK: { - lm_ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + lm_ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_DIAG: { @@ -15864,11 +16383,11 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_ROPE: { - lm_ggml_compute_forward_rope(params, tensor->src[0], tensor); + lm_ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_ROPE_BACK: { - lm_ggml_compute_forward_rope_back(params, tensor->src[0], tensor); + lm_ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case LM_GGML_OP_ALIBI: { @@ -16013,7 +16532,218 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru //////////////////////////////////////////////////////////////////////////////// -static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, bool inplace) { +static_assert(LM_GGML_GRAPH_HASHTABLE_SIZE > LM_GGML_MAX_NODES * 2, "LM_GGML_GRAPH_HT_SIZE is too small"); + +static size_t hash(void * p) { + return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; +} + +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return LM_GGML_GRAPH_HASHTABLE_SIZE; + } + } + return i; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + + LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + + if (hash_table[i] == p) { + return true; + } + + // insert + LM_GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < LM_GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} + +struct hash_map { + void * 
keys[LM_GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[LM_GGML_GRAPH_HASHTABLE_SIZE]; +}; + +static struct hash_map * new_hash_map(void) { + struct hash_map * result = malloc(sizeof(struct hash_map)); + for (int i=0; ikeys[i] = NULL; + result->vals[i] = NULL; + } + return result; +} + +static void free_hash_map(struct hash_map * map) { + free(map); +} + +// gradient checkpointing + +static struct lm_ggml_tensor * lm_ggml_recompute_graph_node( + struct lm_ggml_context * ctx, + struct lm_ggml_cgraph * graph, + struct hash_map * replacements, + struct lm_ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < LM_GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + LM_GGML_ASSERT(i < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct lm_ggml_tensor *) replacements->vals[i]; + } + + struct lm_ggml_tensor * clone = lm_ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + LM_GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < LM_GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < LM_GGML_MAX_SRC; ++k) { + clone->src[k] = lm_ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (node->view_src != NULL) { + clone->data = (node->view_src->data == NULL) + ? 
NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; + } + + LM_GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (LM_GGML_MAX_OP_PARAMS / sizeof(int32_t))); + LM_GGML_ASSERT(sizeof(node->name) == LM_GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + lm_ggml_format_name(clone, "%s (clone)", lm_ggml_get_name(node)); + + return clone; +} + +void lm_ggml_build_backward_gradient_checkpointing( + struct lm_ggml_context * ctx, + struct lm_ggml_cgraph * gf, + struct lm_ggml_cgraph * gb, + struct lm_ggml_cgraph * gb_tmp, + struct lm_ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + lm_ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + LM_GGML_ASSERT(k < LM_GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + LM_GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; in_nodes; ++i) { + struct lm_ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < LM_GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. 
terminating when) input tensors are replacments (like checkpoints) + node->src[k] = lm_ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + lm_ggml_build_forward_expand(gb, node); + } + + free_hash_map(replacements); +} + +// functions to change gradients considering the case that input a might be initial gradient with zero value + +static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return b; + } else { + return lm_ggml_add_impl(ctx, a, b, false); + } +} + +static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + struct lm_ggml_tensor * a_zero = lm_ggml_scale(ctx, a, lm_ggml_new_f32(ctx, 0)); + return lm_ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); + } else { + return lm_ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + } +} + +static struct lm_ggml_tensor * lm_ggml_add1_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return lm_ggml_repeat(ctx, b, a); + } else { + return lm_ggml_add1_impl(ctx, a, b, false); + } +} + +static struct lm_ggml_tensor * lm_ggml_sub_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return lm_ggml_neg(ctx, b); + } else { + return lm_ggml_sub_impl(ctx, a, b, false); + } +} + +static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor, void * zero_table[]) { struct lm_ggml_tensor * src0 = tensor->src[0]; struct lm_ggml_tensor * src1 = tensor->src[1]; @@ -16021,34 +16751,34 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm case LM_GGML_OP_DUP: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case LM_GGML_OP_ADD: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = lm_ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = lm_ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case LM_GGML_OP_ADD1: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = lm_ggml_add_impl(ctx, + src1->grad = lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean - inplace); + zero_table); } } break; case LM_GGML_OP_ACC: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { const size_t nb1 = ((int32_t *) tensor->op_params)[0]; @@ -16065,117 +16795,117 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm nb1, nb2, nb3, offset); src1->grad = - lm_ggml_add_impl(ctx, 
+ lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_reshape(ctx, lm_ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_SUB: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = lm_ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = lm_ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case LM_GGML_OP_MUL: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_mul(ctx, src1, tensor->grad), - inplace); + zero_table); } if (src1->grad) { src1->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_mul(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_DIV: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_div(ctx, tensor->grad, src1), - inplace); + zero_table); } if (src1->grad) { src1->grad = - lm_ggml_sub_impl(ctx, + lm_ggml_sub_or_set(ctx, src1->grad, lm_ggml_mul(ctx, tensor->grad, lm_ggml_div(ctx, tensor, src1)), - inplace); + zero_table); } } break; case LM_GGML_OP_SQR: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_scale(ctx, lm_ggml_mul(ctx, src0, tensor->grad), lm_ggml_new_f32(ctx, 2.0f)), - inplace); + zero_table); } } break; case LM_GGML_OP_SQRT: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_scale(ctx, lm_ggml_div(ctx, tensor->grad, tensor), lm_ggml_new_f32(ctx, 0.5f)), - inplace); + zero_table); } } break; case LM_GGML_OP_LOG: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_div(ctx, tensor->grad, src0), - inplace); + zero_table); } } break; case LM_GGML_OP_SUM: { if (src0->grad) { src0->grad = - lm_ggml_add1_impl(ctx, + lm_ggml_add1_or_set(ctx, src0->grad, tensor->grad, - inplace); + zero_table); } } break; case LM_GGML_OP_SUM_ROWS: { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_MEAN: @@ -16187,20 +16917,20 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { // necessary for llama if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_repeat_back(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_REPEAT_BACK: { if (src0->grad) { // TODO: test this - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_CONCAT: @@ -16222,10 +16952,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm float eps; memcpy(&eps, tensor->op_params, sizeof(float)); - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_rms_norm_back(ctx, src0, tensor->grad, eps), - inplace); + zero_table); } } break; case LM_GGML_OP_RMS_NORM_BACK: @@ -16249,37 +16979,49 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix // ds1 = t.T.dot(dt) - // tensor.shape [m,p] - // src0.shape [n,m] - // src1.shape [n,p] + // 
tensor.shape [m,p,qq,rr] + // src0.shape [n,m,q1,r1] + // src1.shape [n,p,qq,rr] // necessary for llama if (src0->grad) { + struct lm_ggml_tensor * s1_tg = + lm_ggml_out_prod(ctx, // [n,m,qq,rr] + src1, // [n,p,qq,rr] + tensor->grad); // [m,p,qq,rr] + const int64_t qq = s1_tg->ne[2]; + const int64_t rr = s1_tg->ne[3]; + const int64_t q1 = src0->ne[2]; + const int64_t r1 = src0->ne[3]; + const bool ne2_broadcasted = qq > q1; + const bool ne3_broadcasted = rr > r1; + if (ne2_broadcasted || ne3_broadcasted) { + // sum broadcast repetitions of s1_tg into shape of src0 + s1_tg = lm_ggml_repeat_back(ctx, s1_tg, src0); + } src0->grad = - lm_ggml_add_impl(ctx, - src0->grad, - lm_ggml_out_prod(ctx, // [n,m] - src1, // [n,p] - tensor->grad), // [m,p] - inplace); + lm_ggml_add_or_set(ctx, + src0->grad, // [n,m,q1,r1] + s1_tg, // [n,m,q1,r1] + zero_table); } if (src1->grad) { src1->grad = - lm_ggml_add_impl(ctx, - src1->grad, - // lm_ggml_mul_mat(ctx, // [n,p] - // lm_ggml_cont(ctx, // [m,n] - // lm_ggml_transpose(ctx, src0)), // [m,n] - // tensor->grad), // [m,p] + lm_ggml_add_or_set(ctx, + src1->grad, // [n,p,qq,rr] + // lm_ggml_mul_mat(ctx, // [n,p,qq,rr] + // lm_ggml_cont(ctx, // [m,n,q1,r1] + // lm_ggml_transpose(ctx, src0)), // [m,n,q1,r1] + // tensor->grad), // [m,p,qq,rr] // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad // // and then use lm_ggml_out_prod - lm_ggml_out_prod(ctx, // [n,p] - src0, // [n,m] - lm_ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] - inplace); + lm_ggml_out_prod(ctx, // [n,p,qq,rr] + src0, // [n,m,q1,r1] + lm_ggml_transpose(ctx, // [p,m,qq,rr] + tensor->grad)), // [m,p,qq,rr] + zero_table); } } break; case LM_GGML_OP_OUT_PROD: @@ -16291,17 +17033,17 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_scale_impl(ctx, tensor->grad, src1, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_sum(ctx, lm_ggml_mul_impl(ctx, tensor->grad, src0, false)), - inplace); + zero_table); } } break; case LM_GGML_OP_SET: @@ -16328,23 +17070,23 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm } if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_acc_impl(ctx, tensor->grad, lm_ggml_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_reshape(ctx, lm_ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_CPY: @@ -16355,7 +17097,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { // dsrc1 = dtensor * 0 -> noop @@ -16367,7 +17109,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm if (src0->grad) { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0->grad)); LM_GGML_ASSERT(lm_ggml_is_contiguous(tensor->grad)); - src0->grad = lm_ggml_add_impl(ctx, src0->grad, tensor->grad, 
inplace); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case LM_GGML_OP_RESHAPE: @@ -16375,9 +17117,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, src0->grad, - lm_ggml_reshape(ctx, tensor->grad, src0->grad), - inplace); + lm_ggml_add_or_set(ctx, src0->grad, + lm_ggml_reshape(ctx, + lm_ggml_is_contiguous(tensor->grad) + ? tensor->grad + : lm_ggml_cont(ctx, tensor->grad), + src0->grad), + zero_table); } } break; case LM_GGML_OP_VIEW: @@ -16406,7 +17152,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm nb3 = (nb3 / n0) * ng; } - src0->grad = lm_ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + src0->grad = lm_ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); } } break; case LM_GGML_OP_PERMUTE: @@ -16424,14 +17170,14 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm axes_backward[axis2] = 2; axes_backward[axis3] = 3; src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_permute(ctx, tensor->grad, axes_backward[0], axes_backward[1], axes_backward[2], axes_backward[3]), - inplace); + zero_table); } } break; case LM_GGML_OP_TRANSPOSE: @@ -16439,9 +17185,9 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_transpose(ctx, tensor->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_GET_ROWS: @@ -16449,9 +17195,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama (only for tokenizer) if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, + // last lm_ggml_get_rows_back argument src0->grad is only + // necessary to setup correct output shape lm_ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), - inplace); + zero_table); } if (src1->grad) { // noop @@ -16471,9 +17219,9 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case LM_GGML_OP_DIAG_MASK_ZERO: @@ -16482,9 +17230,9 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case LM_GGML_OP_SOFT_MAX: @@ -16492,9 +17240,9 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, src0->grad, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_soft_max_back(ctx, tensor->grad, tensor), - inplace); + zero_table); } } break; @@ -16506,7 +17254,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { // necessary for llama if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) 
tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16519,11 +17267,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_rope_back(ctx, tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, @@ -16531,13 +17279,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm freq_scale, xpos_base, xpos_down), - inplace); + zero_table); } } break; case LM_GGML_OP_ROPE_BACK: { if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16550,11 +17298,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_rope_impl(ctx, tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, @@ -16563,7 +17311,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm xpos_base, xpos_down, false), - inplace); + zero_table); } } break; case LM_GGML_OP_ALIBI: @@ -16614,145 +17362,42 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm masked); } - if (src0->grad) { - struct lm_ggml_tensor * grad_q = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = 0; - switch(src0->n_dims) { - case 2: - { - grad_q = lm_ggml_view_2d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - nb0*src0->ne[0], - offset); - } break; - case 3: - { - grad_q = lm_ggml_view_3d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - offset); - } break; - case 4: - { - grad_q = lm_ggml_view_4d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - src0->ne[3], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], - offset); - } break; - } + struct lm_ggml_tensor * src2 = tensor->src[2]; + const int64_t elem_q = lm_ggml_nelements(src0); + const int64_t elem_k = lm_ggml_nelements(src1); + const int64_t elem_v = lm_ggml_nelements(src2); + + enum lm_ggml_type result_type = flash_grad->type; + LM_GGML_ASSERT(lm_ggml_blck_size(result_type) == 1); + const size_t tsize = lm_ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + LM_GGML_PAD(elem_q * tsize, LM_GGML_MEM_ALIGN); + const size_t offs_v = offs_k + LM_GGML_PAD(elem_k * tsize, LM_GGML_MEM_ALIGN); - src0->grad = lm_ggml_add_impl(ctx, + if (src0->grad) { + struct lm_ggml_tensor * view_q = lm_ggml_view_1d(ctx, flash_grad, elem_q, offs_q); + struct lm_ggml_tensor * grad_q = lm_ggml_reshape(ctx, view_q, src0); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, grad_q, - inplace); + zero_table); } - if (src1->grad) { - struct lm_ggml_tensor * grad_k = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = 
nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; - switch(src1->n_dims) { - case 2: - { - grad_k = lm_ggml_view_2d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - nb0*src1->ne[0], - offset); - } break; - case 3: - { - grad_k = lm_ggml_view_3d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - src1->ne[2], - nb0*src1->ne[0], - nb0*src1->ne[0]*src1->ne[1], - offset); - } break; - case 4: - { - grad_k = lm_ggml_view_4d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - src1->ne[2], - src1->ne[3], - nb0*src1->ne[0], - nb0*src1->ne[0]*src1->ne[1], - nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], - offset); - } break; - } - - src1->grad = lm_ggml_add_impl(ctx, + struct lm_ggml_tensor * view_k = lm_ggml_view_1d(ctx, flash_grad, elem_k, offs_k); + struct lm_ggml_tensor * grad_k = lm_ggml_reshape(ctx, view_k, src1); + src1->grad = lm_ggml_add_or_set(ctx, src1->grad, grad_k, - inplace); + zero_table); } - - struct lm_ggml_tensor * opt0 = tensor->src[2]; - - if (opt0->grad) { - struct lm_ggml_tensor * grad_v = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] - + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; - switch(opt0->n_dims) { - case 2: - { - grad_v = lm_ggml_view_2d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - nb0*opt0->ne[0], - offset); - } break; - case 3: - { - grad_v = lm_ggml_view_3d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - opt0->ne[2], - nb0*opt0->ne[0], - nb0*opt0->ne[0]*opt0->ne[1], - offset); - } break; - case 4: - { - grad_v = lm_ggml_view_4d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - opt0->ne[2], - opt0->ne[3], - nb0*opt0->ne[0], - nb0*opt0->ne[0]*opt0->ne[1], - nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], - offset); - } break; - } - - opt0->grad = lm_ggml_add_impl(ctx, - opt0->grad, + if (src2->grad) { + struct lm_ggml_tensor * view_v = lm_ggml_view_1d(ctx, flash_grad, elem_v, offs_v); + struct lm_ggml_tensor * grad_v = lm_ggml_reshape(ctx, view_v, src2); + src2->grad = lm_ggml_add_or_set(ctx, + src2->grad, grad_v, - inplace); + zero_table); } } break; case LM_GGML_OP_FLASH_FF: @@ -16772,12 +17417,12 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { if (src0->grad) { src0->grad = - lm_ggml_add_impl(ctx, + lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_mul(ctx, lm_ggml_sgn(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case LM_GGML_UNARY_OP_SGN: @@ -16789,7 +17434,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm case LM_GGML_UNARY_OP_NEG: { if (src0->grad) { - src0->grad = lm_ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = lm_ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case LM_GGML_UNARY_OP_STEP: @@ -16809,12 +17454,12 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm case LM_GGML_UNARY_OP_RELU: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_mul(ctx, lm_ggml_step(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case LM_GGML_UNARY_OP_GELU: @@ -16829,10 +17474,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { // necessary for llama if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_silu_back(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; default: @@ -16855,13 +17500,13 @@ static void lm_ggml_compute_backward(struct 
lm_ggml_context * ctx, struct lm_ggm case LM_GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { - src0->grad = lm_ggml_add_impl(ctx, + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, lm_ggml_cross_entropy_loss_back(ctx, src0, src1, tensor->grad), - inplace); + zero_table); } } break; case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK: @@ -16877,34 +17522,12 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm LM_GGML_ASSERT(false); } break; } -} - -static_assert(LM_GGML_GRAPH_HASHTABLE_SIZE > LM_GGML_MAX_NODES * 2, "LM_GGML_GRAPH_HT_SIZE is too small"); - -static size_t hash(void * p) { - return (size_t)p % LM_GGML_GRAPH_HASHTABLE_SIZE; -} -static bool hash_insert(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % LM_GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - LM_GGML_ASSERT(false); + for (int i = 0; i < LM_GGML_MAX_SRC; ++i) { + if (tensor->src[i] && tensor->src[i]->grad) { + LM_GGML_ASSERT(lm_ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad)); } } - - if (hash_table[i] == p) { - return true; - } - - // insert - hash_table[i] = p; - return false; } static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * node) { @@ -16922,8 +17545,12 @@ static void lm_ggml_visit_parents(struct lm_ggml_cgraph * cgraph, struct lm_ggml } for (int i = 0; i < LM_GGML_MAX_SRC; ++i) { - if (node->src[i]) { - lm_ggml_visit_parents(cgraph, node->src[i]); + const int k = + (cgraph->order == LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : + (cgraph->order == LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (LM_GGML_MAX_SRC-1-i) : + /* unknown order, just fall back to using i*/ i; + if (node->src[k]) { + lm_ggml_visit_parents(cgraph, node->src[k]); } } @@ -16982,6 +17609,7 @@ struct lm_ggml_cgraph lm_ggml_build_forward(struct lm_ggml_tensor * tensor) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -17007,12 +17635,22 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ } } + // remember original gradients which start with zero values + void ** zero_table = malloc(sizeof(void *) * LM_GGML_GRAPH_HASHTABLE_SIZE); + memset(zero_table, 0, sizeof(void*) * LM_GGML_GRAPH_HASHTABLE_SIZE); + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->grads[i]) { + hash_insert(zero_table, gf->grads[i]); + } + } + for (int i = gf->n_nodes - 1; i >= 0; i--) { struct lm_ggml_tensor * node = gf->nodes[i]; - // because we detached the grad nodes from the original graph, we can afford inplace operations + // inplace operations to add gradients are not created by lm_ggml_compute_backward + // use allocator to automatically make inplace operations if (node->grad) { - lm_ggml_compute_backward(ctx, node, keep); + lm_ggml_compute_backward(ctx, node, zero_table); } } @@ -17024,6 +17662,8 @@ void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_ lm_ggml_build_forward_expand(gb, node->grad); } } + + free(zero_table); } struct lm_ggml_cgraph lm_ggml_build_backward(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, bool keep) { @@ -17043,6 +17683,7 @@ struct lm_ggml_cgraph * lm_ggml_new_graph(struct lm_ggml_context * ctx) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, 
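
The graph code above introduces an open-addressing pointer table (hash_find/hash_insert/hash_contains with linear probing) and uses it as a zero_table so that lm_ggml_compute_backward can replace a gradient on its first contribution instead of adding into an all-zero tensor. The compact standalone sketch below combines both ideas under stated assumptions: a small TABLE_SIZE standing in for LM_GGML_GRAPH_HASHTABLE_SIZE, and plain floats standing in for tensors.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define TABLE_SIZE 31   // stand-in; must exceed the number of entries stored

// linear-probing slot lookup: returns the slot holding p, the first free slot,
// or TABLE_SIZE when the table is full and p is absent
static size_t find_slot(void * table[], const void * p) {
    const size_t h = (size_t) p % TABLE_SIZE;
    size_t i = h;
    while (table[i] != NULL && table[i] != p) {
        i = (i + 1) % TABLE_SIZE;
        if (i == h) {
            return TABLE_SIZE;
        }
    }
    return i;
}

static void set_insert(void * table[], void * p) {
    const size_t i = find_slot(table, p);
    if (i < TABLE_SIZE && table[i] == NULL) {
        table[i] = p;
    }
}

static bool set_contains(void * table[], void * p) {
    const size_t i = find_slot(table, p);
    return i < TABLE_SIZE && table[i] == p;
}

int main(void) {
    void * zero_table[TABLE_SIZE] = { NULL };

    float grad_a = 0.0f, grad_b = 7.0f;
    set_insert(zero_table, &grad_a);   // grad_a still holds its initial zero value

    // "add or set": skip the accumulation for gradients known to be zero
    const float contribution = 1.5f;
    grad_a = set_contains(zero_table, &grad_a) ? contribution : grad_a + contribution;
    grad_b = set_contains(zero_table, &grad_b) ? contribution : grad_b + contribution;

    printf("grad_a = %.2f, grad_b = %.2f\n", grad_a, grad_b);   // 1.50, 8.50
    return 0;
}
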
/*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -17433,7 +18074,6 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } break; case LM_GGML_OP_CONCAT: case LM_GGML_OP_MUL_MAT: - case LM_GGML_OP_OUT_PROD: { n_tasks = n_threads; @@ -17475,6 +18115,18 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th cur = 0; } + work_size = MAX(work_size, cur); + } break; + case LM_GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (lm_ggml_is_quantized(node->src[0]->type)) { + cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + work_size = MAX(work_size, cur); } break; case LM_GGML_OP_SCALE: @@ -18568,7 +19220,7 @@ static void lm_ggml_opt_get_params(int np, struct lm_ggml_tensor * const ps[], f } static void lm_ggml_opt_get_grad(int np, struct lm_ggml_tensor * const ps[], float * g) { - int i = 0; + int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = lm_ggml_nelements(ps[p]) ; // TODO: add function to get all elements at once @@ -18578,6 +19230,17 @@ static void lm_ggml_opt_get_grad(int np, struct lm_ggml_tensor * const ps[], flo } } +static void lm_ggml_opt_acc_grad(int np, struct lm_ggml_tensor * const ps[], float * g, float scale) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = lm_ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] += lm_ggml_get_f32_1d(ps[p]->grad, j) * scale; + } + } +} + // // ADAM // @@ -18626,26 +19289,43 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( const float eps = params.adam.eps; const float gclip = params.adam.gclip; const int decay_min_ndim = params.adam.decay_min_ndim; + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float * g = opt->adam.g->data; // gradients float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - if (callback) { - callback(callback_data, &sched); - } - - // compute the function value - lm_ggml_graph_reset (gf); - lm_ggml_set_f32 (f->grad, 1.0f); - struct lm_ggml_cplan cplan = lm_ggml_graph_plan(gb, params.n_threads); struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - lm_ggml_graph_compute(gb, &cplan); - opt->adam.fx_prev = lm_ggml_get_f32_1d(f, 0); + bool cancel = false; + + // compute the function value + float fx = 0; + lm_ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // lm_ggml_graph_reset (gf); + lm_ggml_set_f32 (f->grad, 1.0f); + lm_ggml_graph_compute(gb, &cplan); + lm_ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += lm_ggml_get_f32_1d(f, 0); + } + if (cancel) { + return LM_GGML_OPT_DID_NOT_CONVERGE; + } + fx *= accum_norm; + + opt->adam.fx_prev = fx; opt->adam.fx_best = opt->adam.fx_prev; if (pf) { pf[opt->iter % params.past] = opt->adam.fx_prev; @@ -18668,6 +19348,9 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + if (cancel) { + break; + } opt->iter = iter0 + t + 1; LM_GGML_PRINT_DEBUG ("=== iter %d ===\n", t); @@ -18690,12 +19373,8 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( if (gclip > 0.0f) { // gradient clipping lm_ggml_float sum = 0.0; - for (int p = 0; p < np; ++p) { - const int64_t ne = lm_ggml_nelements(ps[p]); - for (int64_t j = 0; j < ne; ++j) { - float g = lm_ggml_get_f32_1d(ps[p]->grad, j); - sum += (lm_ggml_float)(g*g); - } + for (int64_t i = 0; i < nx; ++i) { + sum += (lm_ggml_float)(g[i]*g[i]); } lm_ggml_float norm = sqrt(sum); if (norm > (lm_ggml_float) gclip) { @@ -18709,10 +19388,10 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( const int64_t ne = lm_ggml_nelements(ps[p]); const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { - float x = lm_ggml_get_f32_1d(ps[p], j); - float g = lm_ggml_get_f32_1d(ps[p]->grad, j)*gnorm; - m[i] = m[i]*beta1 + g*(1.0f - beta1); - v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float x = lm_ggml_get_f32_1d(ps[p], j); + float g_ = g[i]*gnorm; + m[i] = m[i]*beta1 + g_*(1.0f - beta1); + v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); float mh = m[i]*beta1h; float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; @@ -18723,16 +19402,26 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( } } - if (callback) { - callback(callback_data, &sched); + fx = 0; + lm_ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // lm_ggml_graph_reset (gf); + lm_ggml_set_f32 (f->grad, 1.0f); + lm_ggml_graph_compute(gb, &cplan); + lm_ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += lm_ggml_get_f32_1d(f, 0); } + if (cancel) { + break; + } + fx *= accum_norm; - lm_ggml_graph_reset (gf); - lm_ggml_set_f32 (f->grad, 1.0f); - - lm_ggml_graph_compute(gb, &cplan); - - const float fx = lm_ggml_get_f32_1d(f, 0); opt->loss_after = fx; @@ -18812,11 +19501,11 @@ static enum lm_ggml_opt_result linesearch_backtracking( float * step, const float * xp, struct lm_ggml_tensor * f, - struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, struct lm_ggml_cplan * cplan, const int np, struct lm_ggml_tensor * ps[], + bool * cancel, lm_ggml_opt_callback callback, void * callback_data) { int count = 0; @@ -18830,6 +19519,9 @@ static enum lm_ggml_opt_result linesearch_backtracking( const float dec = 0.5f; const float inc = 2.1f; + const int n_accum = MAX(1, params->n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + if (*step <= 0.f) { return LM_GGML_LINESEARCH_INVALID_PARAMETERS; } @@ -18846,13 +19538,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( finit = *fx; dgtest = params->lbfgs.ftol*dginit; - while (true) { - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } - + while (!*cancel) { lm_ggml_vec_cpy_f32(nx, x, xp); lm_ggml_vec_mad_f32(nx, x, d, *step); @@ -18860,14 +19546,28 @@ static enum lm_ggml_opt_result linesearch_backtracking( { lm_ggml_opt_set_params(np, ps, x); - lm_ggml_graph_reset (gf); - lm_ggml_set_f32 (f->grad, 1.0f); - - lm_ggml_graph_compute(gb, cplan); - - lm_ggml_opt_get_grad(np, ps, g); + *fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, cancel); + if (*cancel) { + break; + } + } + // lm_ggml_graph_reset (gf); + lm_ggml_set_f32 (f->grad, 1.0f); + lm_ggml_graph_compute(gb, cplan); + lm_ggml_opt_acc_grad(np, ps, g, accum_norm); + *fx += lm_ggml_get_f32_1d(f, 0); + } + if (*cancel) { + break; + } + *fx *= accum_norm; - *fx = lm_ggml_get_f32_1d(f, 0); } ++count; @@ -18913,7 +19613,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( (*step) *= width; } - return LM_GGML_LINESEARCH_FAIL; + LM_GGML_UNREACHABLE(); } static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( @@ -18968,6 +19668,9 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( float * pf = params.past > 0 ? 
opt->lbfgs.pf->data : NULL; // past function values + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| @@ -18981,24 +19684,33 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } + bool cancel = false; // evaluate the function value and its gradient { lm_ggml_opt_set_params(np, ps, x); - lm_ggml_graph_reset (gf); - lm_ggml_set_f32 (f->grad, 1.0f); - - lm_ggml_graph_compute(gb, &cplan); - - lm_ggml_opt_get_grad(np, ps, g); - - fx = lm_ggml_get_f32_1d(f, 0); + fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // lm_ggml_graph_reset (gf); + lm_ggml_set_f32 (f->grad, 1.0f); + lm_ggml_graph_compute(gb, &cplan); + lm_ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += lm_ggml_get_f32_1d(f, 0); + } + if (cancel) { + return LM_GGML_OPT_DID_NOT_CONVERGE; + } + fx *= accum_norm; opt->loss_before = fx; opt->loss_after = fx; @@ -19056,7 +19768,10 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( lm_ggml_vec_cpy_f32(nx, xp, x); lm_ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + if (!cancel) { + break; + } if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -19165,7 +19880,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( step[0] = 1.0; } - return LM_GGML_OPT_DID_NOT_CONVERGE; + LM_GGML_UNREACHABLE(); } struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) { @@ -19185,6 +19900,8 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .adam = { .n_iter = 10000, .sched = 1.000f, @@ -19213,6 +19930,8 @@ struct lm_ggml_opt_params lm_ggml_opt_default_params(enum lm_ggml_opt_type type) .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .lbfgs = { .m = 6, .n_iter = 100, @@ -19243,13 +19962,32 @@ LM_GGML_API void lm_ggml_opt_init( opt->iter = 0; opt->nx = nx; opt->just_initialized = true; + if (opt->ctx == NULL) { + struct lm_ggml_init_params ctx_opt_params; + if (opt->params.type == LM_GGML_OPT_ADAM) { + ctx_opt_params.mem_size = LM_GGML_MEM_ALIGN*3 + lm_ggml_tensor_overhead()*3 + lm_ggml_type_size(LM_GGML_TYPE_F32)*nx*3; + if (opt->params.past > 0) { + ctx_opt_params.mem_size += LM_GGML_MEM_ALIGN + lm_ggml_tensor_overhead() + lm_ggml_type_size(LM_GGML_TYPE_F32)*opt->params.past; + } + } else if (opt->params.type == LM_GGML_OPT_LBFGS) { + ctx_opt_params.mem_size = LM_GGML_MEM_ALIGN*9 + lm_ggml_tensor_overhead()*9 + lm_ggml_type_size(LM_GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); + if (opt->params.past > 0) { + ctx_opt_params.mem_size += LM_GGML_MEM_ALIGN + lm_ggml_tensor_overhead() + 
lm_ggml_type_size(LM_GGML_TYPE_F32)*opt->params.past; + } + } + ctx_opt_params.mem_buffer = NULL; + ctx_opt_params.no_alloc = false; + + opt->ctx = lm_ggml_init(ctx_opt_params); + } switch (opt->params.type) { case LM_GGML_OPT_ADAM: { - opt->adam.m = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); - opt->adam.v = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); + opt->adam.g = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->adam.m = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->adam.v = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 - ? lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, params.past) + ? lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, params.past) : NULL; lm_ggml_set_zero(opt->adam.m); lm_ggml_set_zero(opt->adam.v); @@ -19259,18 +19997,18 @@ LM_GGML_API void lm_ggml_opt_init( } break; case LM_GGML_OPT_LBFGS: { - opt->lbfgs.x = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); - opt->lbfgs.xp = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); - opt->lbfgs.g = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); - opt->lbfgs.gp = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); - opt->lbfgs.d = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, nx); + opt->lbfgs.x = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->lbfgs.xp = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->lbfgs.g = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->lbfgs.gp = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); + opt->lbfgs.d = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, nx); opt->lbfgs.pf = params.past > 0 - ? lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, params.past) + ? lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, params.past) : NULL; - opt->lbfgs.lmal = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lmys = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lms = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, nx, params.lbfgs.m); - opt->lbfgs.lmy = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmal = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = lm_ggml_new_tensor_1d(opt->ctx, LM_GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = lm_ggml_new_tensor_2d(opt->ctx, LM_GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = lm_ggml_new_tensor_2d(opt->ctx, LM_GGML_TYPE_F32, nx, params.lbfgs.m); lm_ggml_set_zero(opt->lbfgs.x); lm_ggml_set_zero(opt->lbfgs.xp); lm_ggml_set_zero(opt->lbfgs.g); @@ -19876,10 +20614,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } break; case GGUF_TYPE_ARRAY: case GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); - }; + } if (!ok) { break; @@ -20155,78 +20893,94 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) { return keyfound; } -const char * gguf_get_key(const struct gguf_context * ctx, int i) { - return ctx->kv[i].key.data; +const char * gguf_get_key(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].key.data; } -enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) { - return ctx->kv[i].type; +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].type; } -enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.type; +enum gguf_type 
gguf_get_arr_type(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.type; } -const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.data; +const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.data; } const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); struct gguf_kv * kv = &ctx->kv[key_id]; struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; return str->data; } -int gguf_get_arr_n(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.n; +int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.n; } -uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint8; +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8); + return ctx->kv[key_id].value.uint8; } -int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int8; +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8); + return ctx->kv[key_id].value.int8; } -uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint16; +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16); + return ctx->kv[key_id].value.uint16; } -int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int16; +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16); + return ctx->kv[key_id].value.int16; } -uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint32; +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32); + return ctx->kv[key_id].value.uint32; } -int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int32; +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32); + return ctx->kv[key_id].value.int32; } -float gguf_get_val_f32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.float32; +float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32); + return ctx->kv[key_id].value.float32; } -uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint64; +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64); + return ctx->kv[key_id].value.uint64; } -int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int64; +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64); + return ctx->kv[key_id].value.int64; } -double gguf_get_val_f64(const struct gguf_context * ctx, int i) { - return 
ctx->kv[i].value.float64; +double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64); + return ctx->kv[key_id].value.float64; } -bool gguf_get_val_bool(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.bool_; +bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL); + return ctx->kv[key_id].value.bool_; } -const char * gguf_get_val_str (const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.str.data; +const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { + LM_GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); + return ctx->kv[key_id].value.str.data; } int gguf_get_n_tensors(const struct gguf_context * ctx) { @@ -20591,10 +21345,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } break; case GGUF_TYPE_ARRAY: case GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: LM_GGML_ASSERT(false && "invalid type"); - }; + } } // write tensor infos diff --git a/cpp/ggml.h b/cpp/ggml.h index 2cd0ab11..a7f679e4 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -214,8 +214,8 @@ #define LM_GGML_QNT_VERSION_FACTOR 1000 // do not change this #define LM_GGML_MAX_DIMS 4 -#define LM_GGML_MAX_NODES 4096 -#define LM_GGML_MAX_PARAMS 256 +#define LM_GGML_MAX_NODES 16384 +#define LM_GGML_MAX_PARAMS 1024 #define LM_GGML_MAX_CONTEXTS 64 #define LM_GGML_MAX_SRC 6 #define LM_GGML_MAX_NAME 64 @@ -248,6 +248,14 @@ } \ } while (0) +#ifndef NDEBUG +#define LM_GGML_UNREACHABLE() LM_GGML_ASSERT(!"statement should not be reached") +#elif defined(__GNUC__) +#define LM_GGML_UNREACHABLE() __builtin_unreachable() +#else +#define LM_GGML_UNREACHABLE() ((void) 0) +#endif + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. 
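The LM_GGML_UNREACHABLE() macro introduced just above is what lets dead return statements (such as the ones removed from linesearch_backtracking and lm_ggml_opt_lbfgs in this patch) become an explicit marker: debug builds assert, GCC/Clang release builds call __builtin_unreachable() so the optimizer may assume the point is never hit, and other compilers get a no-op. A minimal standalone version of the same pattern, with UNREACHABLE() as a hypothetical stand-in:

// Illustrative sketch only -- not part of the patch.
#include <cassert>

#ifndef NDEBUG
    #define UNREACHABLE() assert(!"statement should not be reached")
#elif defined(__GNUC__)
    #define UNREACHABLE() __builtin_unreachable()
#else
    #define UNREACHABLE() ((void) 0)
#endif

static int sign_of(int x) {
    if (x > 0) { return +1; }
    if (x < 0) { return -1; }
    if (x == 0) { return 0; }
    UNREACHABLE();  // every case returns above; this documents the invariant and
                    // silences "control reaches end of non-void function" warnings
}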
// @@ -445,6 +453,12 @@ extern "C" { LM_GGML_OBJECT_WORK_BUFFER }; + enum lm_ggml_log_level { + LM_GGML_LOG_LEVEL_ERROR = 2, + LM_GGML_LOG_LEVEL_WARN = 3, + LM_GGML_LOG_LEVEL_INFO = 4 + }; + // ggml object struct lm_ggml_object { size_t offs; @@ -467,8 +481,8 @@ extern "C" { int n_dims; int64_t ne[LM_GGML_MAX_DIMS]; // number of elements size_t nb[LM_GGML_MAX_DIMS]; // stride in bytes: - // nb[0] = sizeof(type) - // nb[1] = nb[0] * ne[0] + padding + // nb[0] = lm_ggml_type_size(type) + // nb[1] = nb[0] * (ne[0] / lm_ggml_blck_size(type)) + padding // nb[i] = nb[i-1] * ne[i-1] // compute data @@ -520,7 +534,15 @@ extern "C" { // next prime after LM_GGML_MAX_NODES // #define LM_GGML_GRAPH_HASHTABLE_SIZE 4099 // next prime after LM_GGML_MAX_NODES * 2 (nodes + leafs) - #define LM_GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define LM_GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define LM_GGML_GRAPH_HASHTABLE_SIZE 16411 + #define LM_GGML_GRAPH_HASHTABLE_SIZE 32771 + + enum lm_ggml_cgraph_eval_order { + LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + LM_GGML_CGRAPH_EVAL_ORDER_COUNT + }; // computation graph struct lm_ggml_cgraph { @@ -533,6 +555,8 @@ extern "C" { void * visited_hash_table[LM_GGML_GRAPH_HASHTABLE_SIZE]; + enum lm_ggml_cgraph_eval_order order; + // performance int perf_runs; int64_t perf_cycles; @@ -680,12 +704,21 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t value); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_f32 (struct lm_ggml_tensor * tensor, float value); + // Converts a flat index into coordinates + LM_GGML_API void lm_ggml_unravel_index(const struct lm_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + LM_GGML_API int32_t lm_ggml_get_i32_1d(const struct lm_ggml_tensor * tensor, int i); LM_GGML_API void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t value); + LM_GGML_API int32_t lm_ggml_get_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3); + LM_GGML_API void lm_ggml_set_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + LM_GGML_API float lm_ggml_get_f32_1d(const struct lm_ggml_tensor * tensor, int i); LM_GGML_API void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value); + LM_GGML_API float lm_ggml_get_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3); + LM_GGML_API void lm_ggml_set_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + LM_GGML_API void * lm_ggml_get_data (const struct lm_ggml_tensor * tensor); LM_GGML_API float * lm_ggml_get_data_f32(const struct lm_ggml_tensor * tensor); @@ -719,6 +752,12 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_add_cast( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + enum lm_ggml_type type); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_add1( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -828,6 +867,7 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); + // sums repetitions in a into shape of b LM_GGML_API struct lm_ggml_tensor * lm_ggml_repeat_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1049,7 +1089,6 @@ extern "C" { size_t nb1, size_t offset); - // a -> b, return view(b) LM_GGML_API struct lm_ggml_tensor * lm_ggml_cpy( struct lm_ggml_context * ctx, 
@@ -1072,6 +1111,33 @@ extern "C" { struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); + // make contiguous, with new shape + LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_1d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0); + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_3d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_4d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view LM_GGML_API struct lm_ggml_tensor * lm_ggml_reshape( @@ -1219,14 +1285,15 @@ extern "C" { struct lm_ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements + // if mode & 1 == 1, skip n_past elements (DEPRECATED) // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style - // TODO: avoid creating a new tensor every time + // + // b is an int32 vector with size a->ne[2], it contains the positions LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1235,7 +1302,7 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1244,7 +1311,7 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1255,7 +1322,7 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1266,7 +1333,7 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, float base, bool down); @@ -1276,7 +1343,7 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - int n_past, + struct lm_ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1656,6 +1723,16 @@ extern "C" { // dump the graph into a file using the dot format LM_GGML_API void lm_ggml_graph_dump_dot(const struct lm_ggml_cgraph * gb, const struct lm_ggml_cgraph * gf, const char * filename); + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. 
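The rope hunks above change how positions reach the kernels: instead of a scalar n_past, every lm_ggml_rope* variant now takes an int32 tensor b with one entry per token (size a->ne[2]). That is what allows one graph to rotate tokens sitting at different offsets, and it is the basis of the K-cache shift added to llm_build_llama later in this patch. As a hedged sketch of the caller's side, assuming a plain single-sequence batch (build_positions, n_past and n_tokens are illustrative names, not patch identifiers):

// Illustrative sketch only -- not part of the patch. This is the int32
// position data a caller would copy into the "b" tensor passed to rope.
#include <cstdint>
#include <vector>

static std::vector<int32_t> build_positions(int32_t n_past, int32_t n_tokens) {
    std::vector<int32_t> pos(n_tokens);
    for (int32_t j = 0; j < n_tokens; ++j) {
        pos[j] = n_past + j;   // token j of this batch sits at absolute position n_past + j
    }
    return pos;                // copied into an I32 tensor with n_tokens elements
}

For multi-sequence batches the same tensor simply carries each token's own position (batch.pos[i] in the llm_build_llama hunk further below).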
+ LM_GGML_API void lm_ggml_build_backward_gradient_checkpointing( + struct lm_ggml_context * ctx, + struct lm_ggml_cgraph * gf, + struct lm_ggml_cgraph * gb, + struct lm_ggml_cgraph * gb_tmp, + struct lm_ggml_tensor * * checkpoints, + int n_checkpoints); // // optimization // @@ -1690,7 +1767,8 @@ extern "C" { LM_GGML_LINESEARCH_INVALID_PARAMETERS, }; - typedef void (*lm_ggml_opt_callback)(void * data, float * sched); + typedef void (*lm_ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); + typedef void (*lm_ggml_log_callback)(enum lm_ggml_log_level level, const char * text, void * user_data); // optimization parameters // @@ -1721,6 +1799,8 @@ extern "C" { bool print_forward_graph; bool print_backward_graph; + int n_gradient_accumulation; + // ADAM parameters struct { int n_iter; @@ -1766,6 +1846,7 @@ extern "C" { float loss_after; struct { + struct lm_ggml_tensor * g; // current gradient struct lm_ggml_tensor * m; // first moment struct lm_ggml_tensor * v; // second moment struct lm_ggml_tensor * pf; // past function values @@ -1882,26 +1963,26 @@ extern "C" { LM_GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); LM_GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); - LM_GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i); - - LM_GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); - LM_GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); - - // results are undefined if the wrong type is used for the key - LM_GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); - LM_GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); - LM_GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); - LM_GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); - LM_GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); - LM_GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); - LM_GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); - LM_GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); - LM_GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); - LM_GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); - LM_GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); - LM_GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); - LM_GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); - LM_GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + LM_GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id); + + LM_GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id); + LM_GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id); + + // will abort if the wrong type is used for the key + LM_GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id); + LM_GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id); + LM_GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id); + LM_GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id); + LM_GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id); + LM_GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int 
key_id); + LM_GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id); + LM_GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id); + LM_GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id); + LM_GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); + LM_GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); + LM_GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + LM_GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); + LM_GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); LM_GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); LM_GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 3eeb0cc0..07437df6 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -72,6 +72,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -92,12 +93,12 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (llama_log_level level, const char* format, ...); -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); +static void llama_log_internal (lm_ggml_log_level level, const char* format, ...); +static void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data); -#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) -#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) -#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) 
llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) // // helpers @@ -166,13 +167,13 @@ enum llm_arch { }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, }; @@ -221,16 +222,16 @@ enum llm_kv { }; static std::map LLM_KV_NAMES = { - { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, - { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, - { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, - { LLM_KV_GENERAL_NAME, "general.name" }, - { LLM_KV_GENERAL_AUTHOR, "general.author" }, - { LLM_KV_GENERAL_URL, "general.url" }, - { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, - { LLM_KV_GENERAL_LICENSE, "general.license" }, - { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" }, - { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" }, + { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, + { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, + { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_NAME, "general.name" }, + { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_URL, "general.url" }, + { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, + { LLM_KV_GENERAL_LICENSE, "general.license" }, + { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, + { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, @@ -448,7 +449,7 @@ struct LLM_TN { // #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ +do { \ const std::string skey(key); \ const int kid = gguf_find_key(ctx, skey.c_str()); \ if (kid >= 0) { \ @@ -460,7 +461,7 @@ struct LLM_TN { } else if (req) { \ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ } \ -} +} while (0) // // ggml helpers @@ -886,10 +887,10 @@ static void llama_nop(struct lm_ggml_tensor * tensor) { // don't offload by defa static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); LM_GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -904,7 +905,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to struct llama_state { // We save the log callback globally - llama_log_callback log_callback = llama_log_callback_default; + lm_ggml_log_callback log_callback = llama_log_callback_default; void * log_callback_user_data = nullptr; }; @@ -929,23 +930,22 @@ static const size_t kB = 1024; static const size_t MB = kB*kB; static const size_t GB = kB*kB*kB; -// default hparams (LLaMA 7B) struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx_train = 2048; // 
the context size used during training - uint32_t n_ctx = 512; // the context size used during inference - uint32_t n_embd = 4096; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_ff = 11008; - - float f_norm_eps = 1e-5; - float f_norm_rms_eps = 1e-5; - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; bool operator!=(const llama_hparams & other) const { return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT @@ -962,15 +962,18 @@ struct llama_hparams { uint32_t n_embd_gqa() const { return n_embd/n_gqa(); } +}; - size_t kv_size() const { - size_t result = 2ull; - result *= (size_t) n_embd_gqa(); - result *= (size_t) n_ctx; - result *= (size_t) n_layer; - result *= sizeof(lm_ggml_fp16_t); - return result; - } +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + bool mul_mat_q; }; struct llama_layer { @@ -1005,7 +1008,29 @@ struct llama_layer { struct lm_ggml_tensor * b3; // ffn_up }; +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +// ring-buffer of cached KV data struct llama_kv_cache { + bool has_shift = false; + + uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + struct lm_ggml_tensor * k = NULL; struct lm_ggml_tensor * v = NULL; @@ -1013,8 +1038,6 @@ struct llama_kv_cache { llama_buffer buf; - int n; // number of tokens currently in the cache - ~llama_kv_cache() { if (ctx) { lm_ggml_free(ctx); @@ -1076,7 +1099,7 @@ struct llama_model { std::string name = "n/a"; - llama_hparams hparams; + llama_hparams hparams = {}; llama_vocab vocab; struct lm_ggml_tensor * tok_embeddings; @@ -1128,11 +1151,8 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - if (model_owner) { - delete &model; - } #ifdef LM_GGML_USE_METAL if (ctx_metal) { lm_ggml_metal_free(ctx_metal); @@ -1143,27 +1163,26 @@ struct llama_context { } } + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + std::mt19937 rng; bool has_evaluated_once = false; + int64_t t_start_us; + int64_t t_load_us; int64_t t_sample_us = 0; - int64_t t_eval_us = 0; int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; int32_t n_sample = 0; // number of tokens sampled - int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - - const llama_model & model; - - bool model_owner = false; - - int64_t t_load_us; - int64_t t_start_us; - - // key + value cache for the self 
attention - struct llama_kv_cache kv_self; + int32_t n_eval = 0; // number of eval calls // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -1198,16 +1217,23 @@ static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, lm_ggml_type wtype, - int n_ctx, + uint32_t n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; + const uint32_t n_embd = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; + cache.has_shift = false; + + cache.head = 0; + cache.size = n_ctx; + + cache.cells.clear(); + cache.cells.resize(n_ctx); + cache.buf.resize(2u*n_elements*lm_ggml_type_size(wtype) + 2u*MB); - cache.n = 0; struct lm_ggml_init_params params; params.mem_size = cache.buf.size; @@ -1228,17 +1254,154 @@ static bool llama_kv_cache_init( (void) n_gpu_layers; #ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer + 1) { + size_t vram_kv_cache = 0; + + if (n_gpu_layers > (int)n_layer + 1) { lm_ggml_cuda_assign_buffers_no_scratch(cache.v); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.v); } - if (n_gpu_layers > n_layer + 2) { + if (n_gpu_layers > (int)n_layer + 2) { lm_ggml_cuda_assign_buffers_no_scratch(cache.k); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.k); + } + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } #endif // LM_GGML_USE_CUBLAS return true; } +// find an empty slot of size "n_tokens" in the cache +// updates the cache head +static bool llama_kv_cache_find_slot( + struct llama_kv_cache & cache, + const struct llama_batch & batch) { + const uint32_t n_ctx = cache.size; + const uint32_t n_tokens = batch.n_tokens; + + if (n_tokens > n_ctx) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + return false; + } + + uint32_t n_tested = 0; + + while (true) { + if (cache.head + n_tokens > n_ctx) { + cache.head = 0; + n_tested += n_ctx - cache.head; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cache.cells[cache.head + i].pos >= 0) { + found = false; + cache.head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= n_ctx) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return false; + } + } + + for (uint32_t i = 0; i < n_tokens; i++) { + cache.cells[cache.head + i].pos = batch.pos[i]; + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]); + } + + return true; +} + +// find how many cells are currently in use +static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { + for (uint32_t i = cache.size - 1; i > 0; --i) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + return i + 1; + } + } + + return 0; +} + +static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { + if (c0 < 0) c0 = 0; + if (c1 < 0) c1 = cache.size; + + for (int32_t i = c0; i < c1; ++i) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } +} + +static void llama_kv_cache_seq_rm( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && 
cache.cells[i].pos < p1) { + cache.cells[i].seq_id.erase(seq_id); + if (cache.cells[i].seq_id.empty()) { + cache.cells[i].pos = -1; + } + } + } +} + +static void llama_kv_cache_seq_cp( + struct llama_kv_cache & cache, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].seq_id.insert(seq_id_dst); + } + } +} + +static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (!cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } + } +} + +static void llama_kv_cache_seq_shift( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].pos += delta; + if (cache.cells[i].pos < 0) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } else { + cache.has_shift = true; + cache.cells[i].delta = delta; + } + } + } +} + // // model loading and saving // @@ -1560,7 +1723,7 @@ struct llama_model_loader { lmlock->grow_to(size_lock); } break; -#if defined(LM_GGML_USE_CUBLAS) +#ifdef LM_GGML_USE_CUBLAS case LM_GGML_BACKEND_GPU: case LM_GGML_BACKEND_GPU_SPLIT: // old code: @@ -1593,7 +1756,15 @@ struct llama_model_loader { // load LLaMA models // -static std::string llama_model_ftype_name(enum llama_ftype ftype) { +static std::string llama_model_arch_name(llm_arch arch) { + auto it = LLM_ARCH_NAMES.find(arch); + if (it == LLM_ARCH_NAMES.end()) { + return "unknown"; + } + return it->second; +} + +static std::string llama_model_ftype_name(llama_ftype ftype) { if (ftype & LLAMA_FTYPE_GUESSED) { return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; } @@ -1649,10 +1820,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { static void llm_load_hparams( llama_model_loader & ml, - llama_model & model, - int n_ctx, - float rope_freq_base, - float rope_freq_scale) { + llama_model & model) { struct gguf_context * ctx = ml.ctx_gguf; const auto kv = LLM_KV(model.arch); @@ -1663,40 +1831,25 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); // get hparams kv - GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); - GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); + GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, 
hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - // TODO: manually setting rope freq base and scale should override this - // FIXME: partial fix when the param specified is not the default value, but - // will not work for overriding the model value to the params default - - llama_context_params defaults = llama_context_default_params(); - - // rope_freq_base - { - float ropebase = 10000.0f; - GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) { - rope_freq_base = ropebase; - } - } + // rope_freq_base (optional) + hparams.rope_freq_base_train = 10000.0f; + GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); // rope_freq_scale (inverse of the kv) is optional - { - float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) { - rope_freq_scale = 1.0f/ropescale; - } - } + float ropescale = 1.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + hparams.rope_freq_scale_train = 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -1760,13 +1913,9 @@ static void llm_load_hparams( } } break; default: (void)0; - }; + } model.ftype = ml.ftype; - - hparams.n_ctx = n_ctx; - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; } // TODO: This should probably be in llama.h @@ -1787,20 +1936,18 @@ static void llm_load_vocab( throw std::runtime_error("cannot find tokenizer vocab in model file\n"); } + const float * scores = nullptr; const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx == -1) { - throw std::runtime_error("cannot find tokenizer scores in model file\n"); + if (score_idx != -1) { + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); } - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - + const int * toktypes = nullptr; const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx == -1) { - throw std::runtime_error("cannot find token type list in GGUF file\n"); + if (toktype_idx != -1) { + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } - const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - // determine vocab type { std::string tokenizer_name; @@ -1868,8 +2015,8 @@ static void llm_load_vocab( auto & token_data = vocab.id_to_token[i]; token_data.text = std::move(word); - token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; + token_data.score = scores ? scores[i] : 0.0f; + token_data.type = toktypes ? 
(llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; } // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' @@ -1892,31 +2039,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & vocab = model.vocab; // hparams - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); - LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); - LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); - LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); - LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); - LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); - LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); + LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); + LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); + LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); if (ml.n_bytes < GB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { - LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } // general kv @@ -1934,13 +2080,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { static void llm_load_tensors( llama_model_loader & ml, llama_model & model, - int n_batch, int n_gpu_layers, int main_gpu, const float * tensor_split, - const bool mul_mat_q, - bool low_vram, - lm_ggml_type memory_type, bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -1979,11 +2121,9 @@ static void llm_load_tensors( } (void) main_gpu; - (void) mul_mat_q; -#if defined(LM_GGML_USE_CUBLAS) +#ifdef LM_GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); lm_ggml_cuda_set_main_device(main_gpu); - lm_ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_GPU_SPLIT #elif defined(LM_GGML_USE_CLBLAST) @@ -2018,9 +2158,9 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2084,9 +2224,9 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2154,9 +2294,9 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2231,9 +2371,9 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; @@ -2298,27 +2438,19 @@ static void llm_load_tensors( } break; default: throw std::runtime_error("unknown architecture"); - }; + } } ml.done_getting_tensors(); // print memory requirements { - const size_t scale = memory_type == LM_GGML_TYPE_F32 ? 2 : 1; - // this is the total memory required to run the inference size_t mem_required = ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory - // this is the memory required by one llama_state - const size_t mem_required_state = scale*hparams.kv_size(); - - LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - - (void) n_batch; + LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); #if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -2327,36 +2459,17 @@ static void llm_load_tensors( if (n_gpu_layers > (int) hparams.n_layer) { LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } - size_t vram_kv_cache = 0; #ifdef LM_GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = low_vram ? 
hparams.n_layer + 1 : hparams.n_layer + 3; - if (n_gpu_layers > (int) hparams.n_layer + 1) { - if (low_vram) { - LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } - if (n_gpu_layers > (int) hparams.n_layer + 2) { - if (low_vram) { - LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); - } else { - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; - } - } + const int max_offloadable_layers = hparams.n_layer + 3; #elif defined(LM_GGML_USE_CLBLAST) const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; #endif // LM_GGML_USE_CUBLAS - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", - __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n", - __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0); #else (void) n_gpu_layers; #endif // defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) @@ -2369,7 +2482,7 @@ static void llm_load_tensors( } (void) tensor_split; -#if defined(LM_GGML_USE_CUBLAS) +#ifdef LM_GGML_USE_CUBLAS { lm_ggml_cuda_set_tensor_split(tensor_split); } @@ -2391,29 +2504,24 @@ static void llm_load_tensors( static bool llama_model_load( const std::string & fname, llama_model & model, - int n_ctx, - int n_batch, int n_gpu_layers, int main_gpu, const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - bool low_vram, - lm_ggml_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); + llama_model_loader ml(fname, use_mmap); - llm_load_arch (*ml, model); - llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale); - llm_load_vocab (*ml, model); + model.hparams.vocab_only = vocab_only; - llm_load_print_meta(*ml, model); + llm_load_arch (ml, model); + llm_load_hparams(ml, model); + llm_load_vocab (ml, model); + + llm_load_print_meta(ml, model); if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { throw std::runtime_error("vocab size mismatch"); @@ -2425,8 +2533,8 @@ static bool llama_model_load( } llm_load_tensors( - *ml, model, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, + ml, model, n_gpu_layers, + main_gpu, tensor_split, use_mlock, progress_callback, progress_callback_user_data); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); @@ -2438,17 +2546,10 @@ static bool llama_model_load( static struct lm_ggml_cgraph * llm_build_llama( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - LM_GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -2456,7 +2557,7 @@ static struct 
lm_ggml_cgraph * llm_build_llama( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); @@ -2464,12 +2565,20 @@ static struct lm_ggml_cgraph * llm_build_llama( LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + + const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + + //printf("n_kv = %d\n", n_kv); + auto & buf_compute = lctx.buf_compute; struct lm_ggml_init_params params = { @@ -2487,12 +2596,12 @@ static struct lm_ggml_cgraph * llm_build_llama( struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - if (tokens) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, N); + if (batch.token) { + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*lm_ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); } lm_ggml_set_name(inp_tokens, "inp_tokens"); @@ -2502,11 +2611,11 @@ static struct lm_ggml_cgraph * llm_build_llama( LM_GGML_ASSERT(false && "not implemented"); #endif - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N); + inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inpL); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * lm_ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); } } @@ -2515,9 +2624,6 @@ static struct lm_ggml_cgraph * llm_build_llama( // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case lm_ggml_cuda_assign_buffers has no effect offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; @@ -2534,12 +2640,75 @@ static struct lm_ggml_cgraph * llm_build_llama( } #endif // LM_GGML_USE_CUBLAS + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); + } + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + lm_ggml_set_name(KQ_mask, "KQ_mask"); + 
lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, lm_ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + // KQ_pos - contains the positions + struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); + lm_ggml_set_name(KQ_pos, "KQ_pos"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + // shift the entire K-cache if needed + if (do_rope_shift) { + struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); + lm_ggml_set_name(K_shift, "K_shift"); + lm_ggml_allocr_alloc(lctx.alloc, K_shift); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * tmp = + lm_ggml_rope_custom_inplace(ctx0, + lm_ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + lm_ggml_element_size(kv_self.k)*n_embd_head, + lm_ggml_element_size(kv_self.k)*n_embd_gqa, + lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(tmp); + lm_ggml_build_forward_expand(gf, tmp); + } } - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); for (int il = 0; il < n_layer; ++il) { lm_ggml_format_name(inpL, "layer_inp_%d", il); @@ -2577,33 +2746,33 @@ static struct lm_ggml_cgraph * llm_build_llama( offload_func_kq(tmpq); lm_ggml_set_name(tmpq, "tmpq"); - struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom_inplace(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); lm_ggml_set_name(Kcur, "Kcur"); - struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom_inplace(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); lm_ggml_set_name(Qcur, "Qcur"); // store key and value to memory { - // compute the transposed [N, n_embd] V matrix + // compute the transposed [n_tokens, n_embd] V matrix struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); offload_func_v(tmpv); lm_ggml_set_name(tmpv, "tmpv"); - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); offload_func_v(Vcur); lm_ggml_set_name(Vcur, "Vcur"); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, 
(lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); lm_ggml_set_name(k, "k"); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + n_past*lm_ggml_element_size(kv_self.v)); + (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); offload_func_v(v); lm_ggml_set_name(v, "v"); @@ -2618,7 +2787,7 @@ static struct lm_ggml_cgraph * llm_build_llama( struct lm_ggml_tensor * K = lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_kv, n_head_kv, lm_ggml_element_size(kv_self.k)*n_embd_gqa, lm_ggml_element_size(kv_self.k)*n_embd_head, lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -2631,25 +2800,25 @@ static struct lm_ggml_cgraph * llm_build_llama( lm_ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); + // KQ_scaled shape [n_kv, n_tokens, n_head, 1] + struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); lm_ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_masked = lm_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); offload_func_kq(KQ_masked); lm_ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); + struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct lm_ggml_tensor * V = lm_ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, lm_ggml_element_size(kv_self.v)*n_ctx, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -2664,7 +2833,7 @@ static struct lm_ggml_cgraph * llm_build_llama( // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? 
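[Editor's note, illustrative only — not part of the patch] The hunks above replace the old lm_ggml_diag_mask_inf(..., n_past) causal mask with an explicit additive KQ_mask built from each batch token's (pos, seq_id) pair and the KV cells. A minimal stand-alone sketch of that masking rule, with hypothetical names, for the single mask plane that is later broadcast across heads:

#include <cmath>
#include <vector>

// cells a token may not attend to (wrong sequence, or a future position)
// get -INFINITY added to the scaled KQ scores; soft_max then gives them zero weight
static std::vector<float> build_kq_mask(int n_kv, int n_tokens,
                                        const std::vector<int>  & tok_pos,      // positions of the new tokens
                                        const std::vector<int>  & cell_pos,     // positions stored in the KV cells
                                        const std::vector<bool> & cell_in_seq)  // does the cell belong to the token's seq_id
{
    std::vector<float> mask(n_kv * n_tokens, 0.0f);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            if (!cell_in_seq[i] || cell_pos[i] > tok_pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }
    return mask;
}

The graph then applies this mask with lm_ggml_add(KQ_scaled, KQ_mask) rather than the in-place diag-mask op, which is what lets several sequences share one KV cache.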
- struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V_cont, KQ_soft_max); #endif @@ -2673,10 +2842,8 @@ static struct lm_ggml_cgraph * llm_build_llama( offload_func_v(KQV_merged); lm_ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = lm_ggml_cpy(ctx0, - KQV_merged, - lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N)); + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); lm_ggml_set_name(cur, "KQV_merged_contiguous"); @@ -2767,20 +2934,12 @@ static struct lm_ggml_cgraph * llm_build_llama( return gf; } - static struct lm_ggml_cgraph * llm_build_baichaun( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - LM_GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -2788,7 +2947,7 @@ static struct lm_ggml_cgraph * llm_build_baichaun( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); @@ -2796,12 +2955,18 @@ static struct lm_ggml_cgraph * llm_build_baichaun( LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; + + const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + auto & buf_compute = lctx.buf_compute; struct lm_ggml_init_params params = { @@ -2819,12 +2984,12 @@ static struct lm_ggml_cgraph * llm_build_baichaun( struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - if (tokens) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, N); + if (batch.token) { + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*lm_ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); } lm_ggml_set_name(inp_tokens, "inp_tokens"); @@ -2834,11 +2999,11 @@ static struct lm_ggml_cgraph * llm_build_baichaun( LM_GGML_ASSERT(false && "not implemented"); #endif - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N); + inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inpL); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * lm_ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); } } @@ -2847,9 +3012,6 @@ static struct lm_ggml_cgraph * llm_build_baichaun( // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case lm_ggml_cuda_assign_buffers has no effect offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; @@ -2866,12 +3028,75 @@ static struct lm_ggml_cgraph * llm_build_baichaun( } #endif // LM_GGML_USE_CUBLAS + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + lm_ggml_set_name(KQ_mask, "KQ_mask"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, lm_ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + // KQ_pos - contains the positions + struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); + lm_ggml_set_name(KQ_pos, "KQ_pos"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + // shift the entire K-cache if needed + if 
(do_rope_shift) { + struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); + lm_ggml_set_name(K_shift, "K_shift"); + lm_ggml_allocr_alloc(lctx.alloc, K_shift); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * tmp = + lm_ggml_rope_custom_inplace(ctx0, + lm_ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + lm_ggml_element_size(kv_self.k)*n_embd_head, + lm_ggml_element_size(kv_self.k)*n_embd_gqa, + lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(tmp); + lm_ggml_build_forward_expand(gf, tmp); + } + } for (int il = 0; il < n_layer; ++il) { lm_ggml_format_name(inpL, "layer_inp_%d", il); @@ -2913,12 +3138,12 @@ static struct lm_ggml_cgraph * llm_build_baichaun( struct lm_ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = lm_ggml_rope_custom_inplace(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = lm_ggml_rope_custom_inplace(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = lm_ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); - Qcur = lm_ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); + Kcur = lm_ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); + Qcur = lm_ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); break; default: LM_GGML_ASSERT(false); @@ -2932,23 +3157,23 @@ static struct lm_ggml_cgraph * llm_build_baichaun( // store key and value to memory { - // compute the transposed [N, n_embd] V matrix + // compute the transposed [n_tokens, n_embd] V matrix struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); offload_func_v(tmpv); lm_ggml_set_name(tmpv, "tmpv"); - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); offload_func_v(Vcur); lm_ggml_set_name(Vcur, "Vcur"); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); lm_ggml_set_name(k, "k"); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + n_past*lm_ggml_element_size(kv_self.v)); + (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); offload_func_v(v); lm_ggml_set_name(v, "v"); @@ -2963,7 +3188,7 @@ static struct lm_ggml_cgraph * llm_build_baichaun( struct lm_ggml_tensor * K = lm_ggml_view_3d(ctx0, 
kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_kv, n_head_kv, lm_ggml_element_size(kv_self.k)*n_embd_gqa, lm_ggml_element_size(kv_self.k)*n_embd_head, lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -2976,8 +3201,8 @@ static struct lm_ggml_cgraph * llm_build_baichaun( lm_ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); + // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] + struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); lm_ggml_set_name(KQ_scaled, "KQ_scaled"); @@ -2986,58 +3211,44 @@ static struct lm_ggml_cgraph * llm_build_baichaun( switch (model.type) { case MODEL_7B: - KQ_masked = lm_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); break; case MODEL_13B: - KQ_scaled_alibi =lm_ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); + // TODO: replace with lm_ggml_add() + KQ_scaled_alibi = lm_ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = lm_ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; default: LM_GGML_ASSERT(false); } - // KQ_masked = mask_past(KQ_scaled) - // struct lm_ggml_tensor * KQ_masked = lm_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - // struct lm_ggml_tensor * KQ_masked = lm_ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - // offload_func_kq(KQ_masked); - // lm_ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); + struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct lm_ggml_tensor * V = lm_ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, lm_ggml_element_size(kv_self.v)*n_ctx, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); offload_func_v(V); lm_ggml_set_name(V, "V"); -#if 1 struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); offload_func_v(KQV); lm_ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
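[Editor's note, illustrative only — not part of the patch] The do_rope_shift block repeated in these builders re-applies RoPE to the cached K tensors using each cell's accumulated delta, so cache entries can be moved to new logical positions without being recomputed. A toy sketch of the bookkeeping behind kv_self.cells[i].delta, with hypothetical types and names:

#include <vector>

struct kv_cell_sketch { int pos; int delta; };

// shift all cells whose position lies in [p0, p1) by `delta`;
// `delta` is remembered so the K_shift pass in the graph can re-rope the cached keys
static void shift_cells(std::vector<kv_cell_sketch> & cells, int p0, int p1, int delta) {
    for (auto & c : cells) {
        if (c.pos >= p0 && c.pos < p1) {
            c.pos   += delta;
            c.delta += delta;
        }
    }
}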
- struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif // KQV_merged = KQV.permute(0, 2, 1, 3) struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); offload_func_v(KQV_merged); lm_ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = lm_ggml_cpy(ctx0, - KQV_merged, - lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N)); + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); lm_ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3130,17 +3341,10 @@ static struct lm_ggml_cgraph * llm_build_baichaun( static struct lm_ggml_cgraph * llm_build_falcon( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - LM_GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -3148,7 +3352,7 @@ static struct lm_ggml_cgraph * llm_build_falcon( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); @@ -3156,12 +3360,21 @@ static struct lm_ggml_cgraph * llm_build_falcon( LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; + + const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + + //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", + // kv_head, n_kv, n_tokens, n_ctx, lm_ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); + auto & buf_compute = lctx.buf_compute; struct lm_ggml_init_params params = { @@ -3179,12 +3392,12 @@ static struct lm_ggml_cgraph * llm_build_falcon( struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; - if (tokens) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, N); + if (batch.token) { + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*lm_ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); } lm_ggml_set_name(inp_tokens, "inp_tokens"); @@ -3194,11 +3407,11 @@ static struct lm_ggml_cgraph * llm_build_falcon( LM_GGML_ASSERT(false && "not implemented"); #endif - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N); + inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inpL); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * lm_ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); } } @@ -3207,9 +3420,6 @@ static struct lm_ggml_cgraph * llm_build_falcon( // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case lm_ggml_cuda_assign_buffers has no effect offload_func_t offload_func_nr = llama_nop; // nr = non-repeating offload_func_t offload_func_kq = llama_nop; offload_func_t offload_func_v = llama_nop; @@ -3226,12 +3436,75 @@ static struct lm_ggml_cgraph * llm_build_falcon( } #endif // LM_GGML_USE_CUBLAS + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + lm_ggml_set_name(KQ_mask, "KQ_mask"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, lm_ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + // KQ_pos - contains the positions + struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); + lm_ggml_set_name(KQ_pos, "KQ_pos"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); + if 
(!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + // shift the entire K-cache if needed + if (do_rope_shift) { + struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); + lm_ggml_set_name(K_shift, "K_shift"); + lm_ggml_allocr_alloc(lctx.alloc, K_shift); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * tmp = + lm_ggml_rope_custom_inplace(ctx0, + lm_ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + lm_ggml_element_size(kv_self.k)*n_embd_head, + lm_ggml_element_size(kv_self.k)*n_embd_gqa, + lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(tmp); + lm_ggml_build_forward_expand(gf, tmp); + } + } for (int il = 0; il < n_layer; ++il) { struct lm_ggml_tensor * attn_norm; @@ -3288,45 +3561,45 @@ static struct lm_ggml_cgraph * llm_build_falcon( // TODO: these 2 lm_ggml_conts are technically not needed, but we add them until CUDA support for // non-contiguous views is added for the rope operator struct lm_ggml_tensor * tmpq = lm_ggml_cont(ctx0, lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, + ctx0, cur, n_embd_head, n_head, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); offload_func_kq(tmpq); struct lm_ggml_tensor * tmpk = lm_ggml_cont(ctx0, lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, + ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); offload_func_kq(tmpk); struct lm_ggml_tensor * tmpv = lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, + ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); offload_func_v(tmpv); // using mode = 2 for neox mode - struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Qcur); - struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Kcur); { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); offload_func_v(Vcur); offload_func_v(Vcur->src[0]->src[0]); lm_ggml_set_name(Vcur, "Vcur"); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); lm_ggml_set_name(k, "k"); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( 
n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + n_past*lm_ggml_element_size(kv_self.v)); + (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); offload_func_v(v); lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); @@ -3339,7 +3612,7 @@ static struct lm_ggml_cgraph * llm_build_falcon( struct lm_ggml_tensor * K = lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_kv, n_head_kv, lm_ggml_element_size(kv_self.k)*n_embd_gqa, lm_ggml_element_size(kv_self.k)*n_embd_head, lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3350,21 +3623,21 @@ static struct lm_ggml_cgraph * llm_build_falcon( offload_func_kq(KQ); lm_ggml_set_name(KQ, "KQ"); - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); + struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - struct lm_ggml_tensor * KQ_masked = lm_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); offload_func_kq(KQ_masked); lm_ggml_set_name(KQ_masked, "KQ_masked"); - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); + struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); struct lm_ggml_tensor * V = lm_ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, lm_ggml_element_size(kv_self.v)*n_ctx, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3379,7 +3652,7 @@ static struct lm_ggml_cgraph * llm_build_falcon( offload_func_v(KQV_merged); lm_ggml_set_name(KQV_merged, "KQV_merged"); - cur = lm_ggml_cpy(ctx0, KQV_merged, lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N)); + cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); lm_ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3437,17 +3710,10 @@ static struct lm_ggml_cgraph * llm_build_falcon( static struct lm_ggml_cgraph * llm_build_starcoder( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - LM_GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -3455,7 +3721,7 @@ static struct lm_ggml_cgraph * llm_build_starcoder( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); @@ -3463,7 +3729,11 @@ static struct lm_ggml_cgraph * llm_build_starcoder( LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - const float norm_eps = hparams.f_norm_eps; + const float norm_eps = hparams.f_norm_eps; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -3484,12 +3754,12 @@ static struct lm_ggml_cgraph * llm_build_starcoder( struct lm_ggml_tensor * position; struct lm_ggml_tensor * inpL; - if (tokens) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, N); + if (batch.token) { + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*lm_ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); } lm_ggml_set_name(inp_tokens, "inp_tokens"); @@ -3499,21 +3769,21 @@ static struct lm_ggml_cgraph * llm_build_starcoder( LM_GGML_ASSERT(false && "not implemented"); #endif - token = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N); + token = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, token); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, embd, N * n_embd * lm_ggml_element_size(token)); + memcpy(token->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(token)); } } { // Compute position embeddings. - struct lm_ggml_tensor * inp_positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, N); + struct lm_ggml_tensor * inp_positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); lm_ggml_allocr_alloc(lctx.alloc, inp_positions); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) inp_positions->data)[i] = n_past + i; + for (int i = 0; i < n_tokens; ++i) { + ((int32_t *) inp_positions->data)[i] = batch.pos[i]; } } lm_ggml_set_name(inp_positions, "inp_positions"); @@ -3521,12 +3791,35 @@ static struct lm_ggml_cgraph * llm_build_starcoder( position = lm_ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); } + // KQ_scale struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!lm_ggml_allocr_is_measure(lctx.alloc)) { lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + lm_ggml_set_name(KQ_mask, "KQ_mask"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, lm_ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } inpL = lm_ggml_add(ctx0, token, position); lm_ggml_set_name(inpL, "inpL"); @@ -3542,23 +3835,23 @@ static struct lm_ggml_cgraph * llm_build_starcoder( // Self Attention cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - struct lm_ggml_tensor * tmpq = lm_ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct lm_ggml_tensor * tmpk = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd); - struct lm_ggml_tensor * 
tmpv = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + struct lm_ggml_tensor * tmpq = lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); + struct lm_ggml_tensor * tmpk = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); + struct lm_ggml_tensor * tmpv = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); struct lm_ggml_tensor * Qcur = tmpq; struct lm_ggml_tensor * Kcur = tmpk; { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); lm_ggml_set_name(Vcur, "Vcur"); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); lm_ggml_set_name(k, "k"); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + n_past*lm_ggml_element_size(kv_self.v)); + (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); @@ -3568,13 +3861,13 @@ static struct lm_ggml_cgraph * llm_build_starcoder( lm_ggml_permute(ctx0, lm_ggml_cpy(ctx0, Qcur, - lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_embd_head, n_head, N)), + lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), 0, 2, 1, 3); lm_ggml_set_name(Q, "Q"); struct lm_ggml_tensor * K = lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_kv, n_head_kv, lm_ggml_element_size(kv_self.k)*n_embd_gqa, lm_ggml_element_size(kv_self.k)*n_embd_head, lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3585,12 +3878,12 @@ static struct lm_ggml_cgraph * llm_build_starcoder( lm_ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] + // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); lm_ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_masked = lm_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); lm_ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) @@ -3600,7 +3893,7 @@ static struct lm_ggml_cgraph * llm_build_starcoder( // split cached V into n_head heads struct lm_ggml_tensor * V = lm_ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, lm_ggml_element_size(kv_self.v)*n_ctx, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3613,10 +3906,8 @@ static struct lm_ggml_cgraph * llm_build_starcoder( struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); lm_ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) - 
cur = lm_ggml_cpy(ctx0, - KQV_merged, - lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, N)); + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); lm_ggml_set_name(cur, "KQV_merged_contiguous"); } @@ -3666,10 +3957,7 @@ static struct lm_ggml_cgraph * llm_build_starcoder( static struct lm_ggml_cgraph * llama_build_graph( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { + const llama_batch & batch) { const auto & model = lctx.model; struct lm_ggml_cgraph * result = NULL; @@ -3677,76 +3965,117 @@ static struct lm_ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_llama(lctx, batch); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_baichaun(lctx, batch); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_falcon(lctx, batch); } break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_starcoder(lctx, batch); } break; default: LM_GGML_ASSERT(false); - }; + } return result; } -// evaluate the transformer +// decode a batch of tokens by evaluating the transformer // // - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far +// - batch: batch to evaluate // - n_threads: number of threads to use // -static bool llama_eval_internal( +// return 0 on success +// return positive int on warning +// return negative int on error +// +static int llama_decode_internal( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { + llama_batch batch) { + const uint32_t n_tokens = batch.n_tokens; - LM_GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + if (n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + return -1; + } - LM_GGML_ASSERT(n_tokens > 0); - LM_GGML_ASSERT(n_past >= 0); - // TODO: keep the values of n_batch and n_ctx - // LM_GGML_ASSERT(n_tokens <= n_batch); - // LM_GGML_ASSERT(n_past + n_tokens <= n_ctx); + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto n_batch = cparams.n_batch; + + LM_GGML_ASSERT(n_tokens <= n_batch); + + int n_threads = n_tokens == 1 ? 
cparams.n_threads : cparams.n_threads_batch; + LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT const int64_t t_start_us = lm_ggml_time_us(); #ifdef LM_GGML_USE_MPI - lm_ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); + // TODO: needs fix after #3228 + LM_GGML_ASSERT(false && "not implemented"); + //lm_ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif LM_GGML_ASSERT(n_threads > 0); - const int N = n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; + auto & kv_self = lctx.kv_self; LM_GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + // helpers for smoother batch API transistion + // after deprecating the llama_eval calls, these will be removed + std::vector pos; + std::vector seq_id; + + if (batch.pos == nullptr) { + pos.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = batch.all_pos_0 + i*batch.all_pos_1; + } + + batch.pos = pos.data(); + } + + if (batch.seq_id == nullptr) { + seq_id.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + seq_id[i] = batch.all_seq_id; + } + + batch.seq_id = seq_id.data(); + } + + // we always start to search for a free slot from the start of the cache + // TODO: better strategies can be implemented + kv_self.head = 0; + + if (!llama_kv_cache_find_slot(kv_self, batch)) { + return 1; + } + + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + //kv_self.n = std::max(32, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? + kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); + + //printf("kv_self.n = %d\n", kv_self.n); + lm_ggml_allocr_reset(lctx.alloc); - lm_ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + lm_ggml_cgraph * gf = llama_build_graph(lctx, batch); lm_ggml_allocr_alloc_graph(lctx.alloc, gf); @@ -3755,6 +4084,7 @@ static bool llama_eval_internal( lm_ggml_tensor * node = gf->leafs[i]; if (node->backend == LM_GGML_BACKEND_GPU && node->extra == NULL) { lm_ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + lm_ggml_cuda_copy_to_device(node); } } @@ -3764,6 +4094,8 @@ static bool llama_eval_internal( lm_ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); } } + + lm_ggml_cuda_set_mul_mat_q(cparams.mul_mat_q); #endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -3773,10 +4105,19 @@ static bool llama_eval_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. need a better solution - if (N >= 32 && lm_ggml_cpu_has_blas() && !lm_ggml_cpu_has_gpublas()) { + if (n_tokens >= 32 && lm_ggml_cpu_has_blas() && !lm_ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } + // If all tensors can be run on the GPU then using more than 1 thread is detrimental. 
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + model.arch == LLM_ARCH_BAICHUAN || + model.arch == LLM_ARCH_FALCON; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; + if (lm_ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + n_threads = 1; + } + struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct lm_ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; @@ -3803,12 +4144,9 @@ static bool llama_eval_internal( lm_ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif - // update kv token count - lctx.kv_self.n = n_past + N; - - if (cgraph_fname) { - lm_ggml_graph_export(gf, cgraph_fname); - } + // update the kv ring buffer + lctx.kv_self.head += n_tokens; + lctx.kv_self.has_shift = false; #ifdef LM_GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -3825,13 +4163,20 @@ static bool llama_eval_internal( { auto & logits_out = lctx.logits; - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) lm_ggml_get_data(res), sizeof(float)*n_vocab*N); + if (batch.logits) { + logits_out.resize(n_vocab * n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + memcpy(logits_out.data() + (n_vocab*i), (float *) lm_ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + } + } else if (lctx.logits_all) { + logits_out.resize(n_vocab * n_tokens); + memcpy(logits_out.data(), (float *) lm_ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); } else { - // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) lm_ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) lm_ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } } @@ -3840,20 +4185,27 @@ static bool llama_eval_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) lm_ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + memcpy(embedding_out.data(), (float *) lm_ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); } // measure the performance only for the single-token evals - if (N == 1) { + if (n_tokens == 1) { lctx.t_eval_us += lm_ggml_time_us() - t_start_us; lctx.n_eval++; } - else if (N > 1) { + else if (n_tokens > 1) { lctx.t_p_eval_us += lm_ggml_time_us() - t_start_us; - lctx.n_p_eval += N; + lctx.n_p_eval += n_tokens; } - return true; + // get a more accurate load time, upon first eval + // TODO: fix this + if (!lctx.has_evaluated_once) { + lctx.t_load_us = lm_ggml_time_us() - lctx.t_start_us; + lctx.has_evaluated_once = true; + } + + return 0; } // @@ -4274,7 +4626,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); } break; - }; + } return output; } @@ -4678,6 +5030,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) // sampling // +void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { LM_GGML_ASSERT(candidates->size > 0); @@ -4886,7 +5245,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_temperature(struct llama_context * ctx, 
llama_token_data_array * candidates_p, float temp) { +void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = lm_ggml_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { @@ -4898,6 +5257,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array } } +void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { + llama_sample_temp(ctx, candidates_p, temp); +} + void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { if (last_tokens_size == 0 || penalty == 1.0f) { return; @@ -5021,7 +5384,7 @@ void llama_sample_classifier_free_guidance( LM_GGML_ASSERT(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto n_vocab = llama_n_vocab(llama_get_model(ctx)); LM_GGML_ASSERT(n_vocab == (int)candidates->size); LM_GGML_ASSERT(!candidates->sorted); @@ -5050,7 +5413,7 @@ void llama_sample_classifier_free_guidance( llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { LM_GGML_ASSERT(ctx); - auto N = float(llama_n_vocab(ctx)); + auto N = float(llama_n_vocab(llama_get_model(ctx))); int64_t t_start_sample_us; t_start_sample_us = lm_ggml_time_us(); @@ -5237,7 +5600,7 @@ struct llama_logit_info { }; llama_logit_info(llama_context * ctx) : logits(llama_get_logits(ctx)) - , n_vocab(llama_n_vocab(ctx)) + , n_vocab(llama_n_vocab(llama_get_model(ctx))) , max_l(*std::max_element(logits, logits + n_vocab)) , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) { } @@ -5275,7 +5638,6 @@ struct llama_beam_search_data { size_t n_beams; int n_past; int n_predict; - int n_threads; std::vector beams; std::vector next_beams; @@ -5285,12 +5647,11 @@ struct llama_beam_search_data { // Used to communicate to/from callback on beams state. std::vector beam_views; - llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads) + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) : ctx(ctx) , n_beams(n_beams) , n_past(n_past) , n_predict(n_predict) - , n_threads(n_threads) , beam_views(n_beams) { beams.reserve(n_beams); next_beams.reserve(n_beams); @@ -5327,7 +5688,7 @@ struct llama_beam_search_data { } else { // beam is not at end-of-sentence, so branch with next top_k tokens. if (!beam.tokens.empty()) { - llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0)); } llama_logit_info logit_info(ctx); std::vector next_tokens = logit_info.top_k(n_beams); @@ -5401,7 +5762,7 @@ struct llama_beam_search_data { callback(callback_data, get_beams_state(false)); // Sets common_prefix_length update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. if (common_prefix_length) { - llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0)); n_past += common_prefix_length; } // Zero-out next_beam probabilities to place them last in following min-heap. 
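[Editor's note, illustrative only — not part of the patch] The beam-search hunks above show the migration pattern used throughout this sync: llama_eval(ctx, tokens, n, n_past, n_threads) becomes llama_decode() on a batch produced by llama_batch_get_one(), with thread counts now coming from the context parameters instead of the call site. A minimal caller-side sketch, assuming the context was created with llama_new_context_with_model:

#include <vector>
#include "llama.h"

// feed a run of tokens to the context with the new batch API;
// llama_decode returns 0 on success, >0 when no KV slot was found, <0 on error
static int decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens, int & n_past) {
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data(), (int) tokens.size(), n_past, /*seq_id*/ 0));
    if (ret == 0) {
        n_past += (int) tokens.size();
    }
    return ret;
}

The pos/seq_id fallbacks added to llama_decode_internal above ("helpers for smoother batch API transition") exist precisely so that legacy llama_eval-style callers keep working while code moves to batches built this way.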
@@ -5442,11 +5803,11 @@ struct llama_beam_search_data { void llama_beam_search(llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, - size_t n_beams, int n_past, int n_predict, int n_threads) { + size_t n_beams, int n_past, int n_predict) { assert(ctx); const int64_t t_start_sample_us = lm_ggml_time_us(); - llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads); + llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict); beam_search_data.loop(callback, callback_data); @@ -5666,11 +6027,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + // mmap consistently increases speed Linux, and also increases speed on Windows with + // hot cache. It may cause a slowdown on macOS, possibly related to free memory. +#if defined(__linux__) || defined(_WIN32) + constexpr bool use_mmap = true; +#else + constexpr bool use_mmap = false; +#endif + + llama_model_loader ml(fname_inp, use_mmap); + if (ml.use_mmap) { + ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, lm_ggml_is_numa())); + } llama_model model; - llm_load_arch(*ml, model); - llm_load_hparams(*ml, model, 0, 0, 0); + llm_load_arch(ml, model); + llm_load_hparams(ml, model); if (params->only_copy) { ftype = model.ftype; @@ -5680,7 +6052,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml->ctx_gguf); + gguf_set_kv (ctx_out, ml.ctx_gguf); gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); @@ -5688,8 +6060,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s int n_attention_wv = 0; int n_feed_forward_w2 = 0; - for (int i = 0; i < ml->n_tensors; ++i) { - struct lm_ggml_tensor * meta = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct lm_ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = lm_ggml_get_name(meta); @@ -5725,8 +6097,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> f32_conv_buf; // populate the original tensors so we get an initial meta data - for (int i = 0; i < ml->n_tensors; ++i) { - struct lm_ggml_tensor * meta = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct lm_ggml_tensor * meta = ml.get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } @@ -5739,19 +6111,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // placeholder for the meta data ::zeros(fout, meta_size); - for (int i = 0; i < ml->n_tensors; ++i) { - struct lm_ggml_tensor * tensor = ml->get_tensor_meta(i); + for (int i = 0; i < ml.n_tensors; ++i) { + struct lm_ggml_tensor * tensor = ml.get_tensor_meta(i); const std::string name = lm_ggml_get_name(tensor); - if (read_data.size() < lm_ggml_nbytes(tensor)) { - read_data.resize(lm_ggml_nbytes(tensor)); + if (!ml.use_mmap) { + if (read_data.size() < lm_ggml_nbytes(tensor)) { + read_data.resize(lm_ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); - ml->load_data_for(tensor); + ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml->n_tensors, + ++idx, ml.n_tensors, 
lm_ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), lm_ggml_type_name(tensor->type)); @@ -5901,9 +6275,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -// TODO: after the GGUF PR, this likely won't work and needs to be updated static int llama_apply_lora_from_file_internal( - const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads + const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -5932,7 +6305,7 @@ static int llama_apply_lora_from_file_internal( int32_t lora_alpha; fin.read((char *) &lora_r, sizeof(lora_r)); fin.read((char *) &lora_alpha, sizeof(lora_alpha)); - float scaling = (float)lora_alpha / (float)lora_r; + float scaling = scale * (float)lora_alpha / (float)lora_r; LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); @@ -6148,9 +6521,10 @@ static int llama_apply_lora_from_file_internal( lm_ggml_set_name(r, "r_cpy"); } - struct lm_ggml_cgraph gf = lm_ggml_build_forward(r); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(lora_ctx); + lm_ggml_build_forward_expand(gf, r); - lm_ggml_graph_compute_helper(work_buffer, &gf, n_threads); + lm_ggml_graph_compute_helper(work_buffer, gf, n_threads); // we won't need these tensors again, reset the context to save memory lm_ggml_free(lora_ctx); @@ -6179,27 +6553,16 @@ static int llama_apply_lora_from_file_internal( // // interface implementation // - -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, +struct llama_model_params llama_model_default_params() { + struct llama_model_params result = { /*.n_gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, - /*.low_vram =*/ false, - /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, - /*.embedding =*/ false, }; #ifdef LM_GGML_USE_METAL @@ -6209,6 +6572,24 @@ struct llama_context_params llama_context_default_params() { return result; } +struct llama_context_params llama_context_default_params() { + struct llama_context_params result = { + /*.seed =*/ LLAMA_DEFAULT_SEED, + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.n_threads =*/ LM_GGML_DEFAULT_N_THREADS, // TODO: better default + /*.n_threads_batch =*/ LM_GGML_DEFAULT_N_THREADS, + /*.rope_freq_base =*/ 0.0f, + /*.rope_freq_scale =*/ 0.0f, + /*.mul_mat_q =*/ true, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.embedding =*/ false, + }; + + return result; +} + struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { /*.nthread =*/ 0, @@ -6264,13 +6645,11 @@ int64_t llama_time_us(void) { struct llama_model * llama_load_model_from_file( const char * path_model, - struct llama_context_params params) { + struct llama_model_params params) { lm_ggml_time_init(); llama_model * model = new llama_model; - lm_ggml_type memory_type = params.f16_kv ? 
LM_GGML_TYPE_F16 : LM_GGML_TYPE_F32; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -6287,9 +6666,9 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + if (!llama_model_load(path_model, *model, params.n_gpu_layers, + params.main_gpu, params.tensor_split, + params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; @@ -6313,18 +6692,33 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); + const auto & hparams = model->hparams; + auto & cparams = ctx->cparams; + + cparams.n_batch = params.n_batch; + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; lm_ggml_type memory_type = params.f16_kv ? 
LM_GGML_TYPE_F16 : LM_GGML_TYPE_F32; // reserve memory for context buffers - if (!params.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!hparams.vocab_only) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -6335,11 +6729,9 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } - const auto & hparams = ctx->model.hparams; - // resized during inference if (params.logits_all) { - ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab); } else { ctx->logits.reserve(hparams.n_vocab); } @@ -6357,26 +6749,28 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = lm_ggml_allocr_new_measure(tensor_alignment); // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; + int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); + int n_past = cparams.n_ctx - n_tokens; llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - lm_ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); + #ifdef LM_GGML_USE_METAL - if (params.n_gpu_layers > 0) { + if (model->n_gpu_layers > 0) { ctx->ctx_metal = lm_ggml_metal_init(1); if (!ctx->ctx_metal) { LLAMA_LOG_ERROR("%s: lm_ggml_metal_init() failed\n", __func__); llama_free(ctx); return NULL; } - lm_ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); + lm_ggml_metal_log_set_callback(llama_log_callback_default, NULL); + //lm_ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + //lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); } #endif // measure memory requirements for the graph size_t alloc_size = lm_ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements lm_ggml_allocr_free(ctx->alloc); @@ -6385,28 +6779,46 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = lm_ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); #ifdef LM_GGML_USE_METAL if (ctx->ctx_metal) { - lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); + //lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); } #endif #ifdef LM_GGML_USE_CUBLAS - if (params.low_vram) { - LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); - lm_ggml_cuda_set_scratch_size(0); // disable scratch - } else { - 
lm_ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + lm_ggml_cuda_set_scratch_size(alloc_size); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + + // calculate total VRAM usage + auto add_tensor = [](const lm_ggml_tensor * t, size_t & size) { + if (t->backend == LM_GGML_BACKEND_GPU || t->backend == LM_GGML_BACKEND_GPU_SPLIT) { + size += lm_ggml_nbytes(t); + } + }; + size_t model_vram_size = 0; + for (const auto & kv : model->tensors_by_name) { + add_tensor(kv.second, model_vram_size); } + + size_t kv_vram_size = 0; + add_tensor(ctx->kv_self.k, kv_vram_size); + add_tensor(ctx->kv_self.v, kv_vram_size); + + size_t ctx_vram_size = alloc_size + kv_vram_size; + size_t total_vram_size = model_vram_size + ctx_vram_size; + + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__, + total_vram_size / 1024.0 / 1024.0, + model_vram_size / 1024.0 / 1024.0, + ctx_vram_size / 1024.0 / 1024.0); #endif } #ifdef LM_GGML_USE_METAL - if (params.n_gpu_layers > 0) { + if (model->n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers void * data_ptr = NULL; size_t data_size = 0; - if (params.use_mmap) { + if (ctx->model.mapping) { data_ptr = ctx->model.mapping->addr; data_size = ctx->model.mapping->size; } else { @@ -6425,11 +6837,8 @@ struct llama_context * llama_new_context_with_model( return NULL; \ } - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - + LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); + LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); #undef LLAMA_METAL_CHECK_BUF } @@ -6441,8 +6850,10 @@ struct llama_context * llama_new_context_with_model( if (lm_ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + // TODO: needs fix after #3228 + LM_GGML_ASSERT(false && "not implemented"); + //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; llama_backend_free(); exit(1); } @@ -6451,63 +6862,37 @@ struct llama_context * llama_new_context_with_model( return ctx; } -static struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params) { - struct llama_model * model = llama_load_model_from_file(path_model, params); - if (!model) { - return nullptr; - } - - struct llama_context * ctx = llama_new_context_with_model(model, params); - ctx->model_owner = true; - - return ctx; -} - void llama_free(struct llama_context * ctx) { delete ctx; } -int llama_n_vocab(const struct llama_context * ctx) { - return llama_model_n_vocab(&ctx->model); +const llama_model * llama_get_model(const struct llama_context * ctx) { + return &ctx->model; } int llama_n_ctx(const 
struct llama_context * ctx) { - return llama_model_n_ctx(&ctx->model); + return ctx->cparams.n_ctx; } -int llama_n_ctx_train(const struct llama_context * ctx) { - return llama_model_n_ctx_train(&ctx->model); +enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { + return model->vocab.type; } -int llama_n_embd(const struct llama_context * ctx) { - return llama_model_n_embd(&ctx->model); -} - -enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { - return ctx->model.vocab.type; -} - -int llama_model_n_vocab(const struct llama_model * model) { +int llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } -int llama_model_n_ctx(const struct llama_model * model) { - return model->hparams.n_ctx; -} - -int llama_model_n_ctx_train(const struct llama_model * model) { +int llama_n_ctx_train(const struct llama_model * model) { return model->hparams.n_ctx_train; } -int llama_model_n_embd(const struct llama_model * model) { +int llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", - model->name.c_str(), + llama_model_arch_name(model->arch).c_str(), llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -6528,6 +6913,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return nparams; } +struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { + return lm_ggml_get_tensor(model->ctx, name); +} + int llama_model_quantize( const char * fname_inp, const char * fname_out, @@ -6541,18 +6930,18 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } -int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; @@ -6560,16 +6949,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.n; + return ctx->kv_self.head; } -#define LLAMA_MAX_RNG_STATE (64*1024) +void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) { + llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1); +} -void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = time(NULL); - } - 
ctx->rng.seed(seed); +void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); +} + +void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { + llama_kv_cache_seq_keep(ctx->kv_self, seq_id); +} + +void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } // Returns the *maximum* size of the state @@ -6657,6 +7057,16 @@ struct llama_data_file_context : llama_data_context { * */ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { + // TODO: does not support multi-sequence states + { + const auto & kv_self = ctx->kv_self; + for (uint32_t i = 0; i < kv_self.head; ++i) { + LM_GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i); + LM_GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1); + LM_GGML_ASSERT(kv_self.cells[i].has_seq_id(0)); + } + } + // copy rng { std::stringstream rng_ss; @@ -6707,12 +7117,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; + const auto & cparams = ctx->cparams; + const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; + const int n_ctx = cparams.n_ctx; const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); + const int kv_ntok = kv_self.head; data_ctx->write(&kv_size, sizeof(kv_size)); data_ctx->write(&kv_ntok, sizeof(kv_ntok)); @@ -6815,9 +7227,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; + const auto & cparams = ctx->cparams; + const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = hparams.n_ctx; + const int n_ctx = cparams.n_ctx; size_t kv_size; int kv_ntok; @@ -6856,7 +7270,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { lm_ggml_free(cpy_ctx); } - ctx->kv_self.n = kv_ntok; + ctx->kv_self.head = kv_ntok; + ctx->kv_self.size = kv_size; } const size_t nread = inp - src; @@ -6951,64 +7366,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi int llama_eval( struct llama_context * ctx, - const llama_token * tokens, - int n_tokens, - int n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } + llama_token * tokens, + int32_t n_tokens, + int n_past) { + llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = lm_ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } - return 0; + return ret; } int llama_eval_embd( struct llama_context * ctx, - const float * embd, - int n_tokens, - int 
n_past, - int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } + float * embd, + int32_t n_tokens, + int n_past) { + llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = lm_ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; + + const int ret = llama_decode_internal(*ctx, batch); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } - return 0; + return ret; } -int llama_eval_export(struct llama_context * ctx, const char * fname) { - const int n_batch = 1; - const int n_ctx = 512 - n_batch; +void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { + ctx->cparams.n_threads = n_threads; + ctx->cparams.n_threads_batch = n_threads_batch; +} + +struct llama_batch llama_batch_get_one( + llama_token * tokens, + int32_t n_tokens, + llama_pos pos_0, + llama_seq_id seq_id) { + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*all_pos_0 =*/ pos_0, + /*all_pos_1 =*/ 1, + /*all_seq_id =*/ seq_id, + }; +} - const std::vector tmp(n_batch, llama_token_bos(ctx)); +struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { + llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; - if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; + if (embd) { + batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); } - return 0; + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens); + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return batch; +} + +void llama_batch_free(struct llama_batch batch) { + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.seq_id) free(batch.seq_id); + if (batch.logits) free(batch.logits); +} + +int llama_decode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = llama_decode_internal(*ctx, batch); + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } + + return ret; } float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } +float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + return ctx->logits.data() + i*ctx->model.hparams.n_vocab; +} + float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } @@ -7038,16 +7491,6 @@ llama_token llama_token_nl(const struct llama_context * ctx) { } int llama_tokenize( - struct llama_context * ctx, - const char * text, - int text_len, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos); -} - -int llama_tokenize_with_model( const struct llama_model * model, const char * text, int text_len, @@ -7068,13 +7511,9 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_piece(const 
struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_piece_with_model(&ctx->model, token, buf, length); -} - // does not write null-terminator to buf -int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { - if (0 <= token && token < llama_model_n_vocab(model)) { +int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) { + if (0 <= token && token < llama_n_vocab(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) { @@ -7094,7 +7533,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke buf[2] = '\x85'; return 3; } else if (llama_is_control_token(model->vocab, token)) { - ; + // do nothing } else if (llama_is_byte_token(model->vocab, token)) { if (length < 1) { return -1; @@ -7202,12 +7641,12 @@ const std::vector> & llama_inter return ctx->model.tensors_by_name; } -void llama_log_set(llama_log_callback log_callback, void * user_data) { +void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; } -static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { +static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); char buffer[128]; @@ -7224,14 +7663,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_ va_end(args_copy); } -static void llama_log_internal(llama_log_level level, const char * format, ...) { +static void llama_log_internal(lm_ggml_log_level level, const char * format, ...) 
{ va_list args; va_start(args, format); llama_log_internal_v(level, format, args); va_end(args); } -static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { +static void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; fputs(text, stderr); diff --git a/cpp/llama.h b/cpp/llama.h index ad0d94ea..f412175f 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -37,6 +37,8 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF +#define LLAMA_MAX_RNG_STATE (64*1024) + #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN @@ -60,13 +62,9 @@ extern "C" { struct llama_model; struct llama_context; - typedef int llama_token; - - enum llama_log_level { - LLAMA_LOG_LEVEL_ERROR = 2, - LLAMA_LOG_LEVEL_WARN = 3, - LLAMA_LOG_LEVEL_INFO = 4 - }; + typedef int32_t llama_pos; + typedef int32_t llama_token; + typedef int32_t llama_seq_id; enum llama_vocab_type { LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece @@ -86,24 +84,24 @@ extern "C" { // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -122,41 +120,68 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - 
int32_t main_gpu; // the GPU that is used for scratch and small tensors - + // Input data for llama_decode + // A llama_batch object can contain input about one or many sequences + // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens + // + // - token : the token ids of the input (used when embd is NULL) + // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) + // - pos : the positions of the respective token in the sequence + // - seq_id : the sequence to which the respective token belongs + // - logits : if zero, the logits for the respective token will not be output + // + typedef struct llama_batch { + int32_t n_tokens; + + llama_token * token; + float * embd; + llama_pos * pos; + llama_seq_id * seq_id; + int8_t * logits; + + // NOTE: helpers for smooth API transition - can be deprecated in the future + // for future-proof code, use the above fields instead and ignore everything below + // + // pos[i] = all_pos_0 + i*all_pos_1 + // + llama_pos all_pos_0; // used if pos == NULL + llama_pos all_pos_1; // used if pos == NULL + llama_seq_id all_seq_id; // used if seq_id == NULL + } llama_batch; + + struct llama_model_params { + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback void * progress_callback_user_data; // Keep the booleans together to avoid misalignment during copy-by-value. - bool low_vram; // if true, reduce VRAM usage at the cost of performance - bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache - bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM - bool embedding; // embedding mode only }; - // Signature for logging events - // Note that text includes the new line character at the end for most events. - // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it - // if it exists. - // It might not exist for progress report where '.' is output repeatedly. - typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); + struct llama_context_params { + uint32_t seed; // RNG seed, -1 for random + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + + // Keep the booleans together to avoid misalignment during copy-by-value. 
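// Usage sketch (not part of the upstream sources): with the llama_model_params /
// llama_context_params split introduced here, setup becomes a two-step path with
// separate defaults. A minimal sketch; the model path and thread count are
// placeholders, not values taken from this patch:
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 0;                               // model-level option
//     llama_model * model = llama_load_model_from_file("model.gguf", mparams);
//
//     llama_context_params cparams = llama_context_default_params();
//     cparams.n_ctx     = 0;       // 0 = use n_ctx_train from the model
//     cparams.n_threads = 4;       // replaces the old per-call n_threads argument
//     llama_context * ctx = llama_new_context_with_model(model, cparams);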
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool f16_kv; // use fp16 for KV cache, fp32 otherwise + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool embedding; // embedding mode only + }; // model quantization parameters typedef struct llama_model_quantize_params { @@ -215,6 +240,8 @@ extern "C" { int32_t n_eval; }; + // Helpers for getting default parameters + LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @@ -228,7 +255,7 @@ extern "C" { LLAMA_API struct llama_model * llama_load_model_from_file( const char * path_model, - struct llama_context_params params); + struct llama_model_params params); LLAMA_API void llama_free_model(struct llama_model * model); @@ -245,25 +272,28 @@ extern "C" { LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mlock_supported(void); - LLAMA_API int llama_n_vocab (const struct llama_context * ctx); + LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); - LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); + LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); - LLAMA_API int llama_model_n_vocab (const struct llama_model * model); - LLAMA_API int llama_model_n_ctx (const struct llama_model * model); - LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model); - LLAMA_API int llama_model_n_embd (const struct llama_model * model); + LLAMA_API int llama_n_vocab (const struct llama_model * model); + LLAMA_API int llama_n_ctx_train(const struct llama_model * model); + LLAMA_API int llama_n_embd (const struct llama_model * model); // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); + // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); + // Get a llama model tensor + LLAMA_API struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); + // Returns 0 on success LLAMA_API int llama_model_quantize( const char * fname_inp, @@ -279,21 +309,65 @@ extern "C" { LLAMA_API DEPRECATED(int llama_apply_lora_from_file( struct llama_context * ctx, const char * path_lora, + float scale, const char * path_base_model, int n_threads), - "please use llama_model_apply_lora_from_file instead"); + "use llama_model_apply_lora_from_file instead"); LLAMA_API int llama_model_apply_lora_from_file( const struct llama_model * model, - const char * path_lora, - const char * path_base_model, - int n_threads); + const char * path_lora, + float scale, + const char * path_base_model, + int n_threads); + + // + // KV cache + // // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), + "avoid using this, it will be removed in the future, 
instead - count the tokens in user code"); - // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); + // Remove all tokens data of cells in [c0, c1) + LLAMA_API void llama_kv_cache_tokens_rm( + struct llama_context * ctx, + int32_t c0, + int32_t c1); + + // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + LLAMA_API void llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + + // Copy all tokens that belong to the specified sequence to another sequence + // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence + LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + + // Removes all tokens that do not belong to the specified sequence + LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id); + + // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + // If the KV cache is RoPEd, the KV data is updated accordingly + LLAMA_API void llama_kv_cache_seq_shift( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + + // + // State / sessions + // // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens @@ -302,48 +376,102 @@ extern "C" { // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. // Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); + LLAMA_API size_t llama_copy_state_data( + struct llama_context * ctx, + uint8_t * dst); // Set the state reading from the specified address // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); + LLAMA_API size_t llama_set_state_data( + struct llama_context * ctx, + uint8_t * src); // Save/load session file - LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API bool llama_load_session_file( + struct llama_context * ctx, + const char * path_session, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out); - // Run the llama inference to obtain the logits and probabilities for the next token. + LLAMA_API bool llama_save_session_file( + struct llama_context * ctx, + const char * path_session, + const llama_token * tokens, + size_t n_token_count); + + // + // Decoding + // + + // Run the llama inference to obtain the logits and probabilities for the next token(s). 
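// Usage sketch (not part of the upstream sources): the per-sequence KV cache calls
// declared above make "context shifting" possible instead of a full context reset,
// which is the pattern the rn-llama.hpp hunk below adopts. A minimal sketch with
// illustrative n_keep / n_past / n_discard variables, all operating on sequence 0:
//
//     const int n_left    = n_past - n_keep - 1;
//     const int n_discard = n_left / 2;
//     // drop the oldest tokens that are not kept ...
//     llama_kv_cache_seq_rm   (ctx, 0, n_keep + 1, n_keep + n_discard + 1);
//     // ... then slide the remaining ones back so their positions stay contiguous
//     llama_kv_cache_seq_shift(ctx, 0, n_keep + 1 + n_discard, n_past, -n_discard);
//     n_past -= n_discard;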
// tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls // Returns 0 on success - LLAMA_API int llama_eval( + // DEPRECATED: use llama_decode() instead + LLAMA_API DEPRECATED(int llama_eval( struct llama_context * ctx, - const llama_token * tokens, - int n_tokens, - int n_past, - int n_threads); + llama_token * tokens, + int32_t n_tokens, + int n_past), + "use llama_decode() instead"); // Same as llama_eval, but use float matrix input directly. - LLAMA_API int llama_eval_embd( + // DEPRECATED: use llama_decode() instead + LLAMA_API DEPRECATED(int llama_eval_embd( struct llama_context * ctx, - const float * embd, - int n_tokens, - int n_past, - int n_threads); + float * embd, + int32_t n_tokens, + int n_past), + "use llama_decode() instead"); + + // Return batch for single sequence of tokens starting at pos_0 + // + // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it + // + LLAMA_API struct llama_batch llama_batch_get_one( + llama_token * tokens, + int32_t n_tokens, + llama_pos pos_0, + llama_seq_id seq_id); + + // Allocates a batch of tokens on the heap + // The batch has to be freed with llama_batch_free() + // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) + // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token + // The rest of the llama_batch members are allocated with size n_tokens + // All members are left uninitialized + LLAMA_API struct llama_batch llama_batch_init( + int32_t n_tokens, + int32_t embd); + + // Frees a batch of tokens allocated with llama_batch_init() + LLAMA_API void llama_batch_free(struct llama_batch batch); + + // Positive return values does not mean a fatal error, but rather a warning. + // 0 - success + // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) + // < 0 - error + LLAMA_API int llama_decode( + struct llama_context * ctx, + struct llama_batch batch); - // Export a static computation graph for context of 511 and batch size of 1 - // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these - // parameters here to keep things simple - // IMPORTANT: do not use for anything else other than debugging and testing! - LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); + // Set the number of threads used for decoding + // n_threads is the number of threads used for generation (single token) + // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) + LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row - // Can be mutated in order to change the probabilities of the next token - // Rows: n_tokens + // Logits for which llama_batch.logits[i] == 0 are undefined + // Rows: n_tokens provided with llama_batch // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); + // Logits for the ith token. 
Equivalent to: + // llama_get_logits(ctx) + i*n_vocab + LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); + // Get the embeddings for the input // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @@ -372,14 +500,6 @@ extern "C" { // Returns the number of tokens on success, no more than n_max_tokens // Returns a negative number on failure - the number of tokens that would have been returned LLAMA_API int llama_tokenize( - struct llama_context * ctx, - const char * text, - int text_len, - llama_token * tokens, - int n_max_tokens, - bool add_bos); - - LLAMA_API int llama_tokenize_with_model( const struct llama_model * model, const char * text, int text_len, @@ -392,12 +512,6 @@ extern "C" { // Does not write null terminator to the buffer. // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. LLAMA_API int llama_token_to_piece( - const struct llama_context * ctx, - llama_token token, - char * buf, - int length); - - LLAMA_API int llama_token_to_piece_with_model( const struct llama_model * model, llama_token token, char * buf, @@ -420,11 +534,25 @@ extern "C" { // Sampling functions // + // Sets the current rng seed. + LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); + /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); + LLAMA_API void llama_sample_repetition_penalty( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t last_tokens_size, + float penalty); /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + LLAMA_API void llama_sample_frequency_and_presence_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t last_tokens_size, + float alpha_frequency, + float alpha_presence); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. @@ -437,23 +565,54 @@ extern "C" { float scale); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
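// Usage sketch (not part of the upstream sources): besides the llama_batch_get_one()
// helper, a batch can be filled explicitly via llama_batch_init()/llama_batch_free().
// A minimal sketch, assuming an initialized llama_context * ctx and a tokenized
// prompt; all identifiers below are illustrative:
//
//     llama_batch batch = llama_batch_init((int32_t) prompt.size(), /*embd =*/ 0);
//     batch.n_tokens = (int32_t) prompt.size();
//     for (int32_t i = 0; i < batch.n_tokens; ++i) {
//         batch.token [i] = prompt[i];
//         batch.pos   [i] = i;                            // explicit positions
//         batch.seq_id[i] = 0;                            // single sequence
//         batch.logits[i] = (i == batch.n_tokens - 1);    // logits only for the last token
//     }
//     if (llama_decode(ctx, batch) == 0) {
//         const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
//         // ... sample the next token from logits ...
//     }
//     llama_batch_free(batch);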
- LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API void llama_sample_softmax( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); + LLAMA_API void llama_sample_top_k( + struct llama_context * ctx, + llama_token_data_array * candidates, + int k, + size_t min_keep); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); + LLAMA_API void llama_sample_top_p( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); + LLAMA_API void llama_sample_tail_free( + struct llama_context * ctx, + llama_token_data_array * candidates, + float z, + size_t min_keep); /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + LLAMA_API void llama_sample_typical( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); + + LLAMA_API void llama_sample_temp( + struct llama_context * ctx, + llama_token_data_array * candidates, + float temp); + + LLAMA_API DEPRECATED(void llama_sample_temperature( + struct llama_context * ctx, + llama_token_data_array * candidates, + float temp), + "use llama_sample_temp instead"); /// @details Apply constraints from grammar - LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar); + LLAMA_API void llama_sample_grammar( + struct llama_context * ctx, + llama_token_data_array * candidates, + const struct llama_grammar * grammar); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -461,23 +620,41 @@ extern "C" { /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. 
This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); + LLAMA_API llama_token llama_sample_token_mirostat( + struct llama_context * ctx, + llama_token_data_array * candidates, + float tau, + float eta, + int m, + float * mu); /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); + LLAMA_API llama_token llama_sample_token_mirostat_v2( + struct llama_context * ctx, + llama_token_data_array * candidates, + float tau, + float eta, + float * mu); /// @details Selects the token with the highest probability. - LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API llama_token llama_sample_token_greedy( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Randomly selects a token from the candidates based on their probabilities. - LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API llama_token llama_sample_token( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Accepts the sampled token into the grammar - LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); + LLAMA_API void llama_grammar_accept_token( + struct llama_context * ctx, + struct llama_grammar * grammar, + llama_token token); // // Beam search @@ -485,9 +662,10 @@ extern "C" { struct llama_beam_view { const llama_token * tokens; + size_t n_tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Callback should set this to true when a beam is at end-of-beam. + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Callback should set this to true when a beam is at end-of-beam. }; // Passed to beam_search_callback function. @@ -496,9 +674,10 @@ extern "C" { // These pointers are valid only during the synchronous callback, so should not be saved. struct llama_beams_state { struct llama_beam_view * beam_views; + size_t n_beams; // Number of elements in beam_views[]. 
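// Usage sketch (not part of the upstream sources): the sampling declarations above are
// reformatted but unchanged in behavior; only llama_sample_temperature() is deprecated
// in favor of llama_sample_temp(). A basic pipeline over a prepared
// llama_token_data_array, with illustrative tuning values:
//
//     llama_sample_top_k    (ctx, &candidates_p, /*k =*/ 40,    /*min_keep =*/ 1);
//     llama_sample_tail_free(ctx, &candidates_p, /*z =*/ 1.0f,  /*min_keep =*/ 1);
//     llama_sample_typical  (ctx, &candidates_p, /*p =*/ 1.0f,  /*min_keep =*/ 1);
//     llama_sample_top_p    (ctx, &candidates_p, /*p =*/ 0.95f, /*min_keep =*/ 1);
//     llama_sample_temp     (ctx, &candidates_p, /*temp =*/ 0.8f); // was llama_sample_temperature
//     const llama_token tok = llama_sample_token(ctx, &candidates_p);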
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. - bool last_call; // True iff this is the last callback invocation. + bool last_call; // True iff this is the last callback invocation. }; // Type of pointer to the beam_search_callback function. @@ -513,11 +692,17 @@ extern "C" { /// @param n_beams Number of beams to use. /// @param n_past Number of tokens already evaluated. /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. - /// @param n_threads Number of threads as passed to llama_eval(). - LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); + LLAMA_API void llama_beam_search( + struct llama_context * ctx, + llama_beam_search_callback_fn_t callback, + void * callback_data, + size_t n_beams, + int n_past, + int n_predict); // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); + LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_reset_timings(struct llama_context * ctx); @@ -526,7 +711,7 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_set(lm_ggml_log_callback log_callback, void * user_data); LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/cpp/log.h b/cpp/log.h index 18f3b976..b8953fdc 100644 --- a/cpp/log.h +++ b/cpp/log.h @@ -225,31 +225,31 @@ enum LogTriState // USE LOG() INSTEAD // #ifndef _MSC_VER - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #else - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #endif // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // #ifndef _MSC_VER - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ @@ -260,10 +260,10 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #else - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) 
\ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ @@ -274,7 +274,7 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #endif // The '\0' as a last argument, is a trick to bypass the silly @@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); } inline void log_test() { log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n") + LOG("01 Hello World to nobody, because logs are disabled!\n"); log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") + LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)); + LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); log_set_target(stderr); - LOG("04 Hello World to stderr!\n") - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") + LOG("04 Hello World to stderr!\n"); + LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n") + LOG("06 Hello World to default log file!\n"); log_set_target(stdout); - LOG("07 Hello World to stdout!\n") + LOG("07 Hello World to stdout!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n") + LOG("08 Hello World to default log file again!\n"); log_disable(); - LOG("09 Hello World _1_ into the void!\n") + LOG("09 Hello World _1_ into the void!\n"); log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") + LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); log_disable(); log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") + LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") + LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n") + LOG("13 Hello World this time in yet new file?\n"); log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n") + LOG("14 Hello World in log with generated filename!\n"); #ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n") - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") - LOG_TEELN("17 Hello msvc TEELN without arguments\n") - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") - LOG("19 Hello msvc LOG without arguments\n") - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") - LOGLN("21 Hello msvc LOGLN without arguments\n") - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") + LOG_TEE("15 Hello msvc TEE without arguments\n"); + LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); + LOG_TEELN("17 Hello msvc TEELN without arguments\n"); + LOG_TEELN("18 Hello msvc TEELN 
with (%d)(%s) arguments\n", 1, "test"); + LOG("19 Hello msvc LOG without arguments\n"); + LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); + LOGLN("21 Hello msvc LOGLN without arguments\n"); + LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); #endif } @@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv) buf << " " << argv[i]; } } - LOGLN("Cmd:%s", buf.str().c_str()) + LOGLN("Cmd:%s", buf.str().c_str()); } #define log_tostr(var) log_var_to_string_impl(var).c_str() @@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector & var) #define LOGLN(...) // dummy stub #undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_DISABLE #define LOG_DISABLE() // dummy stub diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp index bd615c1d..cd75bfd4 100644 --- a/cpp/rn-llama.hpp +++ b/cpp/rn-llama.hpp @@ -270,6 +270,10 @@ struct llama_rn_context // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx); + embd = prompt_tokens; if (n_past == num_prompt_tokens) { @@ -302,19 +306,26 @@ struct llama_rn_context if (embd.size() >= (size_t)params.n_ctx) { - // Reset context - const int n_left = (params.n_ctx - params.n_keep) / 2; + // Shift context + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) + { + embd[i - n_discard] = embd[i]; + } + embd.resize(embd.size() - n_discard); + + n_past -= n_discard; - std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); - embd = new_tokens; - n_past = params.n_keep; - truncated = true; LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s", params.n_ctx, params.n_keep, - n_left, - tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str() + n_left ); } @@ -325,7 +336,7 @@ struct llama_rn_context { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) { LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s", n_eval, @@ -348,7 +359,7 @@ struct llama_rn_context // out of user input, sample next token const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const int32_t top_k = params.top_k <= 0 ? 
llama_n_vocab(llama_get_model(ctx)) : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; @@ -364,7 +375,7 @@ struct llama_rn_context { auto *logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto n_vocab = llama_n_vocab(llama_get_model(ctx)); // Apply params.logit_bias map for (const auto &it : params.logit_bias) @@ -414,13 +425,13 @@ struct llama_rn_context { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else @@ -431,7 +442,7 @@ struct llama_rn_context llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token(ctx, &candidates_p); } } @@ -566,7 +577,7 @@ struct llama_rn_context std::vector getEmbedding() { - static const int n_embd = llama_n_embd(ctx); + static const int n_embd = llama_n_embd(llama_get_model(ctx)); if (!params.embedding) { LOG_WARNING("embedding disabled, embedding: %s", params.embedding); diff --git a/docs/API/README.md b/docs/API/README.md index 9676bc65..a0fb9246 100644 --- a/docs/API/README.md +++ b/docs/API/README.md @@ -30,7 +30,7 @@ llama.rn #### Defined in -[index.ts:40](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L40) +[index.ts:40](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L40) ___ @@ -40,7 +40,7 @@ ___ #### Defined in -[index.ts:38](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L38) +[index.ts:38](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L38) ___ @@ -57,7 +57,7 @@ ___ #### Defined in -[index.ts:28](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L28) +[index.ts:28](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L28) ## Functions @@ -79,7 +79,7 @@ ___ #### Defined in -[grammar.ts:134](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L134) +[grammar.ts:134](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L134) ___ @@ -99,7 +99,7 @@ ___ #### Defined in -[index.ts:113](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L113) +[index.ts:113](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L113) ___ @@ -113,7 +113,7 @@ ___ #### Defined in -[index.ts:129](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L129) +[index.ts:129](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L129) ___ @@ -133,4 +133,4 @@ ___ #### Defined in -[index.ts:109](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L109) +[index.ts:109](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L109) diff --git a/docs/API/classes/LlamaContext.md b/docs/API/classes/LlamaContext.md index b70dcab1..b0f41941 100644 --- a/docs/API/classes/LlamaContext.md +++ b/docs/API/classes/LlamaContext.md @@ -37,7 +37,7 @@ #### Defined in 
-[index.ts:49](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L49) +[index.ts:49](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L49) ## Properties @@ -47,7 +47,7 @@ #### Defined in -[index.ts:45](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L45) +[index.ts:45](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L45) ___ @@ -57,7 +57,7 @@ ___ #### Defined in -[index.ts:43](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L43) +[index.ts:43](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L43) ___ @@ -67,7 +67,7 @@ ___ #### Defined in -[index.ts:47](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L47) +[index.ts:47](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L47) ## Methods @@ -88,7 +88,7 @@ ___ #### Defined in -[index.ts:59](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L59) +[index.ts:59](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L59) ___ @@ -108,7 +108,7 @@ ___ #### Defined in -[index.ts:96](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L96) +[index.ts:96](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L96) ___ @@ -128,7 +128,7 @@ ___ #### Defined in -[index.ts:100](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L100) +[index.ts:100](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L100) ___ @@ -142,7 +142,7 @@ ___ #### Defined in -[index.ts:104](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L104) +[index.ts:104](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L104) ___ @@ -156,7 +156,7 @@ ___ #### Defined in -[index.ts:88](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L88) +[index.ts:88](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L88) ___ @@ -176,4 +176,4 @@ ___ #### Defined in -[index.ts:92](https://github.com/mybigday/llama.rn/blob/50235c2/src/index.ts#L92) +[index.ts:92](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/index.ts#L92) diff --git a/docs/API/classes/SchemaGrammarConverter.md b/docs/API/classes/SchemaGrammarConverter.md index b7cc5871..4be9e7d7 100644 --- a/docs/API/classes/SchemaGrammarConverter.md +++ b/docs/API/classes/SchemaGrammarConverter.md @@ -33,7 +33,7 @@ #### Defined in -[grammar.ts:39](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L39) +[grammar.ts:39](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L39) ## Properties @@ -43,7 +43,7 @@ #### Defined in -[grammar.ts:35](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L35) +[grammar.ts:35](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L35) ___ @@ -53,7 +53,7 @@ ___ #### Defined in -[grammar.ts:37](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L37) +[grammar.ts:37](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L37) ## Methods @@ -74,7 +74,7 @@ ___ #### Defined in -[grammar.ts:45](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L45) +[grammar.ts:45](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L45) ___ @@ -88,7 +88,7 @@ ___ #### Defined in -[grammar.ts:125](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L125) +[grammar.ts:125](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L125) ___ @@ -109,4 +109,4 @@ ___ #### Defined in -[grammar.ts:65](https://github.com/mybigday/llama.rn/blob/50235c2/src/grammar.ts#L65) 
+[grammar.ts:65](https://github.com/mybigday/llama.rn/blob/acfc7ab/src/grammar.ts#L65) diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 007b309c..b4ef6817 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.2.0-rc.6): + - llama-rn (0.2.0): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1242,7 +1242,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 205e066e2daf2495c2844b8e99a7dd8f8f2cb22c + llama-rn: 38a0f48bb799df21706bc5552929475114ddf9cb RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/example/src/App.tsx b/example/src/App.tsx index e3e95215..ac5a4db1 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -108,7 +108,7 @@ export default function App() { initLlama({ model: file.uri, use_mlock: true, - n_gpu_layers:1, // > 0: enable GPU + n_gpu_layers: 0, // > 0: enable GPU // embedding: true, }) .then((ctx) => { diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index 539a145a..74d33bb2 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -58,7 +58,9 @@ + (instancetype)initWithParams:(NSDictionary *)params { if (params[@"memory_f16"]) defaultParams.memory_f16 = [params[@"memory_f16"] boolValue]; if (params[@"lora"]) { - defaultParams.lora_adapter = [params[@"lora"] UTF8String]; + float lora_scaled = 1.0f; + if (params[@"lora_scaled"]) lora_scaled = [params[@"lora_scaled"] floatValue]; + defaultParams.lora_adapter.push_back({[params[@"lora"] UTF8String], lora_scaled}); defaultParams.use_mmap = false; } if (params[@"lora_base"]) defaultParams.lora_base = [params[@"lora_base"] UTF8String]; @@ -176,7 +178,7 @@ - (NSDictionary *)completion:(NSDictionary *)params } if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) { - const int n_vocab = llama_n_vocab(llama->ctx); + const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx)); NSArray *logit_bias = params[@"logit_bias"]; for (NSArray *el in logit_bias) { if ([el isKindOfClass:[NSArray class]] && [el count] == 2) { diff --git a/llama.cpp b/llama.cpp index 7ddf1855..f5ef5cfb 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 7ddf185537b712ea0ccbc5f222ee92bed654914e +Subproject commit f5ef5cfb18148131fcf45bdd2331f0db5ab7c3d0 diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index a4e1a51d..c20c3d40 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,7 +1,7 @@ ---- llama.cpp.orig 2023-09-18 12:19:40 -+++ llama.cpp 2023-09-18 12:19:42 -@@ -646,16 +646,16 @@ - +--- llama.cpp.orig 2023-09-30 13:34:05 ++++ llama.cpp 2023-09-30 13:34:06 +@@ -647,16 +647,16 @@ + if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { diff --git a/src/NativeRNLlama.ts b/src/NativeRNLlama.ts index 8d0894b4..f33a4e3c 100644 --- a/src/NativeRNLlama.ts +++ b/src/NativeRNLlama.ts @@ -19,6 +19,7 @@ export type NativeContextParams = { memory_f16?: boolean lora?: string // lora_adaptor + lora_scaled?: number lora_base?: string rope_freq_base?: number
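
For reference, a minimal TypeScript sketch of how the new lora_scaled context parameter added above would be passed from the JavaScript side. This is an illustrative sketch, not part of the patch: it assumes initLlama (as used in example/src/App.tsx) accepts the NativeContextParams fields shown in src/NativeRNLlama.ts, and the model/adapter file paths are placeholders.

import { initLlama } from 'llama.rn'

// Hypothetical paths; replace with model/adapter files available on the device.
async function loadWithLoraAdapter() {
  const context = await initLlama({
    model: '/path/to/model-q4_0.gguf',
    lora: '/path/to/lora-adapter.bin',         // lora_adaptor path
    lora_scaled: 0.8,                          // adapter scale; native side falls back to 1.0 when omitted
    lora_base: '/path/to/base-model-f16.gguf', // optional base model to apply the adapter against
    use_mlock: true,
    n_gpu_layers: 0, // > 0: enable GPU
  })
  return context
}

On the native side this corresponds to the lora_adapter.push_back({path, scale}) call added in ios/RNLlamaContext.mm above, which also disables use_mmap whenever an adapter is supplied.
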