From 1f20cefab73602dd4ad3d67d06bc5585ec4716db Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Fri, 10 Nov 2023 18:06:19 +0800 Subject: [PATCH] feat: sync llama.cpp (#31) * feat: sync llama.cpp * feat: use -O3 * feat: sync llama.cpp * feat: sync llama.cpp * feat: sync llama.cpp --- android/src/main/CMakeLists.txt | 4 +- cpp/build-info.h | 9 - cpp/common.cpp | 163 +- cpp/common.h | 61 +- cpp/ggml-alloc.c | 21 +- cpp/ggml-impl.h | 237 ++ cpp/ggml-metal-llama.metal | 196 +- cpp/ggml-metal.m | 98 +- cpp/{k_quants.c => ggml-quants.c} | 4428 ++++++++++++++++++++------- cpp/{k_quants.h => ggml-quants.h} | 103 +- cpp/ggml.c | 4576 +++++++--------------------- cpp/ggml.h | 34 +- cpp/llama.cpp | 4593 ++++++++++++----------------- cpp/llama.h | 54 +- cpp/log.h | 122 +- cpp/sampling.cpp | 15 +- cpp/sampling.h | 1 + example/ios/Podfile.lock | 4 +- llama-rn.podspec | 4 +- llama.cpp | 2 +- scripts/bootstrap.sh | 16 +- scripts/common.cpp.patch | 11 + scripts/ggml-metal.m.patch | 10 +- scripts/llama.cpp.patch | 6 +- scripts/log.h.patch | 8 +- 25 files changed, 7198 insertions(+), 7578 deletions(-) delete mode 100644 cpp/build-info.h create mode 100644 cpp/ggml-impl.h rename cpp/{k_quants.c => ggml-quants.c} (69%) rename cpp/{k_quants.h => ggml-quants.h} (63%) create mode 100644 scripts/common.cpp.patch diff --git a/android/src/main/CMakeLists.txt b/android/src/main/CMakeLists.txt index 67c768b5..d7d0d9de 100644 --- a/android/src/main/CMakeLists.txt +++ b/android/src/main/CMakeLists.txt @@ -12,7 +12,7 @@ set( ${RNLLAMA_LIB_DIR}/ggml-alloc.c ${RNLLAMA_LIB_DIR}/ggml-backend.c ${RNLLAMA_LIB_DIR}/ggml.c - ${RNLLAMA_LIB_DIR}/k_quants.c + ${RNLLAMA_LIB_DIR}/ggml-quants.c ${RNLLAMA_LIB_DIR}/common.cpp ${RNLLAMA_LIB_DIR}/grammar-parser.cpp ${RNLLAMA_LIB_DIR}/sampling.cpp @@ -32,7 +32,7 @@ function(build_library target_name) target_link_libraries(${target_name} ${LOG_LIB} android) - target_compile_options(${target_name} PRIVATE -DLM_GGML_USE_K_QUANTS -pthread) + target_compile_options(${target_name} PRIVATE -pthread) if (${target_name} STREQUAL "rnllama_v8fp16_va") target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod) diff --git a/cpp/build-info.h b/cpp/build-info.h deleted file mode 100644 index e36ff30d..00000000 --- a/cpp/build-info.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER 1429 -#define BUILD_COMMIT "34b2a5e" -#define BUILD_COMPILER "" -#define BUILD_TARGET "unknown" - -#endif // BUILD_INFO_H diff --git a/cpp/common.cpp b/cpp/common.cpp index 131673f7..252af3ff 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -1,5 +1,4 @@ #include "common.h" -#include "build-info.h" #include "llama.h" #include @@ -91,6 +90,19 @@ void process_escapes(std::string& input) { case '\'': input[output_idx++] = '\''; break; case '\"': input[output_idx++] = '\"'; break; case '\\': input[output_idx++] = '\\'; break; + case 'x': + // Handle \x12, etc + if (input_idx + 2 < input_len) { + const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; + char *err_p = nullptr; + const long val = std::strtol(x, &err_p, 16); + if (err_p == x + 2) { + input_idx += 2; + input[output_idx++] = char(val); + break; + } + } + // fall through default: input[output_idx++] = '\\'; input[output_idx++] = input[input_idx]; break; } @@ -103,9 +115,24 @@ void process_escapes(std::string& input) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool result = true; + try { + if (!gpt_params_parse_ex(argc, argv, params)) { + gpt_print_usage(argc, argv, gpt_params()); + exit(0); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + gpt_print_usage(argc, argv, gpt_params()); + exit(1); + } + return result; +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; - gpt_params default_params; const std::string arg_prefix = "--"; llama_sampling_params & sparams = params.sparams; @@ -204,12 +231,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -218,12 +285,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } sparams.top_p = std::stof(argv[i]); + } else if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.min_p = std::stof(argv[i]); } else if (arg == "--temp") { if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); } else if (arg == "--tfs") { if (++i >= argc) { invalid_param = true; @@ -342,6 +416,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_sequences = std::stoi(argv[i]); + } else if (arg == "--p-accept" || arg == "-pa") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_accept = std::stof(argv[i]); + } else if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_split = std::stof(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -547,11 +633,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, default_params); -#ifndef LOG_DISABLE_LOGS - log_print_usage(); -#endif // LOG_DISABLE_LOGS - exit(0); + return false; + } else if (arg == "--random-prompt") { params.random_prompt = true; } else if (arg == "--in-prefix-bos") { @@ -610,22 +693,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: unknown argument: " + arg); } } if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { - fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n"); - gpt_print_usage(argc, argv, default_params); - exit(1); + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } if (params.escape) { @@ -644,6 +722,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); @@ -678,6 +757,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); + printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); @@ -700,9 +780,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -716,6 +803,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); + printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); @@ -743,7 +832,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // LM_GGML_USE_CUBLAS #endif printf(" --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); + printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); @@ -754,6 +843,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); printf("\n"); +#ifndef LOG_DISABLE_LOGS + log_print_usage(); +#endif // LOG_DISABLE_LOGS } std::string get_system_info(const gpt_params & params) { @@ -807,17 +899,23 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; - cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; - cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; - cparams.rope_freq_base = params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale; + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_scaling_type = params.rope_scaling_type; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } @@ -888,7 +986,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); - llama_kv_cache_tokens_rm(lctx, -1, -1); + llama_kv_cache_clear(lctx); llama_reset_timings(lctx); } @@ -1127,8 +1225,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const llama_sampling_params & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); - fprintf(stream, "build_number: %d\n", BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); @@ -1274,6 +1370,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); } diff --git a/cpp/common.h b/cpp/common.h index 84523a4f..dd6b002e 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -9,6 +9,7 @@ #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" +#include #include #include #include @@ -25,35 +26,51 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info +extern int LLAMA_BUILD_NUMBER; +extern char const *LLAMA_COMMIT; +extern char const *LLAMA_COMPILER; +extern char const *LLAMA_BUILD_TARGET; + // // CLI argument parsing // int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed + uint32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 16; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 16; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_accept = 0.5f; // speculative decoding accept probability + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment + // pinging @cebtenzzre // // sampling parameters struct llama_sampling_params sparams; @@ -77,7 +94,7 @@ struct gpt_params { int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS @@ -110,6 +127,8 @@ struct gpt_params { std::string image = ""; // path to an image file }; +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); + bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index 23278bff..2dbd6b51 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -378,9 +378,13 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) { } } -static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view) { +static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) { assert(view->view_src != NULL && view->view_src->data != NULL); - view->backend = view->view_src->backend; + + if (update_backend) { + view->backend = view->view_src->backend; + } + view->buffer = view->view_src->buffer; view->data = (char *)view->view_src->data + view->view_offs; @@ -394,7 +398,7 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (lm_ggml_is_view(node)) { - init_view(alloc, node); + init_view(alloc, node, true); } else { // see if we can reuse a parent's buffer (inplace) if (lm_ggml_op_can_inplace(node->op)) { @@ -424,15 +428,14 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->view_src = view_src; view_src_hn->n_views += 1; - init_view(alloc, node); + init_view(alloc, node, false); return; } - } - else { + } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->view_src = parent; p_hn->n_views += 1; - init_view(alloc, node); + init_view(alloc, node, false); return; } } @@ -463,7 +466,7 @@ size_t lm_ggml_allocr_alloc_graph_n( hash_get(ht, view_src)->n_views += 1; if (node->buffer == NULL && node->data != NULL) { // view of a pre-allocated tensor, didn't call init_view() yet - init_view(alloc, node); + init_view(alloc, node, true); } } @@ -474,7 +477,7 @@ size_t lm_ggml_allocr_alloc_graph_n( } hash_get(ht, parent)->n_children += 1; if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(alloc, parent); + init_view(alloc, parent, true); } } } diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h new file mode 100644 index 00000000..07881b71 --- /dev/null +++ b/cpp/ggml-impl.h @@ -0,0 +1,237 @@ +#pragma once + +#include "ggml.h" + +// GGML internal header + +#include +#include +#include +#include // memcpy +#include // fabsf + +#ifdef __cplusplus +extern "C" { +#endif + +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. +// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#ifndef __SSE3__ +#define __SSE3__ +#endif +#endif + +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#define LM_GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define LM_GGML_COMPUTE_FP32_TO_FP16(x) (x) + +#define LM_GGML_FP16_TO_FP32(x) ((float) (x)) +#define LM_GGML_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define LM_GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define LM_GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define LM_GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define LM_GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x) +#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x) +#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) { + register double d; + register lm_ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x) +#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in lm_ggml_init() +extern float lm_ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32, +// so we define LM_GGML_FP16_TO_FP32 and LM_GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(LM_GGML_FP16_TO_FP32) || !defined(LM_GGML_FP32_TO_FP16) + +inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return lm_ggml_table_f32_f16[s]; +} + +#define LM_GGML_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x) +#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x) + +#endif + + // TODO: backend v2 PR + +#ifdef __cplusplus +} +#endif diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index f4b46056..7c35f23a 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -184,36 +184,73 @@ kernel void kernel_soft_max( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) { + float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { lmax = MAX(lmax, psrc0[i00]); } - const float max = simd_max(lmax); + + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float lsum = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { const float exp_psrc0 = exp(psrc0[i00] - max); lsum += exp_psrc0; // Remember the result of exp here. exp is expensive, so we really do not - // whish to compute it twice. + // wish to compute it twice. pdst[i00] = exp_psrc0; } - const float sum = simd_sum(lsum); + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; + + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { pdst[i00] /= sum; } } @@ -224,37 +261,73 @@ kernel void kernel_soft_max_4( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { lmax4 = fmax(lmax4, psrc4[i00]); } - float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - const float max = simd_max(lmax); + const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float4 lsum4 = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { const float4 exp_psrc4 = exp(psrc4[i00] - max); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } - float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - const float sum = simd_sum(lsum); + const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { pdst4[i00] /= sum; } } @@ -274,7 +347,7 @@ kernel void kernel_diag_mask_inf( dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; } else { dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; - } + } } kernel void kernel_diag_mask_inf_8( @@ -988,6 +1061,45 @@ kernel void kernel_alibi_f32( } } +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + thread float * cos_theta, thread float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + *cos_theta = cos(theta) * mscale; + *sin_theta = sin(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + typedef void (rope_t)( device const void * src0, device const int32_t * src1, @@ -1011,8 +1123,13 @@ typedef void (rope_t)( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]); @@ -1041,8 +1158,13 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -1052,19 +1174,22 @@ kernel void kernel_rope( const bool is_neox = mode & 2; + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + device const int32_t * pos = src1; const int64_t p = pos[i2]; - const float theta_0 = freq_scale * (float)p; + const float theta_0 = (float)p; const float inv_ndims = -1.f/n_dims; if (!is_neox) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { const float theta = theta_0 * pow(freq_base, inv_ndims*i0); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -1079,9 +1204,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index c858811c..4c68f3d3 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -210,6 +210,10 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; + if (sourcePath == nil) { + LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + sourcePath = @"ggml-metal.metal"; + } LM_GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; if (error) { @@ -234,14 +238,17 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, // load kernels { NSError * error = nil; -#define LM_GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + + /* LM_GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ + */ +#define LM_GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ if (error) { \ - LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -994,11 +1001,15 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_OP_SOFT_MAX: { - const int nth = MIN(32, ne00); + int nth = 32; // SIMD width if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; } else { + do { + nth *= 2; + } while (nth <= ne00 && nth <= 1024); + nth /= 2; [encoder setComputePipelineState:ctx->pipeline_soft_max]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1006,8 +1017,9 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case LM_GGML_OP_DIAG_MASK_INF: { @@ -1336,7 +1348,7 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0]; const int64_t nrows = lm_ggml_nrows(src0); @@ -1388,14 +1400,19 @@ void lm_ggml_metal_graph_compute( const int nth = MIN(1024, ne00); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - float freq_base; - float freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); switch (src0->type) { case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; @@ -1403,30 +1420,35 @@ void lm_ggml_metal_graph_compute( default: LM_GGML_ASSERT(false); }; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/cpp/k_quants.c b/cpp/ggml-quants.c similarity index 69% rename from cpp/k_quants.c rename to cpp/ggml-quants.c index c37eb222..abc8dbb9 100644 --- a/cpp/k_quants.c +++ b/cpp/ggml-quants.c @@ -1,9 +1,10 @@ -#include "k_quants.h" -#include "ggml.h" +#include "ggml-quants.h" +#include "ggml-impl.h" #include #include #include +#include #ifdef __ARM_NEON @@ -65,1251 +66,3480 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) -// -// 2-6 bit quantization in super-blocks -// +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); +} -// -// ===================== Helper functions -// -static inline int nearest_int(float fval) { - assert(fval <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < 1e-30f) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +#if defined(__AVX2__) || defined(__AVX512F__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = _mm256_set_epi64x( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) + +#if !defined(__aarch64__) + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +#endif +#endif + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// reference implementation for deterministic creation of model files +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - return 0.f; - } - float iscale = -nmax / max; - if (rmse_type == 0) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } - return 1/iscale; - } - bool return_early = false; - if (rmse_type < 0) { - rmse_type = -rmse_type; - return_early = true; - } - int weight_type = rmse_type%2; - float sumlx = 0; - float suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - float w = weight_type == 1 ? x[i] * x[i] : 1; - sumlx += w*x[i]*l; - suml2 += w*l*l; } - float scale = sumlx/suml2; - if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; - float best = scale * sumlx; - for (int is = -9; is <= 9; ++is) { - if (is == 0) { - continue; +} + +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; } - iscale = -(nmax + 0.1f*is) / max; - sumlx = suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - float w = weight_type == 1 ? x[i] * x[i] : 1; - sumlx += w*x[i]*l; - suml2 += w*l*l; + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + y[i].m = LM_GGML_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } - if (suml2 > 0 && sumlx*sumlx > best*suml2) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } +} + +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; } - scale = sumlx/suml2; best = scale*sumlx; } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); } - return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + y[i].m = LM_GGML_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); } - if (!amax) { // all zero - for (int i = 0; i < n; ++i) { L[i] = 0; } - return 0.f; +} + +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } } - float iscale = -nmax / max; - if (do_rmse) { - float sumlx = 0; - float suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l; - float w = x[i]*x[i]; - sumlx += w*x[i]*l; - suml2 += w*l*l; +} + +void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = x[i]*x[i]; - float slx = sumlx - w*x[i]*L[i]; - if (slx > 0) { - float sl2 = suml2 - w*L[i]*L[i]; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != L[i]) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = new_l; sumlx = slx; suml2 = sl2; - ++n_changed; - } - } - } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = LM_GGML_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_0); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = LM_GGML_FP32_TO_FP16(d); + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + } +#else + LM_GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_1); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = sum*d; + } +#else + LM_GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const float m = LM_GGML_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const float m = LM_GGML_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + +// +// 2-6 bit quantization in super-blocks +// + +// +// ===================== Helper functions +// +static inline int nearest_int(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (amax < 1e-30f) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.f; + } + float iscale = -nmax / max; + if (rmse_type == 0) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + return 1/iscale; + } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } + int weight_type = rmse_type%2; + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + float w = weight_type == 1 ? x[i] * x[i] : 1; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + float scale = sumlx/suml2; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) { + continue; + } + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + float w = weight_type == 1 ? x[i] * x[i] : 1; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + +static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (!amax) { // all zero + for (int i = 0; i < n; ++i) { L[i] = 0; } + return 0.f; + } + float iscale = -nmax / max; + if (do_rmse) { + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l; + float w = x[i]*x[i]; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + for (int itry = 0; itry < 5; ++itry) { + int n_changed = 0; + for (int i = 0; i < n; ++i) { + float w = x[i]*x[i]; + float slx = sumlx - w*x[i]*L[i]; + if (slx > 0) { + float sl2 = suml2 - w*L[i]*L[i]; + int new_l = nearest_int(x[i] * sl2 / slx); + new_l = MAX(-nmax, MIN(nmax-1, new_l)); + if (new_l != L[i]) { + slx += w*x[i]*new_l; + sl2 += w*new_l*new_l; + if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { + L[i] = new_l; sumlx = slx; suml2 = sl2; + ++n_changed; + } + } + } + } + if (!n_changed) { + break; + } + } + for (int i = 0; i < n; ++i) { + L[i] += nmax; + } + return sumlx / suml2; + } + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + } + return 1/iscale; +} + +static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + int ntry, float alpha) { + float min = x[0]; + float max = x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + } + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = 0; + return 0.f; + } + if (min > 0) min = 0; + float iscale = nmax/(max - min); + float scale = 1/iscale; + for (int itry = 0; itry < ntry; ++itry) { + float sumlx = 0; int suml2 = 0; + bool did_change = false; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + if (l != L[i]) { + L[i] = l; + did_change = true; + } + sumlx += (x[i] - min)*l; + suml2 += l*l; + } + scale = sumlx/suml2; + float sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] - scale*L[i]; + } + min = alpha*min + (1 - alpha)*sum/n; + if (min > 0) min = 0; + iscale = 1/scale; + if (!did_change) break; + } + *the_min = -min; + return scale; +} + +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { + if (j < 4) { + *d = q[j] & 63; *m = q[j + 4] & 63; + } else { + *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +//========================- 2-bit (de)-quantization + +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint8_t L[QK_K]; + uint8_t Laux[16]; + float weights[16]; + float mins[QK_K/16]; + float scales[QK_K/16]; + + const float q4scale = 15.f; + + for (int i = 0; i < nb; i++) { + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + if (max_scale > 0) { + float iscale = q4scale/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = l; + } + y[i].d = LM_GGML_FP32_TO_FP16(max_scale/q4scale); + } else { + for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; + y[i].d = LM_GGML_FP32_TO_FP16(0.f); + } + if (max_min > 0) { + float iscale = q4scale/max_min; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*mins[j]); + y[i].scales[j] |= (l << 4); + } + y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/q4scale); + } else { + y[i].dmin = LM_GGML_FP32_TO_FP16(0.f); + } + for (int j = 0; j < QK_K/16; ++j) { + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); + if (!d) continue; + const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int((x[16*j + ii] + dm)/d); + l = MAX(0, MIN(3, l)); + L[16*j + ii] = l; + } + } + +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + + } +} + +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const float min = LM_GGML_FP16_TO_FP32(x[i].dmin); + + const uint8_t * q = x[i].qs; + +#if QK_K == 256 + int is = 0; + float dl, ml; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + uint8_t sc = x[i].scales[is++]; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + + sc = x[i].scales[is++]; + dl = d * (sc & 0xF); ml = min * (sc >> 4); + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + + shift += 2; + } + q += 32; + } +#else + float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); + float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); + float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); + float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); + for (int l = 0; l < 16; ++l) { + y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; + y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; + y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; + y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; + } + y += QK_K; +#endif + } +} + +void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q2_K_reference(x, vy, k); +} + +size_t lm_ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + quantize_row_q2_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q2_K)); +} + +//========================= 3-bit (de)-quantization + +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float amax = 0; + for (int j = 0; j < QK_K/16; ++j) { + scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); + float scale = fabsf(scales[j]); + if (scale > amax) { + amax = scale; max_scale = scales[j]; + } + } + +#if QK_K == 256 + memset(y[i].scales, 0, 12); + if (max_scale) { + float iscale = -32.f/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int8_t l = nearest_int(iscale*scales[j]); + l = MAX(-32, MIN(31, l)) + 32; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j-8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j%4 + 8] |= (l << (2*(j/4))); + } + y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); + } else { + y[i].d = LM_GGML_FP32_TO_FP16(0.f); + } + + int8_t sc; + for (int j = 0; j < QK_K/16; ++j) { + sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } +#else + if (max_scale) { + float iscale = -8.f/max_scale; + for (int j = 0; j < QK_K/16; j+=2) { + int l1 = nearest_int(iscale*scales[j]); + l1 = 8 + MAX(-8, MIN(7, l1)); + int l2 = nearest_int(iscale*scales[j+1]); + l2 = 8 + MAX(-8, MIN(7, l2)); + y[i].scales[j/2] = l1 | (l2 << 4); + } + y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); + } else { + for (int j = 0; j < QK_K/16; j+=2) { + y[i].scales[j/2] = 0; + } + y[i].d = LM_GGML_FP32_TO_FP16(0.f); + } + for (int j = 0; j < QK_K/16; ++j) { + int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; + float d = LM_GGML_FP16_TO_FP32(y[i].d) * (s - 8); + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } +#endif + + memset(y[i].hmask, 0, QK_K/8); + // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K/8) { + m = 0; hm <<= 1; + } + } +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + } +} + +#if QK_K == 256 +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + uint32_t aux[4]; + const int8_t * scales = (const int8_t*)aux; + + for (int i = 0; i < nb; i++) { + + const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + uint8_t m = 1; + + memcpy(aux, x[i].scales, 12); + uint32_t tmp = aux[2]; + aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + int is = 0; + float dl; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)); + } + + dl = d_all * (scales[is++] - 32); + for (int l = 0; l < 16; ++l) { + *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)); + } + + shift += 2; + m <<= 1; + } + q += 32; + } + + } +} +#else +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + assert(QK_K == 64); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict q = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + + const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); + const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); + const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); + const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + + for (int l=0; l<8; ++l) { + uint8_t h = hm[l]; + y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); + y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); + y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); + y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); + y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); + y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); + y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); + y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); + } + y += QK_K; + } +} +#endif + +void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q3_K_reference(x, vy, k); +} + +size_t lm_ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + quantize_row_q3_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q3_K)); +} + +// ====================== 4-bit (de)-quantization + +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + +#if QK_K == 256 + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + } + } +#else + const float s_factor = 15.f; + float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; + float inv_min = max_min > 0 ? s_factor/max_min : 0.f; + int d1 = nearest_int(inv_scale*scales[0]); + int m1 = nearest_int(inv_min*mins[0]); + int d2 = nearest_int(inv_scale*scales[1]); + int m2 = nearest_int(inv_min*mins[1]); + y[i].scales[0] = d1 | (m1 << 4); + y[i].scales[1] = d2 | (m2 << 4); + y[i].d[0] = LM_GGML_FP32_TO_FP16(max_scale/s_factor); + y[i].d[1] = LM_GGML_FP32_TO_FP16(max_min/s_factor); + + float sumlx = 0; + int suml2 = 0; + for (int j = 0; j < QK_K/32; ++j) { + const uint8_t sd = y[i].scales[j] & 0xF; + const uint8_t sm = y[i].scales[j] >> 4; + const float d = LM_GGML_FP16_TO_FP32(y[i].d[0]) * sd; + if (!d) continue; + const float m = LM_GGML_FP16_TO_FP32(y[i].d[1]) * sm; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + m)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + sumlx += (x[32*j + ii] + m)*l*sd; + suml2 += l*l*sd*sd; + } + } + if (suml2) { + y[i].d[0] = LM_GGML_FP32_TO_FP16(sumlx/suml2); + } +#endif + uint8_t * q = y[i].qs; + for (int j = 0; j < QK_K; j += 64) { + for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); + q += 32; + } + + x += QK_K; + + } +} + +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const uint8_t * q = x[i].qs; + +#if QK_K == 256 + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const float min = LM_GGML_FP16_TO_FP32(x[i].dmin); + + int is = 0; + uint8_t sc, m; + for (int j = 0; j < QK_K; j += 64) { + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; + for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; + q += 32; is += 2; + } +#else + const float dall = LM_GGML_FP16_TO_FP32(x[i].d[0]); + const float mall = LM_GGML_FP16_TO_FP32(x[i].d[1]); + const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); + const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); + for (int l = 0; l < 32; ++l) { + y[l+ 0] = d1 * (q[l] & 0xF) - m1; + y[l+32] = d2 * (q[l] >> 4) - m2; + } + y += QK_K; +#endif + + } +} + +void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q4_K * restrict y = vy; + quantize_row_q4_K_reference(x, y, k); +} + +size_t lm_ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + quantize_row_q4_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q4_K)); +} + +// ====================== 5-bit (de)-quantization + +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + +#if QK_K == 256 + uint8_t L[QK_K]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; +#else + int8_t L[QK_K]; + float scales[QK_K/16]; +#endif + + for (int i = 0; i < nb; i++) { + +#if QK_K == 256 + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + L[32*j + ii] = l; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; + } + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; + } + ql[j] = l1 | (l2 << 4); + } + m1 <<= 2; m2 <<= 2; + ql += 32; + } +#else + float max_scale = 0, amax = 0; + for (int j = 0; j < QK_K/16; ++j) { + scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); + float abs_scale = fabsf(scales[j]); + if (abs_scale > amax) { + amax = abs_scale; + max_scale = scales[j]; + } + } + + float iscale = -128.f/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = MAX(-128, MIN(127, l)); + } + y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); + + for (int j = 0; j < QK_K/16; ++j) { + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) continue; + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-16, MIN(15, l)); + L[16*j + ii] = l + 16; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + for (int j = 0; j < 32; ++j) { + int jm = j%8; + int is = j/8; + int l1 = L[j]; + if (l1 > 15) { + l1 -= 16; qh[jm] |= (1 << is); + } + int l2 = L[j + 32]; + if (l2 > 15) { + l2 -= 16; qh[jm] |= (1 << (4 + is)); + } + ql[j] = l1 | (l2 << 4); + } +#endif + + x += QK_K; + + } +} + +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const uint8_t * ql = x[i].qs; + const uint8_t * qh = x[i].qh; + +#if QK_K == 256 + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + const float min = LM_GGML_FP16_TO_FP32(x[i].dmin); + + int is = 0; + uint8_t sc, m; + uint8_t u1 = 1, u2 = 2; + for (int j = 0; j < QK_K; j += 64) { + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = d * sc; const float m1 = min * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = d * sc; const float m2 = min * m; + for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; + for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; + ql += 32; is += 2; + u1 <<= 2; u2 <<= 2; + } +#else + float d = LM_GGML_FP16_TO_FP32(x[i].d); + const int8_t * restrict s = x[i].scales; + for (int l = 0; l < 8; ++l) { + y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); + y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); + y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); + y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); + y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); + y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); + y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); + y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); + } + y += QK_K; +#endif + } +} + +void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q5_K * restrict y = vy; + quantize_row_q5_K_reference(x, y, k); +} + +size_t lm_ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + quantize_row_q5_K_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_q5_K)); +} + +// ====================== 6-bit (de)-quantization + +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1); + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (!max_abs_scale) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = LM_GGML_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = LM_GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; } - if (!n_changed) { - break; + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; } } - for (int i = 0; i < n; ++i) { - L[i] += nmax; - } - return sumlx / suml2; - } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - } - return 1/iscale; -} -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, - int ntry, float alpha) { - float min = x[0]; - float max = x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - } - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = 0; - return 0.f; - } - if (min > 0) min = 0; - float iscale = nmax/(max - min); - float scale = 1/iscale; - for (int itry = 0; itry < ntry; ++itry) { - float sumlx = 0; int suml2 = 0; - bool did_change = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - if (l != L[i]) { - L[i] = l; - did_change = true; + uint8_t * restrict ql = y[i].ql; + uint8_t * restrict qh = y[i].qh; +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); } - sumlx += (x[i] - min)*l; - suml2 += l*l; + ql += 64; + qh += 32; } - scale = sumlx/suml2; - float sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] - scale*L[i]; +#else + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[l + 0] & 0xF; + const uint8_t q2 = L[l + 32] & 0xF; + ql[l] = q1 | (q2 << 4); } - min = alpha*min + (1 - alpha)*sum/n; - if (min > 0) min = 0; - iscale = 1/scale; - if (!did_change) break; + for (int l = 0; l < 16; ++l) { + qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); + } +#endif + + x += QK_K; + } - *the_min = -min; - return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, - float rmin, float rdelta, int nstep, bool use_mad) { - float min = x[0]; - float max = x[0]; - float sum_w = weights[0]; - float sum_x = sum_w * x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - float w = weights[i]; - sum_w += w; - sum_x += w * x[i]; - } - if (min > 0) min = 0; - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = -min; - return 0.f; - } - float iscale = nmax/(max - min); - float scale = 1/iscale; - float best_mad = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - L[i] = MAX(0, MIN(nmax, l)); - float diff = scale * L[i] + min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - best_mad += w * diff; - } - if (nstep < 1) { - *the_min = -min; - return scale; - } - for (int is = 0; is <= nstep; ++is) { - iscale = (rmin + rdelta*is + nmax)/(max - min); - float sum_l = 0, sum_l2 = 0, sum_xl = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - Laux[i] = l; - float w = weights[i]; - sum_l += w*l; - sum_l2 += w*l*l; - sum_xl += w*l*x[i]; - } - float D = sum_w * sum_l2 - sum_l * sum_l; - if (D > 0) { - float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; - float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; - if (this_min > 0) { - this_min = 0; - this_scale = sum_xl / sum_l2; - } - float mad = 0; - for (int i = 0; i < n; ++i) { - float diff = this_scale * Laux[i] + this_min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - mad += w * diff; - } - if (mad < best_mad) { - for (int i = 0; i < n; ++i) { - L[i] = Laux[i]; - } - best_mad = mad; - scale = this_scale; - min = this_min; +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict ql = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict sc = x[i].scales; + +#if QK_K == 256 + for (int n = 0; n < QK_K; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l + 0] = d * sc[is + 0] * q1; + y[l + 32] = d * sc[is + 2] * q2; + y[l + 64] = d * sc[is + 4] * q3; + y[l + 96] = d * sc[is + 6] * q4; } + y += 128; + ql += 64; + qh += 32; + sc += 8; + } +#else + for (int l = 0; l < 16; ++l) { + const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l+ 0] = d * sc[0] * q1; + y[l+16] = d * sc[1] * q2; + y[l+32] = d * sc[2] * q3; + y[l+48] = d * sc[3] * q4; } + y += 64; +#endif + } - *the_min = -min; - return scale; } -#if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { - if (j < 4) { - *d = q[j] & 63; *m = q[j + 4] & 63; - } else { - *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); +void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_q6_K * restrict y = vy; + quantize_row_q6_K_reference(x, y, k); +} + +size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + quantize_row_q6_K_reference(src + j, y, k); } + return (n/QK_K*sizeof(block_q6_K)); } -#endif -//========================- 2-bit (de)-quantization +//===================================== Q8_K ============================================== -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; - uint8_t L[QK_K]; - uint8_t Laux[16]; - float weights[16]; - float mins[QK_K/16]; - float scales[QK_K/16]; - - const float q4scale = 15.f; - for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - for (int j = 0; j < QK_K/16; ++j) { - for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); - scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } - } - if (max_scale > 0) { - float iscale = q4scale/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*scales[j]); - y[i].scales[j] = l; - } - y[i].d = lm_ggml_fp32_to_fp16(max_scale/q4scale); - } else { - for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; - y[i].d = lm_ggml_fp32_to_fp16(0.f); - } - if (max_min > 0) { - float iscale = q4scale/max_min; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*mins[j]); - y[i].scales[j] |= (l << 4); + float max = 0; + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(x[j]); + if (ax > amax) { + amax = ax; max = x[j]; } - y[i].dmin = lm_ggml_fp32_to_fp16(max_min/q4scale); - } else { - y[i].dmin = lm_ggml_fp32_to_fp16(0.f); + } + if (!amax) { + y[i].d = 0; + memset(y[i].qs, 0, QK_K); + x += QK_K; + continue; + } + const float iscale = -128.f/max; + for (int j = 0; j < QK_K; ++j) { + int v = nearest_int(iscale*x[j]); + y[i].qs[j] = MIN(127, v); } for (int j = 0; j < QK_K/16; ++j) { - const float d = lm_ggml_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF); - if (!d) continue; - const float dm = lm_ggml_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4); + int sum = 0; for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int((x[16*j + ii] + dm)/d); - l = MAX(0, MIN(3, l)); - L[16*j + ii] = l; - } - } - -#if QK_K == 256 - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + sum += y[i].qs[j*16 + ii]; } + y[i].bsums[j] = sum; } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif - + y[i].d = 1/iscale; x += QK_K; - } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK_K; ++j) { + *y++ = x[i].d * x[i].qs[j]; + } + } +} - const float d = lm_ggml_fp16_to_fp32(x[i].d); - const float min = lm_ggml_fp16_to_fp32(x[i].dmin); +void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { + quantize_row_q8_K_reference(x, y, k); +} - const uint8_t * q = x[i].qs; +//===================================== Dot ptoducts ================================= -#if QK_K == 256 - int is = 0; - float dl, ml; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { +// +// Helper functions +// +#if __AVX__ || __AVX2__ || __AVX512F__ - uint8_t sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return _mm_loadu_si128((const __m128i*)k_shuffle + i); +} +#endif - sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; +void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; - shift += 2; - } - q += 32; - } + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); #else - float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); - float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); - float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); - float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); - for (int l = 0; l < 16; ++l) { - y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; - y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; - y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; - y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; - } - y += QK_K; + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); #endif } -} -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_K_reference(x, vy, k); -} + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); -size_t lm_ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; - quantize_row_q2_K_reference(src + j, y, k); + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); } - return (n/QK_K*sizeof(block_q2_K)); -} -//========================= 3-bit (de)-quantization + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - int8_t L[QK_K]; - float scales[QK_K / 16]; + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[0].d) * LM_GGML_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[1].d) * LM_GGML_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( d_2_3, p3 ); + } + + assert(nb % 2 == 0); // TODO: handle odd nb + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[i + 1].d) * LM_GGML_FP16_TO_FP32(y[i + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - float max_scale = 0; - float amax = 0; - for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); - float scale = fabsf(scales[j]); - if (scale > amax) { - amax = scale; max_scale = scales[j]; - } - } + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); -#if QK_K == 256 - memset(y[i].scales, 0, 12); - if (max_scale) { - float iscale = -32.f/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int8_t l = nearest_int(iscale*scales[j]); - l = MAX(-32, MIN(31, l)) + 32; - if (j < 8) { - y[i].scales[j] = l & 0xF; - } else { - y[i].scales[j-8] |= ((l & 0xF) << 4); - } - l >>= 4; - y[i].scales[j%4 + 8] |= (l << (2*(j/4))); - } - y[i].d = lm_ggml_fp32_to_fp16(1/iscale); - } else { - y[i].d = lm_ggml_fp32_to_fp16(0.f); - } + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - int8_t sc; - for (int j = 0; j < QK_K/16; ++j) { - sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; - sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - float d = lm_ggml_fp16_to_fp32(y[i].d) * sc; - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } -#else - if (max_scale) { - float iscale = -8.f/max_scale; - for (int j = 0; j < QK_K/16; j+=2) { - int l1 = nearest_int(iscale*scales[j]); - l1 = 8 + MAX(-8, MIN(7, l1)); - int l2 = nearest_int(iscale*scales[j+1]); - l2 = 8 + MAX(-8, MIN(7, l2)); - y[i].scales[j/2] = l1 | (l2 << 4); - } - y[i].d = lm_ggml_fp32_to_fp16(1/iscale); - } else { - for (int j = 0; j < QK_K/16; j+=2) { - y[i].scales[j/2] = 0; - } - y[i].d = lm_ggml_fp32_to_fp16(0.f); - } - for (int j = 0; j < QK_K/16; ++j) { - int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; - float d = lm_ggml_fp16_to_fp32(y[i].d) * (s - 8); - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } -#endif + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + // subtract offset + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d); + } - memset(y[i].hmask, 0, QK_K/8); - // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. - int m = 0; - uint8_t hm = 1; - for (int j = 0; j < QK_K; ++j) { - if (L[j] > 3) { - y[i].hmask[m] |= hm; - L[j] -= 4; - } - if (++m == QK_K/8) { - m = 0; hm <<= 1; - } - } -#if QK_K == 256 - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); - } - } + *s = sumf; #else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } -#endif - x += QK_K; + sumf += sumi*LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d); } + + *s = sumf; +#endif } -#if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; +void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; + assert(n % qk == 0); - uint32_t aux[4]; - const int8_t * scales = (const int8_t*)aux; + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; - for (int i = 0; i < nb; i++) { + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); - const float d_all = lm_ggml_fp16_to_fp32(x[i].d); + float summs = 0; - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - uint8_t m = 1; + assert(nb % 2 == 0); // TODO: handle odd nb - memcpy(aux, x[i].scales, 12); - uint32_t tmp = aux[2]; - aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); - aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); - aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); - aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; - int is = 0; - float dl; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { + summs += LM_GGML_FP16_TO_FP32(x0->m) * y0->s + LM_GGML_FP16_TO_FP32(x1->m) * y1->s; - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4)); - } + const uint8x16_t m4b = vdupq_n_u8(0x0F); - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4)); - } + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); - shift += 2; - m <<= 1; - } - q += 32; - } + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif } -} + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = LM_GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i].d; + + summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - assert(QK_K == 64); - const int nb = k / QK_K; + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - const float d_all = lm_ggml_fp16_to_fp32(x[i].d); + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); - const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); - const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); - const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - for (int l=0; l<8; ++l) { - uint8_t h = hm[l]; - y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); - y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); - y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); - y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); - y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); - y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); - y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); - y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - y += QK_K; + + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; } -} -#endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_K_reference(x, vy, k); + *s = sumf; +#endif } -size_t lm_ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms +void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; - for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; - quantize_row_q3_K_reference(src + j, y, k); + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); +#endif } - return (n/QK_K*sizeof(block_q3_K)); -} -// ====================== 4-bit (de)-quantization + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + uint32_t qh; + uint64_t tmp[4]; - uint8_t L[QK_K]; - uint8_t Laux[32]; - float weights[32]; - float mins[QK_K/32]; - float scales[QK_K/32]; + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(LM_GGML_FP16_TO_FP32(x0->d) * LM_GGML_FP16_TO_FP32(y0->d)))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + // Main loop for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } - } + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); -#if QK_K == 256 - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; - for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); - if (j < 4) { - y[i].scales[j] = ls; - y[i].scales[j+4] = lm; - } else { - y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); - y[i].scales[j-4] |= ((ls >> 4) << 6); - y[i].scales[j-0] |= ((lm >> 4) << 6); - } - } - y[i].d = lm_ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = lm_ggml_fp32_to_fp16(max_min/63.f); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = lm_ggml_fp16_to_fp32(y[i].d) * sc; - if (!d) continue; - const float dm = lm_ggml_fp16_to_fp32(y[i].dmin) * m; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(15, l)); - L[32*j + ii] = l; - } - } -#else - const float s_factor = 15.f; - float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; - float inv_min = max_min > 0 ? s_factor/max_min : 0.f; - int d1 = nearest_int(inv_scale*scales[0]); - int m1 = nearest_int(inv_min*mins[0]); - int d2 = nearest_int(inv_scale*scales[1]); - int m2 = nearest_int(inv_min*mins[1]); - y[i].scales[0] = d1 | (m1 << 4); - y[i].scales[1] = d2 | (m2 << 4); - y[i].d[0] = lm_ggml_fp32_to_fp16(max_scale/s_factor); - y[i].d[1] = lm_ggml_fp32_to_fp16(max_min/s_factor); + const __m256 q = mul_sum_i8_pairs_float(bx, by); - float sumlx = 0; - int suml2 = 0; - for (int j = 0; j < QK_K/32; ++j) { - const uint8_t sd = y[i].scales[j] & 0xF; - const uint8_t sm = y[i].scales[j] >> 4; - const float d = lm_ggml_fp16_to_fp32(y[i].d[0]) * sd; - if (!d) continue; - const float m = lm_ggml_fp16_to_fp32(y[i].d[1]) * sm; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + m)/d); - l = MAX(0, MIN(15, l)); - L[32*j + ii] = l; - sumlx += (x[32*j + ii] + m)*l*sd; - suml2 += l*l*sd*sd; - } - } - if (suml2) { - y[i].d[0] = lm_ggml_fp32_to_fp16(sumlx/suml2); - } -#endif - uint8_t * q = y[i].qs; - for (int j = 0; j < QK_K; j += 64) { - for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); - q += 32; - } + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } - x += QK_K; + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; - } -} + uint32_t qh; -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // These tempory registers are for masking and shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); + + vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); + vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); - const uint8_t * q = x[i].qs; + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); + vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); -#if QK_K == 256 + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); + vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); - const float d = lm_ggml_fp16_to_fp32(x[i].d); - const float min = lm_ggml_fp16_to_fp32(x[i].dmin); + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - int is = 0; - uint8_t sc, m; - for (int j = 0; j < QK_K; j += 64) { - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - const float d1 = d * sc; const float m1 = min * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - const float d2 = d * sc; const float m2 = min * m; - for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; - q += 32; is += 2; - } + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else - const float dall = lm_ggml_fp16_to_fp32(x[i].d[0]); - const float mall = lm_ggml_fp16_to_fp32(x[i].d[1]); - const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); - const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); - for (int l = 0; l < 32; ++l) { - y[l+ 0] = d1 * (q[l] & 0xF) - m1; - y[l+32] = d2 * (q[l] >> 4) - m2; + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - y += QK_K; -#endif + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi; } -} -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { - assert(k % QK_K == 0); - block_q4_K * restrict y = vy; - quantize_row_q4_K_reference(x, y, k); + *s = sumf; +#endif } -size_t lm_ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms +void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; - for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; - quantize_row_q4_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q4_K)); -} + assert(n % qk == 0); + assert(qk == QK5_1); -// ====================== 5-bit (de)-quantization + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); -#if QK_K == 256 - uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; - float weights[32]; - uint8_t Laux[32]; + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += LM_GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += LM_GGML_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); #else - int8_t L[QK_K]; - float scales[QK_K/16]; + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); #endif + } - for (int i = 0; i < nb; i++) { + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); -#if QK_K == 256 + float summs = 0.0f; - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } - } + uint32_t qh; + uint64_t tmp[4]; - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; - for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); - if (j < 4) { - y[i].scales[j] = ls; - y[i].scales[j+4] = lm; - } else { - y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); - y[i].scales[j-4] |= ((ls >> 4) << 6); - y[i].scales[j-0] |= ((lm >> 4) << 6); - } - } - y[i].d = lm_ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = lm_ggml_fp32_to_fp16(max_min/63.f); + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = lm_ggml_fp16_to_fp32(y[i].d) * sc; - if (!d) continue; - const float dm = lm_ggml_fp16_to_fp32(y[i].dmin) * m; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(31, l)); - L[32*j + ii] = l; - } - } + summs += LM_GGML_FP16_TO_FP32(x0->m) * y0->s; - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); + const v128_t m4b = wasm_i8x16_splat(0x0F); - uint8_t m1 = 1, m2 = 2; - for (int n = 0; n < QK_K; n += 64) { - for (int j = 0; j < 32; ++j) { - int l1 = L[n + j]; - if (l1 > 15) { - l1 -= 16; qh[j] |= m1; - } - int l2 = L[n + j + 32]; - if (l2 > 15) { - l2 -= 16; qh[j] |= m2; - } - ql[j] = l1 | (l2 << 4); - } - m1 <<= 2; m2 <<= 2; - ql += 32; - } -#else - float max_scale = 0, amax = 0; - for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); - float abs_scale = fabsf(scales[j]); - if (abs_scale > amax) { - amax = abs_scale; - max_scale = scales[j]; - } - } + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); - float iscale = -128.f/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*scales[j]); - y[i].scales[j] = MAX(-128, MIN(127, l)); - } - y[i].d = lm_ggml_fp32_to_fp16(1/iscale); + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; - for (int j = 0; j < QK_K/16; ++j) { - const float d = lm_ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; - if (!d) continue; - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-16, MIN(15, l)); - L[16*j + ii] = l + 16; - } - } + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); + const v128_t v0 = wasm_v128_load(x0->qs); - for (int j = 0; j < 32; ++j) { - int jm = j%8; - int is = j/8; - int l1 = L[j]; - if (l1 > 15) { - l1 -= 16; qh[jm] |= (1 << is); - } - int l2 = L[j + 32]; - if (l2 > 15) { - l2 -= 16; qh[jm] |= (1 << (4 + is)); - } - ql[j] = l1 | (l2 << 4); - } -#endif + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); - x += QK_K; + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(LM_GGML_FP16_TO_FP32(x0->d) * y0->d))); } -} -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + float summs = 0.0f; + + // Main loop for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d)); - const uint8_t * ql = x[i].qs; - const uint8_t * qh = x[i].qh; + summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; -#if QK_K == 256 + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); - const float d = lm_ggml_fp16_to_fp32(x[i].d); - const float min = lm_ggml_fp16_to_fp32(x[i].dmin); + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - int is = 0; - uint8_t sc, m; - uint8_t u1 = 1, u2 = 2; - for (int j = 0; j < QK_K; j += 64) { - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - const float d1 = d * sc; const float m1 = min * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - const float d2 = d * sc; const float m2 = min * m; - for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; - ql += 32; is += 2; - u1 <<= 2; u2 <<= 2; - } -#else - float d = lm_ggml_fp16_to_fp32(x[i].d); - const int8_t * restrict s = x[i].scales; - for (int l = 0; l < 8; ++l) { - y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); - y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); - y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); - y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); - y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); - y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); - y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); - y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); - } - y += QK_K; -#endif + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); } -} -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { - assert(k % QK_K == 0); - block_q5_K * restrict y = vy; - quantize_row_q5_K_reference(x, y, k); -} + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d)); + + summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); -size_t lm_ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms + const __m256 q = mul_sum_us8_pairs_float(bx, by); - for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; - quantize_row_q5_K_reference(src + j, y, k); + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); } - return (n/QK_K*sizeof(block_q5_K)); -} -// ====================== 6-bit (de)-quantization + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + uint32_t qh; - int8_t L[QK_K]; - float scales[QK_K/16]; + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // temporary registers for shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); - float max_scale = 0; - float max_abs_scale = 0; + // load qh + vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); - for (int ib = 0; ib < QK_K/16; ++ib) { + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); - const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1); - scales[ib] = scale; + // ((qh >> (j + 12)) ) & 0x10; + vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); - const float abs_scale = fabsf(scale); - if (abs_scale > max_abs_scale) { - max_abs_scale = abs_scale; - max_scale = scale; - } + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - } + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); - if (!max_abs_scale) { - memset(&y[i], 0, sizeof(block_q6_K)); - y[i].d = lm_ggml_fp32_to_fp16(0.f); - x += QK_K; - continue; - } + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - float iscale = -128.f/max_scale; - y[i].d = lm_ggml_fp32_to_fp16(1/iscale); - for (int ib = 0; ib < QK_K/16; ++ib) { - y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); - } + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - for (int j = 0; j < QK_K/16; ++j) { - float d = lm_ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-32, MIN(31, l)); - L[16*j + ii] = l + 32; - } - } + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; -#if QK_K == 256 - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - const uint8_t q1 = L[j + l + 0] & 0xF; - const uint8_t q2 = L[j + l + 32] & 0xF; - const uint8_t q3 = L[j + l + 64] & 0xF; - const uint8_t q4 = L[j + l + 96] & 0xF; - ql[l+ 0] = q1 | (q3 << 4); - ql[l+32] = q2 | (q4 << 4); - qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); - } - ql += 64; - qh += 32; - } -#else - for (int l = 0; l < 32; ++l) { - const uint8_t q1 = L[l + 0] & 0xF; - const uint8_t q2 = L[l + 32] & 0xF; - ql[l] = q1 | (q2 << 4); - } - for (int l = 0; l < 16; ++l) { - qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); - } -#endif + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); - x += QK_K; + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; } -} -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + *s = sumf; +#else + // scalar + float sumf = 0.0; for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = lm_ggml_fp16_to_fp32(x[i].d); + int sumi = 0; - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; -#if QK_K == 256 - for (int n = 0; n < QK_K; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l + 0] = d * sc[is + 0] * q1; - y[l + 32] = d * sc[is + 2] * q2; - y[l + 64] = d * sc[is + 4] * q3; - y[l + 96] = d * sc[is + 6] * q4; - } - y += 128; - ql += 64; - qh += 32; - sc += 8; - } -#else - for (int l = 0; l < 16; ++l) { - const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l+ 0] = d * sc[0] * q1; - y[l+16] = d * sc[1] * q2; - y[l+32] = d * sc[2] * q3; - y[l+48] = d * sc[3] * q4; + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - y += 64; -#endif + sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; } -} -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { - assert(k % QK_K == 0); - block_q6_K * restrict y = vy; - quantize_row_q6_K_reference(x, y, k); + *s = sumf; +#endif } -size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms +void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; - for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; - quantize_row_q6_K_reference(src + j, y, k); + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); +#endif } - return (n/QK_K*sizeof(block_q6_K)); -} -//===================================== Q8_K ============================================== + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - for (int i = 0; i < nb; i++) { + const __m256 q = mul_sum_i8_pairs_float(bx, by); - float max = 0; - float amax = 0; - for (int j = 0; j < QK_K; ++j) { - float ax = fabsf(x[j]); - if (ax > amax) { - amax = ax; max = x[j]; - } - } - if (!amax) { - y[i].d = 0; - memset(y[i].qs, 0, QK_K); - x += QK_K; - continue; - } - const float iscale = -128.f/max; - for (int j = 0; j < QK_K; ++j) { - int v = nearest_int(iscale*x[j]); - y[i].qs[j] = MIN(127, v); - } - for (int j = 0; j < QK_K/16; ++j) { - int sum = 0; - for (int ii = 0; ii < 16; ++ii) { - sum += y[i].qs[j*16 + ii]; - } - y[i].bsums[j] = sum; - } - y[i].d = 1/iscale; - x += QK_K; + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif } -} -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK_K; ++j) { - *y++ = x[i].d * x[i].qs[j]; - } + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)); } -} -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_K_reference(x, y, k); -} + *s = sumf; +#else + // scalar + float sumf = 0.0; -//===================================== Dot ptoducts ================================= + for (int i = 0; i < nb; i++) { + int sumi = 0; -// -// Helper functions -// -#if __AVX__ || __AVX2__ || __AVX512F__ + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} + sumf += sumi*(LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)); + } -// shuffles to pick the required scales in dot products -static inline __m256i get_scale_shuffle_q3k(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} -static inline __m256i get_scale_shuffle_k4(int i) { - static const uint8_t k_shuffle[256] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} -static inline __m128i get_scale_shuffle(int i) { - static const uint8_t k_shuffle[128] = { - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, - 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, - 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 - }; - return _mm_loadu_si128((const __m128i*)k_shuffle + i); -} + *s = sumf; #endif +} #if QK_K == 256 void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -1334,8 +3564,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1413,8 +3643,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1480,8 +3710,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float dall = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1588,8 +3818,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); size_t vl = 16; @@ -1675,8 +3905,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1793,8 +4023,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1845,8 +4075,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1960,8 +4190,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); isum[0] = isum[1] = isum[2] = isum[3] = 0; for (int l = 0; l < 16; ++l) { @@ -2014,7 +4244,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const uint8_t * restrict qh = x[i].hmask; @@ -2122,7 +4352,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2227,7 +4457,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2448,7 +4678,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d*sum_t; @@ -2513,7 +4743,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2615,7 +4845,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2686,7 +4916,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2871,7 +5101,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res q8 += 8; a += 8; for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2911,8 +5141,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2994,8 +5224,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -3060,8 +5290,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3143,8 +5373,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res size_t vl = 8; - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -3254,9 +5484,9 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = lm_ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -3358,8 +5588,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = lm_ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = lm_ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = LM_GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -3404,8 +5634,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = lm_ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = lm_ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = LM_GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -3461,8 +5691,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * lm_ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d[0]); + sumf -= y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]); size_t vl = 32; @@ -3511,9 +5741,9 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * lm_ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + sumf -= y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d[0]); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]); for (int j = 0; j < QK_K/32; ++j) { for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; @@ -3561,8 +5791,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -3650,8 +5880,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int8_t * restrict q8 = y[i].qs; #if QK_K == 256 - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -3732,8 +5962,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * lm_ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3837,8 +6067,8 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; - const float dmin = lm_ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -3960,9 +6190,9 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = lm_ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = LM_GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -4060,7 +6290,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -4106,7 +6336,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -4243,7 +6473,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); } - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const int8_t * restrict sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { @@ -4286,7 +6516,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d_all = lm_ggml_fp16_to_fp32(x[i].d); + const float d_all = LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4418,7 +6648,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4498,7 +6728,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4610,7 +6840,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4727,7 +6957,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -4825,7 +7055,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4882,7 +7112,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int i = 0; i < nb; ++i) { - const float d = y[i].d * lm_ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -5041,7 +7271,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = lm_ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/cpp/k_quants.h b/cpp/ggml-quants.h similarity index 63% rename from cpp/k_quants.h rename to cpp/ggml-quants.h index 59cd1d57..73e7a206 100644 --- a/cpp/k_quants.h +++ b/cpp/ggml-quants.h @@ -1,11 +1,63 @@ #pragma once -#include "ggml.h" +#include "ggml-impl.h" + +// GGML internal header #include -#include #include +#define QK4_0 32 +typedef struct { + lm_ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + lm_ggml_fp16_t d; // delta + lm_ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(lm_ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + lm_ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + lm_ggml_fp16_t d; // delta + lm_ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(lm_ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + lm_ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +// +// Super-block quantization structures +// + // Super-block size #ifdef LM_GGML_QKK_64 #define QK_K 64 @@ -15,18 +67,6 @@ #define K_SCALE_SIZE 12 #endif -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - -// -// Super-block quantization structures -// - // 2-bit quantization // weight is represented as x = a * q + b // 16 blocks of 16 elements each @@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); + void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); @@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); + void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); @@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); // Dequantization +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); +//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); + void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); @@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); // Dot product +void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -// Quantization with histogram collection -size_t lm_ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t lm_ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t lm_ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t lm_ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - diff --git a/cpp/ggml.c b/cpp/ggml.c index 525ae1c8..c0d6ff31 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -1,10 +1,8 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC -#include "ggml.h" - -#ifdef LM_GGML_USE_K_QUANTS -#include "k_quants.h" -#endif +#include "ggml-impl.h" +#include "ggml-quants.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -30,18 +28,6 @@ #include #endif -// static_assert should be a #define, but if it's not, -// fall back to the _Static_assert C11 keyword. -// if C99 - static_assert is noop -// ref: https://stackoverflow.com/a/53923785/4039976 -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -109,23 +95,11 @@ typedef void * thread_ret_t; #include #endif + #ifdef LM_GGML_USE_CPU_HBM #include #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#ifndef __SSE3__ -#define __SSE3__ -#endif -#endif - /*#define LM_GGML_PERF*/ #define LM_GGML_DEBUG 0 #define LM_GGML_GELU_FP16 @@ -251,228 +225,27 @@ inline static void * lm_ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - // floating point type used to accumulate sums typedef double lm_ggml_float; -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#define LM_GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define LM_GGML_COMPUTE_FP32_TO_FP16(x) (x) - -#define LM_GGML_FP16_TO_FP32(x) ((float) (x)) -#define LM_GGML_FP32_TO_FP16(x) (x) - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define LM_GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define LM_GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define LM_GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define LM_GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x) -#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x) -#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) { - register double d; - register lm_ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x) -#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // __ARM_NEON - // // global data // // precomputed gelu table for f16 (128 KB) -static lm_ggml_fp16_t table_gelu_f16[1 << 16]; +static lm_ggml_fp16_t lm_ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) -static lm_ggml_fp16_t table_gelu_quick_f16[1 << 16]; +static lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16]; // precomputed silu table for f16 (128 KB) -static lm_ggml_fp16_t table_silu_f16[1 << 16]; +static lm_ggml_fp16_t lm_ggml_table_silu_f16[1 << 16]; // precomputed exp table for f16 (128 KB) -static lm_ggml_fp16_t table_exp_f16[1 << 16]; - -// precomputed f32 table for f16 (256 KB) -static float table_f32_f16[1 << 16]; - -#if defined(__ARM_NEON) || defined(__wasm_simd128__) -#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s -#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) -#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) -#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) -#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) -#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) -#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) -#define B8(c,s ) B7(c,s, c), B7(c,s, s) - -// precomputed tables for expanding 8bits to 8 bytes: -static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 -static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 -#endif - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32, -// so we define LM_GGML_FP16_TO_FP32 and LM_GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(LM_GGML_FP16_TO_FP32) || !defined(LM_GGML_FP32_TO_FP16) - -inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return table_f32_f16[s]; -} - -#define LM_GGML_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x) -#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x) +static lm_ggml_fp16_t lm_ggml_table_exp_f16[1 << 16]; -#endif +// precomputed f32 table for f16 (256 KB) (ggml-impl.h) +float lm_ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API @@ -587,3071 +360,816 @@ int64_t lm_ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -// -// quantization -// +static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y); -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); -} - -#if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if defined(__AVX2__) || defined(__AVX512F__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if __AVXVNNI__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -#endif -} +static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { + [LM_GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [LM_GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [LM_GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [LM_GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f32, + .vec_dot_type = LM_GGML_TYPE_F32, + }, + [LM_GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(lm_ggml_fp16_t), + .is_quantized = false, + .to_float = (lm_ggml_to_float_t) lm_ggml_fp16_to_fp32_row, + .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row, + .from_float_reference = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row, + .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f16, + .vec_dot_type = LM_GGML_TYPE_F16, + }, + [LM_GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = lm_ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = LM_GGML_TYPE_Q8_0, + }, + [LM_GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = lm_ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = LM_GGML_TYPE_Q8_1, + }, + [4] = { // LM_GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = LM_GGML_TYPE_COUNT, + }, + [5] = { // LM_GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = LM_GGML_TYPE_COUNT, + }, + [LM_GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = lm_ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = LM_GGML_TYPE_Q8_0, + }, + [LM_GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = lm_ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = LM_GGML_TYPE_Q8_1, + }, + [LM_GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = lm_ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = LM_GGML_TYPE_Q8_0, + }, + [LM_GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_1_reference, + .vec_dot_type = LM_GGML_TYPE_Q8_1, + }, + [LM_GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = lm_ggml_vec_dot_q2_K_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = lm_ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = lm_ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = lm_ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (lm_ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = lm_ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + } +}; -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { -#if __AVXVNNIINT8__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - return mul_sum_us8_pairs_float(ax, sy); -#endif +// For internal test use +lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { + LM_GGML_ASSERT(type < LM_GGML_TYPE_COUNT); + return type_traits[type]; } -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh -#if __AVX512F__ - const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 - bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh - return _mm256_cvtepi16_epi8(bytes); // abcd_efgh -#else - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return _mm_packus_epi16( r0, r1 ); -#endif -} -#elif defined(__AVX__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); - const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); - __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); - __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); - const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytesl = _mm_or_si128(bytesl, bit_mask); - bytesh = _mm_or_si128(bytesh, bit_mask); - bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); - bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); - return MM256_SET_M128I(bytesh, bytesl); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); - __m128i tmph = _mm_srli_epi16(tmpl, 4); - const __m128i lowMask = _mm_set1_epi8(0xF); - tmpl = _mm_and_si128(lowMask, tmpl); - tmph = _mm_and_si128(lowMask, tmph); - return MM256_SET_M128I(tmph, tmpl); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { - const __m128i ones = _mm_set1_epi16(1); - const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); - const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); - const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - const __m128i axl = _mm256_castsi256_si128(ax); - const __m128i axh = _mm256_extractf128_si256(ax, 1); - const __m128i syl = _mm256_castsi256_si128(sy); - const __m128i syh = _mm256_extractf128_si256(sy, 1); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m128i xl = _mm256_castsi256_si128(x); - const __m128i xh = _mm256_extractf128_si256(x, 1); - const __m128i yl = _mm256_castsi256_si128(y); - const __m128i yh = _mm256_extractf128_si256(y, 1); - // Get absolute values of x vectors - const __m128i axl = _mm_sign_epi8(xl, xl); - const __m128i axh = _mm_sign_epi8(xh, xh); - // Sign the values of the y vectors - const __m128i syl = _mm_sign_epi8(yl, xl); - const __m128i syh = _mm_sign_epi8(yh, xh); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} -#endif -#elif defined(__SSSE3__) -// horizontally add 4x4 floats -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =_mm_hadd_ps(a, b); - __m128 res_1 =_mm_hadd_ps(c, d); - __m128 res =_mm_hadd_ps(res_0, res_1); - res =_mm_hadd_ps(res, res); - res =_mm_hadd_ps(res, res); - - return _mm_cvtss_f32(res); -} -#endif // __AVX__ || __AVX2__ || __AVX512F__ -#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) - -#if defined(__ARM_NEON) - -#if !defined(__aarch64__) - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -#endif -#endif - -#define QK4_0 32 -typedef struct { - lm_ggml_fp16_t d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); - -#define QK4_1 32 -typedef struct { - lm_ggml_fp16_t d; // delta - lm_ggml_fp16_t m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(lm_ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); - -#define QK5_0 32 -typedef struct { - lm_ggml_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); - -#define QK5_1 32 -typedef struct { - lm_ggml_fp16_t d; // delta - lm_ggml_fp16_t m; // min - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(lm_ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); - -#define QK8_0 32 -typedef struct { - lm_ggml_fp16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); - -#define QK8_1 32 -typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants -} block_q8_1; -static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); - -// reference implementation for deterministic creation of model files -static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); -} - -static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { - const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - y[i].m = LM_GGML_FP32_TO_FP16(min); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); -} - -static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(qh)); - } -} - -static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); -} - -static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { - const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 5) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - y[i].m = LM_GGML_FP32_TO_FP16(min); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); - } -} - -static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = x[i*QK8_0 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = x[i*QK8_0 + j]*id; - - y[i].qs[j] = roundf(x0); - } - } -} - -static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - } - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = LM_GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_0); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = LM_GGML_FP32_TO_FP16(d); - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - } -#else - // scalar - quantize_row_q8_0_reference(x, y, k); -#endif -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { - assert(QK8_1 == 32); - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_1; j++) { - const float v = x[i*QK8_1 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - int sum = 0; - - for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + j]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); - - sum += y[i].qs[ j]; - sum += y[i].qs[QK8_1/2 + j]; - } - - y[i].s = sum*d; - } -} - -static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - int32x4_t accv = vdupq_n_s32(0); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - - accv = vaddq_s32(accv, vi); - } - - y[i].s = d * vaddvq_s32(accv); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - v128_t accv = wasm_i32x4_splat(0); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - - accv = wasm_i32x4_add(accv, vi); - } - - y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3)); - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_1); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - - // compute sum for y[i].s - vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); - vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); - - // set y[i].s - int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = sum*d; - } -#else - // scalar - quantize_row_q8_1_reference(x, y, k); -#endif -} - -static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { - static const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); - const float m = LM_GGML_FP16_TO_FP32(x[i].m); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F); - const int x1 = (x[i].qs[j] >> 4); - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { - static const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); - const float m = LM_GGML_FP16_TO_FP32(x[i].m); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int x0 = (x[i].qs[j] & 0x0F) | xh_0; - const int x1 = (x[i].qs[j] >> 4) | xh_1; - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { - static const int qk = QK8_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - const block_q8_0 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = LM_GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; - } - } -} - -static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y); -static void lm_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { - [LM_GGML_TYPE_I8] = { - .type_name = "i8", - .blck_size = 1, - .type_size = sizeof(int8_t), - .is_quantized = false, - }, - [LM_GGML_TYPE_I16] = { - .type_name = "i16", - .blck_size = 1, - .type_size = sizeof(int16_t), - .is_quantized = false, - }, - [LM_GGML_TYPE_I32] = { - .type_name = "i32", - .blck_size = 1, - .type_size = sizeof(int32_t), - .is_quantized = false, - }, - [LM_GGML_TYPE_F32] = { - .type_name = "f32", - .blck_size = 1, - .type_size = sizeof(float), - .is_quantized = false, - .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f32, - .vec_dot_type = LM_GGML_TYPE_F32, - }, - [LM_GGML_TYPE_F16] = { - .type_name = "f16", - .blck_size = 1, - .type_size = sizeof(lm_ggml_fp16_t), - .is_quantized = false, - .to_float = (lm_ggml_to_float_t) lm_ggml_fp16_to_fp32_row, - .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row, - .from_float_reference = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row, - .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f16, - .vec_dot_type = LM_GGML_TYPE_F16, - }, - [LM_GGML_TYPE_Q4_0] = { - .type_name = "q4_0", - .blck_size = QK4_0, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q4_0, - .from_float = quantize_row_q4_0, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_0_reference, - .vec_dot = lm_ggml_vec_dot_q4_0_q8_0, - .vec_dot_type = LM_GGML_TYPE_Q8_0, - }, - [LM_GGML_TYPE_Q4_1] = { - .type_name = "q4_1", - .blck_size = QK4_1, - .type_size = sizeof(block_q4_1), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q4_1, - .from_float = quantize_row_q4_1, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_1_reference, - .vec_dot = lm_ggml_vec_dot_q4_1_q8_1, - .vec_dot_type = LM_GGML_TYPE_Q8_1, - }, - [LM_GGML_TYPE_Q5_0] = { - .type_name = "q5_0", - .blck_size = QK5_0, - .type_size = sizeof(block_q5_0), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q5_0, - .from_float = quantize_row_q5_0, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_0_reference, - .vec_dot = lm_ggml_vec_dot_q5_0_q8_0, - .vec_dot_type = LM_GGML_TYPE_Q8_0, - }, - [LM_GGML_TYPE_Q5_1] = { - .type_name = "q5_1", - .blck_size = QK5_1, - .type_size = sizeof(block_q5_1), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q5_1, - .from_float = quantize_row_q5_1, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_1_reference, - .vec_dot = lm_ggml_vec_dot_q5_1_q8_1, - .vec_dot_type = LM_GGML_TYPE_Q8_1, - }, - [LM_GGML_TYPE_Q8_0] = { - .type_name = "q8_0", - .blck_size = QK8_0, - .type_size = sizeof(block_q8_0), - .is_quantized = true, - .to_float = dequantize_row_q8_0, - .from_float = quantize_row_q8_0, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_0_reference, - .vec_dot = lm_ggml_vec_dot_q8_0_q8_0, - .vec_dot_type = LM_GGML_TYPE_Q8_0, - }, - [LM_GGML_TYPE_Q8_1] = { - .type_name = "q8_1", - .blck_size = QK8_1, - .type_size = sizeof(block_q8_1), - .is_quantized = true, - .from_float = quantize_row_q8_1, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q8_1_reference, - .vec_dot_type = LM_GGML_TYPE_Q8_1, - }, -#ifdef LM_GGML_USE_K_QUANTS - [LM_GGML_TYPE_Q2_K] = { - .type_name = "q2_K", - .blck_size = QK_K, - .type_size = sizeof(block_q2_K), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q2_K, - .from_float = quantize_row_q2_K, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q2_K_reference, - .vec_dot = lm_ggml_vec_dot_q2_K_q8_K, - .vec_dot_type = LM_GGML_TYPE_Q8_K, - }, - [LM_GGML_TYPE_Q3_K] = { - .type_name = "q3_K", - .blck_size = QK_K, - .type_size = sizeof(block_q3_K), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q3_K, - .from_float = quantize_row_q3_K, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q3_K_reference, - .vec_dot = lm_ggml_vec_dot_q3_K_q8_K, - .vec_dot_type = LM_GGML_TYPE_Q8_K, - }, - [LM_GGML_TYPE_Q4_K] = { - .type_name = "q4_K", - .blck_size = QK_K, - .type_size = sizeof(block_q4_K), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q4_K, - .from_float = quantize_row_q4_K, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q4_K_reference, - .vec_dot = lm_ggml_vec_dot_q4_K_q8_K, - .vec_dot_type = LM_GGML_TYPE_Q8_K, - }, - [LM_GGML_TYPE_Q5_K] = { - .type_name = "q5_K", - .blck_size = QK_K, - .type_size = sizeof(block_q5_K), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q5_K, - .from_float = quantize_row_q5_K, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q5_K_reference, - .vec_dot = lm_ggml_vec_dot_q5_K_q8_K, - .vec_dot_type = LM_GGML_TYPE_Q8_K, - }, - [LM_GGML_TYPE_Q6_K] = { - .type_name = "q6_K", - .blck_size = QK_K, - .type_size = sizeof(block_q6_K), - .is_quantized = true, - .to_float = (lm_ggml_to_float_t) dequantize_row_q6_K, - .from_float = quantize_row_q6_K, - .from_float_reference = (lm_ggml_from_float_t) quantize_row_q6_K_reference, - .vec_dot = lm_ggml_vec_dot_q6_K_q8_K, - .vec_dot_type = LM_GGML_TYPE_Q8_K, - }, - [LM_GGML_TYPE_Q8_K] = { - .type_name = "q8_K", - .blck_size = QK_K, - .type_size = sizeof(block_q8_K), - .is_quantized = true, - .from_float = quantize_row_q8_K, - } -#endif -}; - -// For internal test use -lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { - LM_GGML_ASSERT(type < LM_GGML_TYPE_COUNT); - return type_traits[type]; -} - -// -// simd mappings -// - -// we define a common set of C macros which map to specific intrinsics based on the current architecture -// we then implement the fundamental computation operations below using only these macros -// adding support for new architectures requires to define the corresponding SIMD macros -// -// LM_GGML_F32_STEP / LM_GGML_F16_STEP -// number of elements to process in a single step -// -// LM_GGML_F32_EPR / LM_GGML_F16_EPR -// number of elements to fit in a single register -// - -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) - -#define LM_GGML_SIMD - -// F32 NEON - -#define LM_GGML_F32_STEP 16 -#define LM_GGML_F32_EPR 4 - -#define LM_GGML_F32x4 float32x4_t -#define LM_GGML_F32x4_ZERO vdupq_n_f32(0.0f) -#define LM_GGML_F32x4_SET1(x) vdupq_n_f32(x) -#define LM_GGML_F32x4_LOAD vld1q_f32 -#define LM_GGML_F32x4_STORE vst1q_f32 -#define LM_GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) -#define LM_GGML_F32x4_ADD vaddq_f32 -#define LM_GGML_F32x4_MUL vmulq_f32 -#define LM_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define LM_GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = LM_GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = LM_GGML_F32x4_REDUCE_ONE(x[0]); \ -} - -#define LM_GGML_F32_VEC LM_GGML_F32x4 -#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO -#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 -#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD -#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE -#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD -#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL -#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - -// F16 NEON - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define LM_GGML_F16_STEP 32 - #define LM_GGML_F16_EPR 8 - - #define LM_GGML_F16x8 float16x8_t - #define LM_GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define LM_GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define LM_GGML_F16x8_LOAD vld1q_f16 - #define LM_GGML_F16x8_STORE vst1q_f16 - #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define LM_GGML_F16x8_ADD vaddq_f16 - #define LM_GGML_F16x8_MUL vmulq_f16 - #define LM_GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = LM_GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } while (0) - - #define LM_GGML_F16_VEC LM_GGML_F16x8 - #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x8_ZERO - #define LM_GGML_F16_VEC_SET1 LM_GGML_F16x8_SET1 - #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x8_LOAD(p) - #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE(p, r[i]) - #define LM_GGML_F16_VEC_FMA LM_GGML_F16x8_FMA - #define LM_GGML_F16_VEC_ADD LM_GGML_F16x8_ADD - #define LM_GGML_F16_VEC_MUL LM_GGML_F16x8_MUL - #define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x8_REDUCE -#else - // if FP16 vector arithmetic is not supported, we use FP32 instead - // and take advantage of the vcvt_ functions to convert to/from FP16 - - #define LM_GGML_F16_STEP 16 - #define LM_GGML_F16_EPR 4 - - #define LM_GGML_F32Cx4 float32x4_t - #define LM_GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define LM_GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define LM_GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) - #define LM_GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define LM_GGML_F32Cx4_ADD vaddq_f32 - #define LM_GGML_F32Cx4_MUL vmulq_f32 - #define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE - - #define LM_GGML_F16_VEC LM_GGML_F32Cx4 - #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO - #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1 - #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p) - #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i]) - #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA - #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD - #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL - #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE -#endif - -#elif defined(__AVX__) - -#define LM_GGML_SIMD - -// F32 AVX - -#define LM_GGML_F32_STEP 32 -#define LM_GGML_F32_EPR 8 - -#define LM_GGML_F32x8 __m256 -#define LM_GGML_F32x8_ZERO _mm256_setzero_ps() -#define LM_GGML_F32x8_SET1(x) _mm256_set1_ps(x) -#define LM_GGML_F32x8_LOAD _mm256_loadu_ps -#define LM_GGML_F32x8_STORE _mm256_storeu_ps -#if defined(__FMA__) - #define LM_GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) -#else - #define LM_GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) -#endif -#define LM_GGML_F32x8_ADD _mm256_add_ps -#define LM_GGML_F32x8_MUL _mm256_mul_ps -#define LM_GGML_F32x8_REDUCE(res, x) \ -do { \ - int offset = LM_GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ - _mm256_extractf128_ps(x[0], 1)); \ - const __m128 t1 = _mm_hadd_ps(t0, t0); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} while (0) -// TODO: is this optimal ? - -#define LM_GGML_F32_VEC LM_GGML_F32x8 -#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x8_ZERO -#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x8_SET1 -#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x8_LOAD -#define LM_GGML_F32_VEC_STORE LM_GGML_F32x8_STORE -#define LM_GGML_F32_VEC_FMA LM_GGML_F32x8_FMA -#define LM_GGML_F32_VEC_ADD LM_GGML_F32x8_ADD -#define LM_GGML_F32_VEC_MUL LM_GGML_F32x8_MUL -#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x8_REDUCE - -// F16 AVX - -#define LM_GGML_F16_STEP 32 -#define LM_GGML_F16_EPR 8 - -// F16 arithmetic is not supported by AVX, so we use F32 instead - -#define LM_GGML_F32Cx8 __m256 -#define LM_GGML_F32Cx8_ZERO _mm256_setzero_ps() -#define LM_GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) - -#if defined(__F16C__) -// the _mm256_cvt intrinsics require F16C -#define LM_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) -#define LM_GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) -#else -static inline __m256 __avx_f32cx8_load(lm_ggml_fp16_t *x) { - float tmp[8]; - - for (int i = 0; i < 8; i++) { - tmp[i] = LM_GGML_FP16_TO_FP32(x[i]); - } - - return _mm256_loadu_ps(tmp); -} -static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) { - float arr[8]; - - _mm256_storeu_ps(arr, y); - - for (int i = 0; i < 8; i++) - x[i] = LM_GGML_FP32_TO_FP16(arr[i]); -} -#define LM_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) -#define LM_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) -#endif - -#define LM_GGML_F32Cx8_FMA LM_GGML_F32x8_FMA -#define LM_GGML_F32Cx8_ADD _mm256_add_ps -#define LM_GGML_F32Cx8_MUL _mm256_mul_ps -#define LM_GGML_F32Cx8_REDUCE LM_GGML_F32x8_REDUCE - -#define LM_GGML_F16_VEC LM_GGML_F32Cx8 -#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx8_ZERO -#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx8_SET1 -#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx8_LOAD(p) -#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx8_STORE(p, r[i]) -#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx8_FMA -#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx8_ADD -#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx8_MUL -#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx8_REDUCE - -#elif defined(__POWER9_VECTOR__) - -#define LM_GGML_SIMD - -// F32 POWER9 - -#define LM_GGML_F32_STEP 32 -#define LM_GGML_F32_EPR 4 - -#define LM_GGML_F32x4 vector float -#define LM_GGML_F32x4_ZERO 0.0f -#define LM_GGML_F32x4_SET1 vec_splats -#define LM_GGML_F32x4_LOAD(p) vec_xl(0, p) -#define LM_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) -#define LM_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) -#define LM_GGML_F32x4_ADD vec_add -#define LM_GGML_F32x4_MUL vec_mul -#define LM_GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = LM_GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - res = vec_extract(x[0], 0) + \ - vec_extract(x[0], 1) + \ - vec_extract(x[0], 2) + \ - vec_extract(x[0], 3); \ -} - -#define LM_GGML_F32_VEC LM_GGML_F32x4 -#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO -#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 -#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD -#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE -#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD -#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL -#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - -// F16 POWER9 -#define LM_GGML_F16_STEP LM_GGML_F32_STEP -#define LM_GGML_F16_EPR LM_GGML_F32_EPR -#define LM_GGML_F16_VEC LM_GGML_F32x4 -#define LM_GGML_F16_VEC_ZERO LM_GGML_F32x4_ZERO -#define LM_GGML_F16_VEC_SET1 LM_GGML_F32x4_SET1 -#define LM_GGML_F16_VEC_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32x4_REDUCE -// Use vec_xl, not vec_ld, in case the load address is not aligned. -#define LM_GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ - vec_extract_fp32_from_shorth(vec_xl(0, p - LM_GGML_F16_EPR)) : \ - vec_extract_fp32_from_shortl(vec_xl(0, p)) -#define LM_GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] -#define LM_GGML_F16_VEC_STORE(p, r, i) \ - if (i & 0x1) \ - vec_xst(vec_pack_to_short_fp32(r[i - LM_GGML_ENDIAN_BYTE(1)], \ - r[i - LM_GGML_ENDIAN_BYTE(0)]), \ - 0, p - LM_GGML_F16_EPR) - -#elif defined(__wasm_simd128__) - -#define LM_GGML_SIMD - -// F32 WASM - -#define LM_GGML_F32_STEP 16 -#define LM_GGML_F32_EPR 4 - -#define LM_GGML_F32x4 v128_t -#define LM_GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) -#define LM_GGML_F32x4_SET1(x) wasm_f32x4_splat(x) -#define LM_GGML_F32x4_LOAD wasm_v128_load -#define LM_GGML_F32x4_STORE wasm_v128_store -#define LM_GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) -#define LM_GGML_F32x4_ADD wasm_f32x4_add -#define LM_GGML_F32x4_MUL wasm_f32x4_mul -#define LM_GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = LM_GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define LM_GGML_F32_VEC LM_GGML_F32x4 -#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO -#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 -#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD -#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE -#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD -#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL -#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - -// F16 WASM - -#define LM_GGML_F16_STEP 16 -#define LM_GGML_F16_EPR 4 - -inline static v128_t __wasm_f16x4_load(const lm_ggml_fp16_t * p) { - float tmp[4]; - - tmp[0] = LM_GGML_FP16_TO_FP32(p[0]); - tmp[1] = LM_GGML_FP16_TO_FP32(p[1]); - tmp[2] = LM_GGML_FP16_TO_FP32(p[2]); - tmp[3] = LM_GGML_FP16_TO_FP32(p[3]); - - return wasm_v128_load(tmp); -} - -inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) { - float tmp[4]; - - wasm_v128_store(tmp, x); - - p[0] = LM_GGML_FP32_TO_FP16(tmp[0]); - p[1] = LM_GGML_FP32_TO_FP16(tmp[1]); - p[2] = LM_GGML_FP32_TO_FP16(tmp[2]); - p[3] = LM_GGML_FP32_TO_FP16(tmp[3]); -} - -#define LM_GGML_F16x4 v128_t -#define LM_GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) -#define LM_GGML_F16x4_SET1(x) wasm_f32x4_splat(x) -#define LM_GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) -#define LM_GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) -#define LM_GGML_F16x4_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F16x4_ADD wasm_f32x4_add -#define LM_GGML_F16x4_MUL wasm_f32x4_mul -#define LM_GGML_F16x4_REDUCE(res, x) \ -{ \ - int offset = LM_GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define LM_GGML_F16_VEC LM_GGML_F16x4 -#define LM_GGML_F16_VEC_ZERO LM_GGML_F16x4_ZERO -#define LM_GGML_F16_VEC_SET1 LM_GGML_F16x4_SET1 -#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x4_LOAD(p) -#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x4_STORE(p, r[i]) -#define LM_GGML_F16_VEC_FMA LM_GGML_F16x4_FMA -#define LM_GGML_F16_VEC_ADD LM_GGML_F16x4_ADD -#define LM_GGML_F16_VEC_MUL LM_GGML_F16x4_MUL -#define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x4_REDUCE - -#elif defined(__SSE3__) - -#define LM_GGML_SIMD - -// F32 SSE - -#define LM_GGML_F32_STEP 32 -#define LM_GGML_F32_EPR 4 - -#define LM_GGML_F32x4 __m128 -#define LM_GGML_F32x4_ZERO _mm_setzero_ps() -#define LM_GGML_F32x4_SET1(x) _mm_set1_ps(x) -#define LM_GGML_F32x4_LOAD _mm_loadu_ps -#define LM_GGML_F32x4_STORE _mm_storeu_ps -#if defined(__FMA__) - // TODO: Does this work? - #define LM_GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) -#else - #define LM_GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) -#endif -#define LM_GGML_F32x4_ADD _mm_add_ps -#define LM_GGML_F32x4_MUL _mm_mul_ps -#define LM_GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = LM_GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ -} -// TODO: is this optimal ? - -#define LM_GGML_F32_VEC LM_GGML_F32x4 -#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO -#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 -#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD -#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE -#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD -#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL -#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - -// F16 SSE - -#define LM_GGML_F16_STEP 32 -#define LM_GGML_F16_EPR 4 - -static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) { - float tmp[4]; - - tmp[0] = LM_GGML_FP16_TO_FP32(x[0]); - tmp[1] = LM_GGML_FP16_TO_FP32(x[1]); - tmp[2] = LM_GGML_FP16_TO_FP32(x[2]); - tmp[3] = LM_GGML_FP16_TO_FP32(x[3]); - - return _mm_loadu_ps(tmp); -} - -static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) { - float arr[4]; - - _mm_storeu_ps(arr, y); - - x[0] = LM_GGML_FP32_TO_FP16(arr[0]); - x[1] = LM_GGML_FP32_TO_FP16(arr[1]); - x[2] = LM_GGML_FP32_TO_FP16(arr[2]); - x[3] = LM_GGML_FP32_TO_FP16(arr[3]); -} - -#define LM_GGML_F32Cx4 __m128 -#define LM_GGML_F32Cx4_ZERO _mm_setzero_ps() -#define LM_GGML_F32Cx4_SET1(x) _mm_set1_ps(x) -#define LM_GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) -#define LM_GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) -#define LM_GGML_F32Cx4_FMA LM_GGML_F32x4_FMA -#define LM_GGML_F32Cx4_ADD _mm_add_ps -#define LM_GGML_F32Cx4_MUL _mm_mul_ps -#define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE - -#define LM_GGML_F16_VEC LM_GGML_F32Cx4 -#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO -#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1 -#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p) -#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i]) -#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA -#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD -#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL -#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE - -#endif - -// LM_GGML_F32_ARR / LM_GGML_F16_ARR -// number of registers to use per step -#ifdef LM_GGML_SIMD -#define LM_GGML_F32_ARR (LM_GGML_F32_STEP/LM_GGML_F32_EPR) -#define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR) -#endif - -// -// fundamental operations -// - -inline static void lm_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void lm_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void lm_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } -inline static void lm_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -inline static void lm_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void lm_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } -inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } -inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } -inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - -static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { -#ifdef LM_GGML_SIMD - float sumf = 0.0f; - const int np = (n & ~(LM_GGML_F32_STEP - 1)); - - LM_GGML_F32_VEC sum[LM_GGML_F32_ARR] = { LM_GGML_F32_VEC_ZERO }; - - LM_GGML_F32_VEC ax[LM_GGML_F32_ARR]; - LM_GGML_F32_VEC ay[LM_GGML_F32_ARR]; - - for (int i = 0; i < np; i += LM_GGML_F32_STEP) { - for (int j = 0; j < LM_GGML_F32_ARR; j++) { - ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR); - ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR); - - sum[j] = LM_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - LM_GGML_F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i]*y[i]; - } -#else - // scalar - lm_ggml_float sumf = 0.0; - for (int i = 0; i < n; ++i) { - sumf += (lm_ggml_float)(x[i]*y[i]); - } -#endif - - *s = sumf; -} - -static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y) { - lm_ggml_float sumf = 0.0; - -#if defined(LM_GGML_SIMD) - const int np = (n & ~(LM_GGML_F16_STEP - 1)); - - LM_GGML_F16_VEC sum[LM_GGML_F16_ARR] = { LM_GGML_F16_VEC_ZERO }; - - LM_GGML_F16_VEC ax[LM_GGML_F16_ARR]; - LM_GGML_F16_VEC ay[LM_GGML_F16_ARR]; - - for (int i = 0; i < np; i += LM_GGML_F16_STEP) { - for (int j = 0; j < LM_GGML_F16_ARR; j++) { - ax[j] = LM_GGML_F16_VEC_LOAD(x + i + j*LM_GGML_F16_EPR, j); - ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j); - - sum[j] = LM_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - LM_GGML_F16_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i])); - } -#else - for (int i = 0; i < n; ++i) { - sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i])); - } -#endif - - *s = sumf; -} - -static void lm_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__SSSE3__) - // set constants - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - // Initialize accumulator with zeros - __m128 acc_0 = _mm_setzero_ps(); - __m128 acc_1 = _mm_setzero_ps(); - __m128 acc_2 = _mm_setzero_ps(); - __m128 acc_3 = _mm_setzero_ps(); - - // First round without accumulation - { - _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[0].d) * LM_GGML_FP16_TO_FP32(y[0].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[1].d) * LM_GGML_FP16_TO_FP32(y[1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - acc_0 = _mm_mul_ps( d_0_1, p0 ); - acc_1 = _mm_mul_ps( d_0_1, p1 ); - acc_2 = _mm_mul_ps( d_2_3, p2 ); - acc_3 = _mm_mul_ps( d_2_3, p3 ); - } - - // Main loop - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 2; i < nb; i+=2) { - _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( LM_GGML_FP16_TO_FP32(x[i + 1].d) * LM_GGML_FP16_TO_FP32(y[i + 1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); - __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); - __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); - __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); - - // Acummulate - acc_0 = _mm_add_ps(p0_d, acc_0); - acc_1 = _mm_add_ps(p1_d, acc_1); - acc_2 = _mm_add_ps(p2_d, acc_2); - acc_3 = _mm_add_ps(p3_d, acc_3); - } - - *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - // subtract offset - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += sumi*LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F) - 8; - const int v1 = (x[i].qs[j] >> 4) - 8; - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += sumi*LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#endif -} - -static void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; - - // TODO: add WASM SIMD -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - summs += LM_GGML_FP16_TO_FP32(x0->m) * y0->s + LM_GGML_FP16_TO_FP32(x1->m) * y1->s; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (int i = 0; i < nb; ++i) { - const float d0 = LM_GGML_FP16_TO_FP32(x[i].d); - const float d1 = y[i].d; - - summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - const __m256 d0v = _mm256_set1_ps( d0 ); - const __m256 d1v = _mm256_set1_ps( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - - const __m256 xy = mul_sum_us8_pairs_float(bx, by); - - // Accumulate d0*d1*x*y -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d0d1, xy, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); -#endif - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F); - const int v1 = (x[i].qs[j] >> 4); - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#endif -} - -static void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_0); - - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - // extract the 5th bit via lookup table ((!b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); - - tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); - const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(LM_GGML_FP16_TO_FP32(x0->d) * LM_GGML_FP16_TO_FP32(y0->d)))); - } - - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - bx = _mm256_or_si256(bx, bxhi); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8((char)0xF0); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_andnot_si128(bxhil, mask); - bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); - - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - uint32_t qh; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); +// +// simd mappings +// - // These tempory registers are for masking and shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); +// we define a common set of C macros which map to specific intrinsics based on the current architecture +// we then implement the fundamental computation operations below using only these macros +// adding support for new architectures requires to define the corresponding SIMD macros +// +// LM_GGML_F32_STEP / LM_GGML_F16_STEP +// number of elements to process in a single step +// +// LM_GGML_F32_EPR / LM_GGML_F16_EPR +// number of elements to fit in a single register +// - vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); - vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); +#define LM_GGML_SIMD - // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); - vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); +// F32 NEON - // ((qh & (1u << (j + 16))) >> (j + 12)); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); - vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); +#define LM_GGML_F32_STEP 16 +#define LM_GGML_F32_EPR 4 - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); +#define LM_GGML_F32x4 float32x4_t +#define LM_GGML_F32x4_ZERO vdupq_n_f32(0.0f) +#define LM_GGML_F32x4_SET1(x) vdupq_n_f32(x) +#define LM_GGML_F32x4_LOAD vld1q_f32 +#define LM_GGML_F32x4_STORE vst1q_f32 +#define LM_GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define LM_GGML_F32x4_ADD vaddq_f32 +#define LM_GGML_F32x4_MUL vmulq_f32 +#define LM_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define LM_GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + res = LM_GGML_F32x4_REDUCE_ONE(x[0]); \ +} - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); +#define LM_GGML_F32_VEC LM_GGML_F32x4 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); +// F16 NEON - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define LM_GGML_F16_STEP 32 + #define LM_GGML_F16_EPR 8 - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + #define LM_GGML_F16x8 float16x8_t + #define LM_GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define LM_GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define LM_GGML_F16x8_LOAD vld1q_f16 + #define LM_GGML_F16x8_STORE vst1q_f16 + #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define LM_GGML_F16x8_ADD vaddq_f16 + #define LM_GGML_F16x8_MUL vmulq_f16 + #define LM_GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = LM_GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ + res = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } while (0) - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + #define LM_GGML_F16_VEC LM_GGML_F16x8 + #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x8_ZERO + #define LM_GGML_F16_VEC_SET1 LM_GGML_F16x8_SET1 + #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x8_LOAD(p) + #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE(p, r[i]) + #define LM_GGML_F16_VEC_FMA LM_GGML_F16x8_FMA + #define LM_GGML_F16_VEC_ADD LM_GGML_F16x8_ADD + #define LM_GGML_F16_VEC_MUL LM_GGML_F16x8_MUL + #define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + #define LM_GGML_F16_STEP 16 + #define LM_GGML_F16_EPR 4 - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + #define LM_GGML_F32Cx4 float32x4_t + #define LM_GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define LM_GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define LM_GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define LM_GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define LM_GGML_F32Cx4_ADD vaddq_f32 + #define LM_GGML_F32Cx4_MUL vmulq_f32 + #define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + #define LM_GGML_F16_VEC LM_GGML_F32Cx4 + #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO + #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1 + #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p) + #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i]) + #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA + #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD + #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL + #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE +#endif - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); +#elif defined(__AVX__) - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); +#define LM_GGML_SIMD - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); +// F32 AVX - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi; - } +#define LM_GGML_F32_STEP 32 +#define LM_GGML_F32_EPR 8 - *s = sumf; +#define LM_GGML_F32x8 __m256 +#define LM_GGML_F32x8_ZERO _mm256_setzero_ps() +#define LM_GGML_F32x8_SET1(x) _mm256_set1_ps(x) +#define LM_GGML_F32x8_LOAD _mm256_loadu_ps +#define LM_GGML_F32x8_STORE _mm256_storeu_ps +#if defined(__FMA__) + #define LM_GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) #else - // scalar - float sumf = 0.0; + #define LM_GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) +#endif +#define LM_GGML_F32x8_ADD _mm256_add_ps +#define LM_GGML_F32x8_MUL _mm256_mul_ps +#define LM_GGML_F32x8_REDUCE(res, x) \ +do { \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ + _mm256_extractf128_ps(x[0], 1)); \ + const __m128 t1 = _mm_hadd_ps(t0, t0); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ +} while (0) +// TODO: is this optimal ? + +#define LM_GGML_F32_VEC LM_GGML_F32x8 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x8_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x8_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x8_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x8_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x8_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x8_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x8_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x8_REDUCE - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); +// F16 AVX - int sumi = 0; +#define LM_GGML_F16_STEP 32 +#define LM_GGML_F16_EPR 8 - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); +// F16 arithmetic is not supported by AVX, so we use F32 instead - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; +#define LM_GGML_F32Cx8 __m256 +#define LM_GGML_F32Cx8_ZERO _mm256_setzero_ps() +#define LM_GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); - } +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C +#define LM_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define LM_GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(lm_ggml_fp16_t *x) { + float tmp[8]; - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi; + for (int i = 0; i < 8; i++) { + tmp[i] = LM_GGML_FP16_TO_FP32(x[i]); } - *s = sumf; -#endif + return _mm256_loadu_ps(tmp); } +static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) { + float arr[8]; -static void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_1); - - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + _mm256_storeu_ps(arr, y); -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - summs0 += LM_GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += LM_GGML_FP16_TO_FP32(x1->m) * y1->s; - - // extract the 5th bit via lookup table ((b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_0[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit - const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); + for (int i = 0; i < 8; i++) + x[i] = LM_GGML_FP32_TO_FP16(arr[i]); +} +#define LM_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define LM_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) #endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - float summs = 0.0f; - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; - - summs += LM_GGML_FP16_TO_FP32(x0->m) * y0->s; +#define LM_GGML_F32Cx8_FMA LM_GGML_F32x8_FMA +#define LM_GGML_F32Cx8_ADD _mm256_add_ps +#define LM_GGML_F32Cx8_MUL _mm256_mul_ps +#define LM_GGML_F32Cx8_REDUCE LM_GGML_F32x8_REDUCE - const v128_t m4b = wasm_i8x16_splat(0x0F); +#define LM_GGML_F16_VEC LM_GGML_F32Cx8 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx8_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx8_SET1 +#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx8_LOAD(p) +#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx8_STORE(p, r[i]) +#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx8_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx8_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx8_MUL +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx8_REDUCE - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); +#elif defined(__POWER9_VECTOR__) - tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh >> 24) ]; +#define LM_GGML_SIMD - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); +// F32 POWER9 - const v128_t v0 = wasm_v128_load(x0->qs); +#define LM_GGML_F32_STEP 32 +#define LM_GGML_F32_EPR 4 - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); +#define LM_GGML_F32x4 vector float +#define LM_GGML_F32x4_ZERO 0.0f +#define LM_GGML_F32x4_SET1 vec_splats +#define LM_GGML_F32x4_LOAD(p) vec_xl(0, p) +#define LM_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define LM_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define LM_GGML_F32x4_ADD vec_add +#define LM_GGML_F32x4_MUL vec_mul +#define LM_GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + res = vec_extract(x[0], 0) + \ + vec_extract(x[0], 1) + \ + vec_extract(x[0], 2) + \ + vec_extract(x[0], 3); \ +} - // add high bit - const v128_t v0lf = wasm_v128_or(v0l, qhl); - const v128_t v0hf = wasm_v128_or(v0h, qhh); +#define LM_GGML_F32_VEC LM_GGML_F32x4 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); +// F16 POWER9 +#define LM_GGML_F16_STEP LM_GGML_F32_STEP +#define LM_GGML_F16_EPR LM_GGML_F32_EPR +#define LM_GGML_F16_VEC LM_GGML_F32x4 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F16_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32x4_REDUCE +// Use vec_xl, not vec_ld, in case the load address is not aligned. +#define LM_GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - LM_GGML_F16_EPR)) : \ + vec_extract_fp32_from_shortl(vec_xl(0, p)) +#define LM_GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define LM_GGML_F16_VEC_STORE(p, r, i) \ + if (i & 0x1) \ + vec_xst(vec_pack_to_short_fp32(r[i - LM_GGML_ENDIAN_BYTE(1)], \ + r[i - LM_GGML_ENDIAN_BYTE(0)]), \ + 0, p - LM_GGML_F16_EPR) - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); +#elif defined(__wasm_simd128__) - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); +#define LM_GGML_SIMD - // dot product - sumv = wasm_f32x4_add(sumv, - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(LM_GGML_FP16_TO_FP32(x0->d) * y0->d))); - } +// F32 WASM - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); +#define LM_GGML_F32_STEP 16 +#define LM_GGML_F32_EPR 4 - float summs = 0.0f; +#define LM_GGML_F32x4 v128_t +#define LM_GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define LM_GGML_F32x4_SET1(x) wasm_f32x4_splat(x) +#define LM_GGML_F32x4_LOAD wasm_v128_load +#define LM_GGML_F32x4_STORE wasm_v128_store +#define LM_GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define LM_GGML_F32x4_ADD wasm_f32x4_add +#define LM_GGML_F32x4_MUL wasm_f32x4_mul +#define LM_GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d)); +#define LM_GGML_F32_VEC LM_GGML_F32x4 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; +// F16 WASM - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - bx = _mm256_or_si256(bx, bxhi); +#define LM_GGML_F16_STEP 16 +#define LM_GGML_F16_EPR 4 - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); +inline static v128_t __wasm_f16x4_load(const lm_ggml_fp16_t * p) { + float tmp[4]; - const __m256 q = mul_sum_us8_pairs_float(bx, by); + tmp[0] = LM_GGML_FP16_TO_FP32(p[0]); + tmp[1] = LM_GGML_FP16_TO_FP32(p[1]); + tmp[2] = LM_GGML_FP16_TO_FP32(p[2]); + tmp[3] = LM_GGML_FP16_TO_FP32(p[3]); - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } + return wasm_v128_load(tmp); +} - *s = hsum_float_8(acc) + summs; -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8(0x10); +inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) { + float tmp[4]; - float summs = 0.0f; + wasm_v128_store(tmp, x); - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d)); + p[0] = LM_GGML_FP32_TO_FP16(tmp[0]); + p[1] = LM_GGML_FP32_TO_FP16(tmp[1]); + p[2] = LM_GGML_FP32_TO_FP16(tmp[2]); + p[3] = LM_GGML_FP32_TO_FP16(tmp[3]); +} - summs += LM_GGML_FP16_TO_FP32(x[i].m) * y[i].s; +#define LM_GGML_F16x4 v128_t +#define LM_GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define LM_GGML_F16x4_SET1(x) wasm_f32x4_splat(x) +#define LM_GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define LM_GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define LM_GGML_F16x4_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F16x4_ADD wasm_f32x4_add +#define LM_GGML_F16x4_MUL wasm_f32x4_mul +#define LM_GGML_F16x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_and_si128(bxhil, mask); - bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); +#define LM_GGML_F16_VEC LM_GGML_F16x4 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F16x4_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F16x4_SET1 +#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x4_LOAD(p) +#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x4_STORE(p, r[i]) +#define LM_GGML_F16_VEC_FMA LM_GGML_F16x4_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F16x4_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F16x4_MUL +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x4_REDUCE - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); +#elif defined(__SSE3__) - const __m256 q = mul_sum_us8_pairs_float(bx, by); +#define LM_GGML_SIMD - acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); - } +// F32 SSE - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; +#define LM_GGML_F32_STEP 32 +#define LM_GGML_F32_EPR 4 - uint32_t qh; +#define LM_GGML_F32x4 __m128 +#define LM_GGML_F32x4_ZERO _mm_setzero_ps() +#define LM_GGML_F32x4_SET1(x) _mm_set1_ps(x) +#define LM_GGML_F32x4_LOAD _mm_loadu_ps +#define LM_GGML_F32x4_STORE _mm_storeu_ps +#if defined(__FMA__) + // TODO: Does this work? + #define LM_GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) +#else + #define LM_GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) +#endif +#define LM_GGML_F32x4_ADD _mm_add_ps +#define LM_GGML_F32x4_MUL _mm_mul_ps +#define LM_GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = LM_GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ +} +// TODO: is this optimal ? - size_t vl = __riscv_vsetvl_e8m1(qk/2); +#define LM_GGML_F32_VEC LM_GGML_F32x4 +#define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO +#define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1 +#define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD +#define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE +#define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD +#define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL +#define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE - // temporary registers for shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); +// F16 SSE - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); +#define LM_GGML_F16_STEP 32 +#define LM_GGML_F16_EPR 4 - // load qh - vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); +static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) { + float tmp[4]; - // ((qh >> (j + 0)) << 4) & 0x10; - vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); + tmp[0] = LM_GGML_FP16_TO_FP32(x[0]); + tmp[1] = LM_GGML_FP16_TO_FP32(x[1]); + tmp[2] = LM_GGML_FP16_TO_FP32(x[2]); + tmp[3] = LM_GGML_FP16_TO_FP32(x[3]); - // ((qh >> (j + 12)) ) & 0x10; - vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); + return _mm_loadu_ps(tmp); +} - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); +static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) { + float arr[4]; - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + _mm_storeu_ps(arr, y); - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + x[0] = LM_GGML_FP32_TO_FP16(arr[0]); + x[1] = LM_GGML_FP32_TO_FP16(arr[1]); + x[2] = LM_GGML_FP32_TO_FP16(arr[2]); + x[3] = LM_GGML_FP32_TO_FP16(arr[3]); +} - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); +#define LM_GGML_F32Cx4 __m128 +#define LM_GGML_F32Cx4_ZERO _mm_setzero_ps() +#define LM_GGML_F32Cx4_SET1(x) _mm_set1_ps(x) +#define LM_GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define LM_GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define LM_GGML_F32Cx4_FMA LM_GGML_F32x4_FMA +#define LM_GGML_F32Cx4_ADD _mm_add_ps +#define LM_GGML_F32Cx4_MUL _mm_mul_ps +#define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); +#define LM_GGML_F16_VEC LM_GGML_F32Cx4 +#define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO +#define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1 +#define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p) +#define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i]) +#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA +#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD +#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL +#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); +#endif - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); +// LM_GGML_F32_ARR / LM_GGML_F16_ARR +// number of registers to use per step +#ifdef LM_GGML_SIMD +#define LM_GGML_F32_ARR (LM_GGML_F32_STEP/LM_GGML_F32_EPR) +#define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR) +#endif - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); +// +// fundamental operations +// - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); +inline static void lm_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); +inline static void lm_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); +inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } +inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - *s = sumf; -#else - // scalar - float sumf = 0.0; +inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void lm_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void lm_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void lm_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void lm_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void lm_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void lm_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void lm_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); +static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#ifdef LM_GGML_SIMD + float sumf = 0.0f; + const int np = (n & ~(LM_GGML_F32_STEP - 1)); - int sumi = 0; + LM_GGML_F32_VEC sum[LM_GGML_F32_ARR] = { LM_GGML_F32_VEC_ZERO }; - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + LM_GGML_F32_VEC ax[LM_GGML_F32_ARR]; + LM_GGML_F32_VEC ay[LM_GGML_F32_ARR]; - const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; - const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + for (int i = 0; i < np; i += LM_GGML_F32_STEP) { + for (int j = 0; j < LM_GGML_F32_ARR; j++) { + ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR); + ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR); - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + sum[j] = LM_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); } - - sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + LM_GGML_FP16_TO_FP32(x[i].m)*y[i].s; } - *s = sumf; -#endif -} - -static void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - LM_GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); + // reduce sum0..sum3 to sum0 + LM_GGML_F32_VEC_REDUCE(sumf, sum); + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } #else - const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); - const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); - const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); - const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); - - const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); - const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); - const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); - const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); - - const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); - const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); - const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); - const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif + // scalar + lm_ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (lm_ggml_float)(x[i]*y[i]); } +#endif - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(x[i].d) * LM_GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + *s = sumf; +} - const __m256 q = mul_sum_i8_pairs_float(bx, by); +static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y) { + lm_ggml_float sumf = 0.0; - // Multiply q with scale and accumulate -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d, q, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); -#endif - } +#if defined(LM_GGML_SIMD) + const int np = (n & ~(LM_GGML_F16_STEP - 1)); - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - size_t vl = __riscv_vsetvl_e8m1(qk); + LM_GGML_F16_VEC sum[LM_GGML_F16_ARR] = { LM_GGML_F16_VEC_ZERO }; - for (int i = 0; i < nb; i++) { - // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + LM_GGML_F16_VEC ax[LM_GGML_F16_ARR]; + LM_GGML_F16_VEC ay[LM_GGML_F16_ARR]; - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + for (int i = 0; i < np; i += LM_GGML_F16_STEP) { + for (int j = 0; j < LM_GGML_F16_ARR; j++) { + ax[j] = LM_GGML_F16_VEC_LOAD(x + i + j*LM_GGML_F16_EPR, j); + ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j); - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + sum[j] = LM_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + // reduce sum0..sum3 to sum0 + LM_GGML_F16_VEC_REDUCE(sumf, sum); - sumf += sumi*(LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)); + // leftovers + for (int i = np; i < n; ++i) { + sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i])); } - - *s = sumf; #else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[i].qs[j]*y[i].qs[j]; - } - - sumf += sumi*(LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)); + for (int i = 0; i < n; ++i) { + sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i])); } +#endif *s = sumf; -#endif } // compute LM_GGML_VEC_DOT_UNROLL dot products at once @@ -3846,7 +1364,7 @@ inline static float lm_ggml_gelu_f32(float x) { inline static void lm_ggml_vec_gelu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { - y[i] = table_gelu_f16[i16[i]]; + y[i] = lm_ggml_table_gelu_f16[i16[i]]; } } @@ -3856,7 +1374,7 @@ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x) for (int i = 0; i < n; ++i) { lm_ggml_fp16_t fp16 = LM_GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = LM_GGML_FP16_TO_FP32(table_gelu_f16[t]); + y[i] = LM_GGML_FP16_TO_FP32(lm_ggml_table_gelu_f16[t]); } } #else @@ -3874,7 +1392,7 @@ inline static float lm_ggml_gelu_quick_f32(float x) { //inline static void lm_ggml_vec_gelu_quick_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_gelu_quick_f16[i16[i]]; +// y[i] = lm_ggml_table_gelu_quick_f16[i16[i]]; // } //} @@ -3884,7 +1402,7 @@ inline static void lm_ggml_vec_gelu_quick_f32(const int n, float * y, const floa for (int i = 0; i < n; ++i) { lm_ggml_fp16_t fp16 = LM_GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = LM_GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + y[i] = LM_GGML_FP16_TO_FP32(lm_ggml_table_gelu_quick_f16[t]); } } #else @@ -3903,7 +1421,7 @@ inline static float lm_ggml_silu_f32(float x) { //inline static void lm_ggml_vec_silu_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_silu_f16[i16[i]]; +// y[i] = lm_ggml_table_silu_f16[i16[i]]; // } //} @@ -3913,7 +1431,7 @@ inline static void lm_ggml_vec_silu_f32(const int n, float * y, const float * x) for (int i = 0; i < n; ++i) { lm_ggml_fp16_t fp16 = LM_GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = LM_GGML_FP16_TO_FP32(table_silu_f16[t]); + y[i] = LM_GGML_FP16_TO_FP32(lm_ggml_table_silu_f16[t]); } } #else @@ -4629,11 +2147,11 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) { for (int i = 0; i < (1 << 16); ++i) { uint16_t ui = i; memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = LM_GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_f32(f)); - table_gelu_quick_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f)); - table_silu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_silu_f32(f)); - table_exp_f16[i] = LM_GGML_FP32_TO_FP16(expf(f)); + const float f = lm_ggml_table_f32_f16[i] = LM_GGML_COMPUTE_FP16_TO_FP32(ii); + lm_ggml_table_gelu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_f32(f)); + lm_ggml_table_gelu_quick_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f)); + lm_ggml_table_silu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_silu_f32(f)); + lm_ggml_table_exp_f16[i] = LM_GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = lm_ggml_time_us(); UNUSED(t_end); @@ -5636,7 +3154,7 @@ static struct lm_ggml_tensor * lm_ggml_add_cast_impl( // TODO: support less-strict constraint // LM_GGML_ASSERT(lm_ggml_can_repeat(b, a)); LM_GGML_ASSERT(lm_ggml_can_repeat_rows(b, a)); - LM_GGML_ASSERT(lm_ggml_is_quantized(a->type)); // currently only supported for quantized input + LM_GGML_ASSERT(lm_ggml_is_quantized(a->type) || a->type == LM_GGML_TYPE_F16); // currently only supported for quantized input and f16 bool is_node = false; @@ -7328,8 +4846,13 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down, bool inplace) { @@ -7345,11 +4868,15 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl( struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_ROPE; @@ -7367,7 +4894,9 @@ struct lm_ggml_tensor * lm_ggml_rope( int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ); } struct lm_ggml_tensor * lm_ggml_rope_inplace( @@ -7377,7 +4906,9 @@ struct lm_ggml_tensor * lm_ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); } struct lm_ggml_tensor * lm_ggml_rope_custom( @@ -7387,9 +4918,17 @@ struct lm_ggml_tensor * lm_ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); } struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( @@ -7399,9 +4938,17 @@ struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return lm_ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); } struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( @@ -7411,7 +4958,7 @@ struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return lm_ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // lm_ggml_rope_back @@ -7423,8 +4970,13 @@ struct lm_ggml_tensor * lm_ggml_rope_back( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down) { LM_GGML_ASSERT(lm_ggml_is_vector(b)); @@ -7441,11 +4993,15 @@ struct lm_ggml_tensor * lm_ggml_rope_back( struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); lm_ggml_set_op_params(result, params, sizeof(params)); result->op = LM_GGML_OP_ROPE_BACK; @@ -9410,9 +6966,15 @@ static void lm_ggml_compute_forward_add_f16_f32( LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); - LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_F16); - LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_fp16_t)); + if (dst->type == LM_GGML_TYPE_F32) { + LM_GGML_ASSERT( nb0 == sizeof(float)); + } + else { + LM_GGML_ASSERT(dst->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT( nb0 == sizeof(lm_ggml_fp16_t)); + } + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); // rows per thread @@ -9423,18 +6985,35 @@ static void lm_ggml_compute_forward_add_f16_f32( const int ir1 = MIN(ir0 + dr, nr); if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - lm_ggml_fp16_t * src0_ptr = (lm_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = LM_GGML_FP32_TO_FP16(LM_GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + if (dst->type == LM_GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + lm_ggml_fp16_t * dst_ptr = (lm_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_fp16_t * src0_ptr = (lm_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP32_TO_FP16(LM_GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + lm_ggml_fp16_t * src0_ptr = (lm_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = LM_GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } } } } @@ -12996,7 +10575,7 @@ static void lm_ggml_compute_forward_soft_max_f32( // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); sum += (lm_ggml_float)val; dp[i] = val; } @@ -13361,30 +10940,75 @@ static void lm_ggml_compute_forward_clamp( // lm_ggml_compute_forward_rope +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float lm_ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void lm_ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = MAX(0, floorf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(lm_ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + static void lm_ggml_compute_forward_rope_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + const bool forward) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // these two only relevant for xPos RoPE: float xpos_base; bool xpos_down; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); LM_GGML_TENSOR_UNARY_OP_LOCALS @@ -13412,10 +11036,18 @@ static void lm_ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -13425,18 +11057,18 @@ static void lm_ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -13454,13 +11086,17 @@ static void lm_ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13474,12 +11110,20 @@ static void lm_ggml_compute_forward_rope_f32( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13503,20 +11147,25 @@ static void lm_ggml_compute_forward_rope_f16( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { + struct lm_ggml_tensor * dst, + const bool forward) { if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); LM_GGML_TENSOR_UNARY_OP_LOCALS @@ -13544,10 +11193,18 @@ static void lm_ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + lm_ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -13557,18 +11214,18 @@ static void lm_ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base) * sin_sign; const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta); + const float sin_block_theta = sinf(block_theta) * sin_sign; - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -13586,10 +11243,13 @@ static void lm_ggml_compute_forward_rope_f16( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; - theta *= theta_scale; + theta_base *= theta_scale; const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13603,12 +11263,20 @@ static void lm_ggml_compute_forward_rope_f16( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + sin_theta *= sin_sign; + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13636,11 +11304,11 @@ static void lm_ggml_compute_forward_rope( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_f16(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, true); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_f32(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f32(params, src0, src1, dst, true); } break; default: { @@ -13651,215 +11319,6 @@ static void lm_ggml_compute_forward_rope( // lm_ggml_compute_forward_rope_back -static void lm_ggml_compute_forward_rope_back_f32( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - float freq_base; - float freq_scale; - - // these two only relevant for xPos RoPE: - float xpos_base; - bool xpos_down; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - - LM_GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = lm_ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta = freq_scale * (float)p; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; - - theta *= theta_scale; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[1]; - - dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; - dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - - theta *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = dy[0]; - const float dy1 = dy[n_dims/2]; - - dx[0] = dy0*cos_theta + dy1*sin_theta; - dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; - } - } - } - } - } - } -} - -static void lm_ggml_compute_forward_rope_back_f16( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - - if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { - return; - } - - // y = rope(x, src1) - // dx = rope_back(dy, src1) - // src0 is dy, src1 contains options - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - - LM_GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(lm_ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = lm_ggml_nrows(dst); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(10000.0, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta = (float)p; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - - theta *= theta_scale; - - const lm_ggml_fp16_t * const dy = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dx = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = LM_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = LM_GGML_FP16_TO_FP32(dy[1]); - - dx[0] = LM_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[1] = LM_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - - theta *= theta_scale; - - const int64_t i0 = ib*n_dims + ic/2; - - const lm_ggml_fp16_t * const dy = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - lm_ggml_fp16_t * dx = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float dy0 = LM_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = LM_GGML_FP16_TO_FP32(dy[n_dims/2]); - - dx[0] = LM_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[n_dims/2] = LM_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); - } - } - } - } - } - } -} - static void lm_ggml_compute_forward_rope_back( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -13868,11 +11327,11 @@ static void lm_ggml_compute_forward_rope_back( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f16(params, src0, src1, dst, false); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + lm_ggml_compute_forward_rope_f32(params, src0, src1, dst, false); } break; default: { @@ -15285,7 +12744,7 @@ static void lm_ggml_compute_forward_flash_attn_f32( #else lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (lm_ggml_float)val; SS[j] = val; @@ -15487,7 +12946,7 @@ static void lm_ggml_compute_forward_flash_attn_f16( } else { lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); sump[j] += (lm_ggml_float)val; SS[j] = val; } @@ -15938,7 +13397,7 @@ static void lm_ggml_compute_forward_flash_attn_back_f32( #else lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (lm_ggml_float)val; SW[j] = val; @@ -16688,7 +14147,7 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32( #else lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); #endif sum += (lm_ggml_float)val; st[i] = val; @@ -16802,7 +14261,7 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( #else lm_ggml_fp16_t s = LM_GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = LM_GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = LM_GGML_FP16_TO_FP32(lm_ggml_table_exp_f16[scvt]); #endif sum += (lm_ggml_float)val; ds0[i] = val; @@ -17915,17 +15374,20 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm // necessary for llama if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = lm_ggml_add_or_set(ctx, src0->grad, @@ -17935,8 +15397,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm n_dims, mode, n_ctx, + n_orig_ctx, freq_base, freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down), zero_table); @@ -17946,17 +15413,20 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { if (src0->grad) { //const int n_past = ((int32_t *) tensor->op_params)[0]; - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - float freq_base; - float freq_scale; - float xpos_base; - bool xpos_down; - memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + + memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = lm_ggml_add_or_set(ctx, src0->grad, @@ -17966,8 +15436,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm n_dims, mode, n_ctx, + n_orig_ctx, freq_base, freq_scale, + ext_factor, + attn_factor, + beta_fast, + beta_slow, xpos_base, xpos_down, false), @@ -21001,7 +18476,6 @@ size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = lm_ggml_quantize_q8_0(src + start, block, n, n, hist); } break; -#ifdef LM_GGML_USE_K_QUANTS case LM_GGML_TYPE_Q2_K: { LM_GGML_ASSERT(start % QK_K == 0); @@ -21032,7 +18506,6 @@ size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * block_q6_K * block = (block_q6_K*)dst + start / QK_K; result = lm_ggml_quantize_q6_K(src + start, block, n, n, hist); } break; -#endif case LM_GGML_TYPE_F16: { int elemsize = sizeof(lm_ggml_fp16_t); @@ -21164,8 +18637,7 @@ static bool lm_gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offs return n == size; } -// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 -static bool lm_gguf_fread_str_cur(FILE * file, struct lm_gguf_str * p, size_t * offset) { +static bool lm_gguf_fread_str(FILE * file, struct lm_gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; @@ -21177,19 +18649,6 @@ static bool lm_gguf_fread_str_cur(FILE * file, struct lm_gguf_str * p, size_t * return ok; } -static bool lm_gguf_fread_str_v1(FILE * file, struct lm_gguf_str * p, size_t * offset) { - p->n = 0; - p->data = NULL; - - bool ok = true; - - uint32_t n = 0; - ok = ok && lm_gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; - ok = ok && lm_gguf_fread_el(file, p->data, p->n, offset); - - return ok; -} - struct lm_gguf_context * lm_gguf_init_empty(void) { struct lm_gguf_context * ctx = LM_GGML_ALIGNED_MALLOC(sizeof(struct lm_gguf_context)); @@ -21248,20 +18707,14 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg ctx->data = NULL; ok = ok && lm_gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + ok = ok && lm_gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && lm_gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n_tensors = 0; - uint32_t n_kv = 0; - - ok = ok && lm_gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); - ok = ok && lm_gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); - - ctx->header.n_tensors = n_tensors; - ctx->header.n_kv = n_kv; - } else { - ok = ok && lm_gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && lm_gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__); + fclose(file); + lm_gguf_free(ctx); + return NULL; } if (!ok) { @@ -21272,12 +18725,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg } } - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - bool (* lm_gguf_fread_str)(FILE *, struct lm_gguf_str *, size_t *) = lm_gguf_fread_str_cur; - if (ctx->header.version == 1) { - lm_gguf_fread_str = lm_gguf_fread_str_v1; - } - // read the kv pairs { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct lm_gguf_kv)); @@ -21308,15 +18755,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg case LM_GGUF_TYPE_ARRAY: { ok = ok && lm_gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n = 0; - ok = ok && lm_gguf_fread_el(file, &n, sizeof(n), &offset); - kv->value.arr.n = n; - } else { - ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); - } + ok = ok && lm_gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case LM_GGUF_TYPE_UINT8: @@ -21375,14 +18814,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg ok = ok && lm_gguf_fread_str(file, &info->name, &offset); ok = ok && lm_gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t t = 0; - ok = ok && lm_gguf_fread_el(file, &t, sizeof(t), &offset); - info->ne[j] = t; - } else { - ok = ok && lm_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); - } + ok = ok && lm_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } ok = ok && lm_gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && lm_gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); diff --git a/cpp/ggml.h b/cpp/ggml.h index 0416777d..3cbd5d8e 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -219,7 +219,7 @@ #define LM_GGML_MAX_CONTEXTS 64 #define LM_GGML_MAX_SRC 6 #define LM_GGML_MAX_NAME 64 -#define LM_GGML_MAX_OP_PARAMS 32 +#define LM_GGML_MAX_OP_PARAMS 64 #define LM_GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -709,7 +709,7 @@ extern "C" { // Context tensor enumeration and lookup LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx); LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_next_tensor (struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor(struct lm_ggml_context * ctx, const char * name); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor (struct lm_ggml_context * ctx, const char * name); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_zero(struct lm_ggml_tensor * tensor); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t value); @@ -1326,8 +1326,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_custom_inplace( @@ -1337,8 +1342,17 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void lm_ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_xpos_inplace( @@ -1358,8 +1372,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down); @@ -1930,12 +1949,19 @@ extern "C" { // quantization // + // TODO: these would probably get removed in favor of the more general lm_ggml_quantize_chunk LM_GGML_API size_t lm_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); LM_GGML_API size_t lm_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); LM_GGML_API size_t lm_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); LM_GGML_API size_t lm_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); LM_GGML_API size_t lm_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); // diff --git a/cpp/llama.cpp b/cpp/llama.cpp index b1c3fd0c..b62b25a0 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -19,13 +19,11 @@ #ifdef LM_GGML_USE_MPI # include "ggml-mpi.h" #endif -#ifdef LM_GGML_USE_K_QUANTS -# ifndef QK_K -# ifdef LM_GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif +#ifndef QK_K +# ifdef LM_GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 # endif #endif @@ -56,13 +54,16 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -71,11 +72,10 @@ #include #include #include +#include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -247,6 +247,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -288,9 +292,13 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -564,6 +572,22 @@ do { \ } \ } while (0) +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static int8_t llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -583,19 +607,37 @@ static void lm_ggml_graph_compute_helper(std::vector & buf, lm_ggml_cgr // llama helpers // +inline void * llama_host_malloc(size_t n) { +#ifdef LM_GGML_USE_CUBLAS + if (lm_ggml_cublas_loaded()) { + return lm_ggml_cuda_host_malloc(n); + } else { + return malloc(n); + } +#elif LM_GGML_USE_METAL + return lm_ggml_metal_host_malloc(n); +#elif LM_GGML_USE_CPU_HBM + return hbw_malloc(n); +#else + return malloc(n); +#endif +} + +inline void llama_host_free(void * ptr) { #ifdef LM_GGML_USE_CUBLAS -# define llama_host_malloc(n) lm_ggml_cuda_host_malloc(n) -# define llama_host_free(data) lm_ggml_cuda_host_free(data) + if (lm_ggml_cublas_loaded()) { + return lm_ggml_cuda_host_free(ptr); + } else { + return free(ptr); + } #elif LM_GGML_USE_METAL -# define llama_host_malloc(n) lm_ggml_metal_host_malloc(n) -# define llama_host_free(data) lm_ggml_metal_host_free(data) + return lm_ggml_metal_host_free(ptr); #elif LM_GGML_USE_CPU_HBM -# define llama_host_malloc(n) hbw_malloc(n) -# define llama_host_free(data) if (data != NULL) hbw_free(data) + return hbw_free(ptr); #else -# define llama_host_malloc(n) malloc(n) -# define llama_host_free(data) free(data) + return free(ptr); #endif +} #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { @@ -982,7 +1024,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct lm_ggml_tensor * tensor); -static void llama_nop(struct lm_ggml_tensor * tensor) { // don't offload by default +static void lm_ggml_offload_nop(struct lm_ggml_tensor * tensor) { (void) tensor; } @@ -1047,8 +1089,11 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base_train; - float rope_freq_scale_train; + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; float f_clamp_kqv; float f_max_alibi_bias; @@ -1063,6 +1108,8 @@ struct llama_hparams { if (this->n_layer != other.n_layer) return true; if (this->n_rot != other.n_rot) return true; if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; const float EPSILON = 1e-9; @@ -1093,8 +1140,16 @@ struct llama_cparams { uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - float rope_freq_base; - float rope_freq_scale; + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool mul_mat_q; }; @@ -1126,13 +1181,13 @@ struct llama_layer { struct lm_ggml_tensor * ffn_norm_b; // ff - struct lm_ggml_tensor * w1; // ffn_gate - struct lm_ggml_tensor * w2; // ffn_down - struct lm_ggml_tensor * w3; // ffn_up + struct lm_ggml_tensor * ffn_gate; // w1 + struct lm_ggml_tensor * ffn_down; // w2 + struct lm_ggml_tensor * ffn_up; // w3 // ff bias - struct lm_ggml_tensor * b2; // ffn_down - struct lm_ggml_tensor * b3; // ffn_up + struct lm_ggml_tensor * ffn_down_b; // b2 + struct lm_ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -1174,9 +1229,11 @@ struct llama_kv_cache { } #ifdef LM_GGML_USE_CUBLAS - lm_ggml_cuda_free_data(k); - lm_ggml_cuda_free_data(v); -#endif // LM_GGML_USE_CUBLAS + if (lm_ggml_cublas_loaded()) { + lm_ggml_cuda_free_data(k); + lm_ggml_cuda_free_data(v); + } +#endif } }; @@ -1238,8 +1295,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct lm_ggml_tensor * tok_embeddings; - struct lm_ggml_tensor * pos_embeddings; + struct lm_ggml_tensor * tok_embd; + struct lm_ggml_tensor * pos_embd; struct lm_ggml_tensor * tok_norm; struct lm_ggml_tensor * tok_norm_b; @@ -1276,11 +1333,15 @@ struct llama_model { } #ifdef LM_GGML_USE_CUBLAS - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - lm_ggml_cuda_free_data(tensors_by_name[i].second); + if (lm_ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + lm_ggml_cuda_free_data(tensors_by_name[i].second); + } + lm_ggml_cuda_free_scratch(); } - lm_ggml_cuda_free_scratch(); -#elif defined(LM_GGML_USE_CLBLAST) +#endif + +#if defined(LM_GGML_USE_CLBLAST) for (size_t i = 0; i < tensors_by_name.size(); ++i) { lm_ggml_cl_free_data(tensors_by_name[i].second); } @@ -1392,23 +1453,26 @@ static bool llama_kv_cache_init( lm_ggml_set_name(cache.v, "cache_v"); (void) n_gpu_layers; + #ifdef LM_GGML_USE_CUBLAS - size_t vram_kv_cache = 0; + if (lm_ggml_cublas_loaded()) { + size_t vram_kv_cache = 0; - if (n_gpu_layers > (int)n_layer + 1) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.v); - LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.v); - } - if (n_gpu_layers > (int)n_layer + 2) { - lm_ggml_cuda_assign_buffers_no_scratch(cache.k); - LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += lm_ggml_nbytes(cache.k); - } - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + if (n_gpu_layers > (int)n_layer + 1) { + lm_ggml_cuda_assign_buffers_no_scratch(cache.v); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.v); + } + if (n_gpu_layers > (int)n_layer + 2) { + lm_ggml_cuda_assign_buffers_no_scratch(cache.k); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += lm_ggml_nbytes(cache.k); + } + if (vram_kv_cache > 0) { + LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + } } -#endif // LM_GGML_USE_CUBLAS +#endif return true; } @@ -1479,17 +1543,12 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { - if (c0 < 0) c0 = 0; - if (c1 < 0) c1 = cache.size; - - for (int32_t i = c0; i < c1; ++i) { +static void llama_kv_cache_clear(struct llama_kv_cache & cache) { + for (int32_t i = 0; i < (int32_t) cache.size; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } - - // Searching for a free slot can start here since we know it will be empty. - cache.head = uint32_t(c0); + cache.head = 0; } static void llama_kv_cache_seq_rm( @@ -1503,8 +1562,14 @@ static void llama_kv_cache_seq_rm( if (p1 < 0) p1 = std::numeric_limits::max(); for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.erase(seq_id); + if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + if (seq_id < 0) { + cache.cells[i].seq_id.clear(); + } else if (cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].seq_id.erase(seq_id); + } else { + continue; + } if (cache.cells[i].seq_id.empty()) { cache.cells[i].pos = -1; if (new_head == cache.size) new_head = i; @@ -1565,14 +1630,14 @@ static void llama_kv_cache_seq_shift( for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].pos += delta; + cache.has_shift = true; + cache.cells[i].pos += delta; + cache.cells[i].delta += delta; + if (cache.cells[i].pos < 0) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; - } else { - cache.has_shift = true; - cache.cells[i].delta = delta; } } } @@ -1589,12 +1654,14 @@ static void llama_kv_cache_seq_shift( enum llama_fver { LM_GGUF_FILE_VERSION_V1 = 1, LM_GGUF_FILE_VERSION_V2 = 2, + LM_GGUF_FILE_VERSION_V3 = 3, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { case LM_GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case LM_GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; + case LM_GGUF_FILE_VERSION_V2: return "GGUF V2"; + case LM_GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; } return "unknown"; @@ -1808,6 +1875,12 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } + if (backend == LM_GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } + { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2023,14 +2096,30 @@ static void llm_load_hparams( hparams.n_head_kv = hparams.n_head; LM_GGUF_GET_KEY(ctx, hparams.n_head_kv, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + LM_GGUF_GET_KEY(ctx, hparams.rope_finetuned, lm_gguf_get_val_bool, LM_GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; + LM_GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, lm_gguf_get_val_u32, LM_GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; LM_GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + std::string rope_scaling("linear"); + LM_GGUF_GET_KEY(ctx, rope_scaling, lm_gguf_get_val_str, LM_GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); + LM_GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - hparams.rope_freq_scale_train = 1.0f/ropescale; + float ropescale = 0.0f; + LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); + if (ropescale == 0.0f) { // try the old key name + LM_GGUF_GET_KEY(ctx, ropescale, lm_gguf_get_val_f32, LM_GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -2380,6 +2469,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; + const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); @@ -2398,8 +2489,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -2465,18 +2559,22 @@ static void llm_load_tensors( } (void) main_gpu; + + enum lm_ggml_backend_type llama_backend_offload = LM_GGML_BACKEND_CPU; + enum lm_ggml_backend_type llama_backend_offload_split = LM_GGML_BACKEND_CPU; + #ifdef LM_GGML_USE_CUBLAS - LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); - lm_ggml_cuda_set_main_device(main_gpu); -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_GPU_SPLIT + if (lm_ggml_cublas_loaded()) { + LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); + lm_ggml_cuda_set_main_device(main_gpu); + + llama_backend_offload = LM_GGML_BACKEND_GPU; + llama_backend_offload_split = LM_GGML_BACKEND_GPU_SPLIT; + } #elif defined(LM_GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_GPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_GPU -#else -#define LLAMA_BACKEND_OFFLOAD LM_GGML_BACKEND_CPU -#define LLAMA_BACKEND_OFFLOAD_SPLIT LM_GGML_BACKEND_CPU + LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); + llama_backend_offload = LM_GGML_BACKEND_GPU; + llama_backend_offload_split = LM_GGML_BACKEND_GPU; #endif // prepare memory for the weights @@ -2492,7 +2590,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); // output { @@ -2503,12 +2601,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2532,8 +2630,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2546,21 +2644,21 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.w1) + lm_ggml_nbytes(layer.w2) + lm_ggml_nbytes(layer.w3); + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + + lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); { lm_ggml_backend_type backend_norm; lm_ggml_backend_type backend_output; @@ -2569,12 +2667,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2598,8 +2696,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2612,15 +2710,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.w1) + lm_ggml_nbytes(layer.w2) + lm_ggml_nbytes(layer.w3); + lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + + lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + + lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); } } } break; @@ -2628,7 +2726,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); // output { @@ -2639,12 +2737,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2670,8 +2768,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2691,21 +2789,21 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.attn_norm_b) + lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.wo) + - lm_ggml_nbytes(layer.w2) + lm_ggml_nbytes(layer.w3); + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, LM_GGML_BACKEND_CPU); // output { @@ -2716,12 +2814,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2747,8 +2845,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2756,19 +2854,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += @@ -2776,14 +2874,14 @@ static void llm_load_tensors( lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.bo) + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_norm_b) + - lm_ggml_nbytes(layer.w2) + lm_ggml_nbytes(layer.b2) + - lm_ggml_nbytes(layer.w3) + lm_ggml_nbytes(layer.b3); + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_down_b) + + lm_ggml_nbytes(layer.ffn_up) + lm_ggml_nbytes(layer.ffn_up_b); } } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); { lm_ggml_backend_type backend_norm; @@ -2793,12 +2891,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2821,34 +2919,34 @@ static void llm_load_tensors( const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, LM_GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, LM_GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, LM_GGML_BACKEND_CPU); // output { @@ -2859,12 +2957,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2890,8 +2988,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2899,19 +2997,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += @@ -2919,14 +3017,14 @@ static void llm_load_tensors( lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.bo) + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_norm_b) + - lm_ggml_nbytes(layer.w3) + lm_ggml_nbytes(layer.b3) + - lm_ggml_nbytes(layer.w2) + lm_ggml_nbytes(layer.b2); + lm_ggml_nbytes(layer.ffn_up) + lm_ggml_nbytes(layer.ffn_up_b) + + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_down_b); } } } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); // output { @@ -2937,12 +3035,12 @@ static void llm_load_tensors( // norm is not performance relevant on its own but keeping it in VRAM reduces data copying // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = LLAMA_BACKEND_OFFLOAD; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? LM_GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + backend_output = llama_backend_offload_split; } else { backend_norm = LM_GGML_BACKEND_CPU; backend_output = LM_GGML_BACKEND_CPU; @@ -2966,8 +3064,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT auto & layer = model.layers[i]; @@ -2977,8 +3075,8 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == LM_GGML_BACKEND_GPU) { vram_weights += @@ -2986,8 +3084,8 @@ static void llm_load_tensors( lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.w2) + - lm_ggml_nbytes(layer.w3); + lm_ggml_nbytes(layer.ffn_down) + + lm_ggml_nbytes(layer.ffn_up); } } } break; @@ -3017,10 +3115,10 @@ static void llm_load_tensors( #ifdef LM_GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif defined(LM_GGML_USE_CLBLAST) + const int max_offloadable_layers = hparams.n_layer + 3; +#elif LM_GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // LM_GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); @@ -3056,21 +3154,11 @@ static void llm_load_tensors( model.t_load_us = lm_ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, use_mmap); + llama_model_loader ml(fname, params.use_mmap); - model.hparams.vocab_only = vocab_only; + model.hparams.vocab_only = params.vocab_only; llm_load_arch (ml, model); llm_load_hparams(ml, model); @@ -3082,15 +3170,15 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } llm_load_tensors( - ml, model, n_gpu_layers, - main_gpu, tensor_split, - use_mlock, progress_callback, progress_callback_user_data); + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -3099,2754 +3187,1882 @@ static bool llama_model_load( return true; } -static struct lm_ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - LM_GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; +// +// llm_build +// - const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; +using llm_build_cb = std::function; - //printf("n_kv = %d\n", n_kv); +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; - auto & buf_compute = lctx.buf_compute; +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; - struct lm_ggml_context * ctx0 = lm_ggml_init(params); +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); +static struct lm_ggml_tensor * llm_build_inp_embd( + struct lm_ggml_context * ctx, + const llama_hparams & hparams, + const llama_batch & batch, + struct lm_ggml_tensor * tok_embd, + const llm_build_cb & cb) { + const int64_t n_embd = hparams.n_embd; - struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, batch.n_tokens); + cb(inp_tokens, "inp_tokens", -1); - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + inpL = lm_ggml_get_rows(ctx, tok_embd, inp_tokens); } else { #ifdef LM_GGML_USE_MPI LM_GGML_ASSERT(false && "not implemented"); #endif - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); - } + inpL = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens); } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + return inpL; +} + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + struct lm_ggml_context * ctx, + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache & kv, + struct lm_ggml_cgraph * graph, + llm_rope_type type, + int64_t n_ctx, + int64_t n_rot, + float freq_base, + float freq_scale, + const llm_build_cb & cb) { + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor = cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + LM_GGML_ASSERT(n_embd_head % n_rot == 0); -#ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; + struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; + + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + lm_ggml_rope_custom_inplace(ctx, + lm_ggml_view_3d(ctx, kv.k, + n_rot, n_head_kv, n_ctx, + lm_ggml_element_size(kv.k)*n_embd_head, + lm_ggml_element_size(kv.k)*n_embd_gqa, + lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + lm_ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + struct lm_ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct lm_ggml_cgraph * graph, + struct lm_ggml_tensor * k_cur, + struct lm_ggml_tensor * v_cur, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, lm_ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, + (lm_ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, + ( n_ctx)*lm_ggml_element_size(kv.v), + (il*n_ctx)*lm_ggml_element_size(kv.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + lm_ggml_build_forward_expand(graph, lm_ggml_cpy(ctx, k_cur, k_cache_view)); + lm_ggml_build_forward_expand(graph, lm_ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + +static struct lm_ggml_tensor * llm_build_norm( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * cur, + const llama_hparams & hparams, + struct lm_ggml_tensor * mw, + struct lm_ggml_tensor * mb, + llm_norm_type type, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = lm_ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; + + if (mw || mb) { + cb(cur, "norm", il); } -#endif // LM_GGML_USE_CUBLAS - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } + if (mw) { + cur = lm_ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); } } - // KQ_pos - contains the positions - struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - lm_ggml_set_name(KQ_pos, "KQ_pos"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } + if (mb) { + cur = lm_ggml_add(ctx, cur, mb); } - // shift the entire K-cache if needed - if (do_rope_shift) { - struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - lm_ggml_set_name(K_shift, "K_shift"); - lm_ggml_allocr_alloc(lctx.alloc, K_shift); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } + return cur; +} + +static struct lm_ggml_tensor * llm_build_ffn( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * cur, + struct lm_ggml_tensor * up, + struct lm_ggml_tensor * up_b, + struct lm_ggml_tensor * gate, + struct lm_ggml_tensor * gate_b, + struct lm_ggml_tensor * down, + struct lm_ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = lm_ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = lm_ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = lm_ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + } break; } - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * tmp = - lm_ggml_rope_custom_inplace(ctx0, - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - lm_ggml_build_forward_expand(gf, tmp); + if (gate_b) { + cur = lm_ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); } + } else { + cur = tmp; } - for (int il = 0; il < n_layer; ++il) { - lm_ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; + switch (type_op) { + case LLM_FFN_SILU: + { + cur = lm_ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = lm_ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } break; + case LLM_FFN_RELU: + { + cur = lm_ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = lm_ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); -#ifdef LM_GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS + cur = lm_ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + } - struct lm_ggml_tensor * inpSA = inpL; + if (type_gate == LLM_FFN_PAR) { + cur = lm_ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } - // norm - { - cur = lm_ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_0"); + cur = lm_ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } - // cur = cur*attn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "attention_norm_0"); - } + if (down_b) { + cur = lm_ggml_add(ctx, cur, down_b); + } - // self-attention - { - // compute Q and K and RoPE them - struct lm_ggml_tensor * tmpk = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - lm_ggml_set_name(tmpk, "tmpk"); + return cur; +} - struct lm_ggml_tensor * tmpq = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - lm_ggml_set_name(tmpq, "tmpq"); +// if max_alibi_bias > 0 then apply ALiBi +static struct lm_ggml_tensor * llm_build_kqv( + struct lm_ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct lm_ggml_tensor * wo, + struct lm_ggml_tensor * wo_b, + struct lm_ggml_tensor * q_cur, + struct lm_ggml_tensor * kq_scale, + struct lm_ggml_tensor * kq_mask, + int64_t n_ctx, + int32_t n_tokens, + int32_t n_kv, + float max_alibi_bias, + const llm_build_cb & cb, + int il) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); - struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - lm_ggml_set_name(Kcur, "Kcur"); + struct lm_ggml_tensor * q = lm_ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); - struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - lm_ggml_set_name(Qcur, "Qcur"); + struct lm_ggml_tensor * k = + lm_ggml_view_3d(ctx, kv.k, + n_embd_head, n_kv, n_head_kv, + lm_ggml_element_size(kv.k)*n_embd_gqa, + lm_ggml_element_size(kv.k)*n_embd_head, + lm_ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); + + struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + kq = lm_ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to lm_ggml_add + kq = lm_ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } + + kq = lm_ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); + + kq = lm_ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + + // split cached v into n_head heads + struct lm_ggml_tensor * v = + lm_ggml_view_3d(ctx, kv.v, + n_kv, n_embd_head, n_head_kv, + lm_ggml_element_size(kv.v)*n_ctx, + lm_ggml_element_size(kv.v)*n_ctx*n_embd_head, + lm_ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); + + struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); + + struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + struct lm_ggml_tensor * cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); + + cur = lm_ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = lm_ggml_add(ctx, cur, wo_b); + } + + return cur; +} + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct lm_ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) : + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute (lctx.buf_compute) { + LM_GGML_ASSERT(!!kv_self.ctx); + + // all initializations should be done in init() + } + + void init() { + struct lm_ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix + ctx0 = lm_ggml_init(params); + } - struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - lm_ggml_set_name(tmpv, "tmpv"); + void free() { + if (ctx0) { + lm_ggml_free(ctx0); + ctx0 = nullptr; + } + } - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - lm_ggml_set_name(Vcur, "Vcur"); + struct lm_ggml_cgraph * build_llama() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - lm_ggml_set_name(v, "v"); + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - // important: storing RoPE-ed version of K in the KV cache! - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - lm_ggml_set_name(K, "K"); - - // K * Q - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); - -#if 1 - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - // projection (no bias) - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } - struct lm_ggml_tensor * inpFF = lm_ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - lm_ggml_set_name(inpFF, "inpFF"); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; - // feed-forward network - { // norm - { - cur = lm_ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "ffn_norm"); - } + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - lm_ggml_set_name(tmp, "result_w3"); + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w1"); + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - // SILU activation - cur = lm_ggml_silu(ctx0, cur); - offload_func(cur); - lm_ggml_set_name(cur, "silu"); + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - cur = lm_ggml_mul(ctx0, cur, tmp); - offload_func(cur); - lm_ggml_set_name(cur, "silu_x_result_w3"); + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w2"); - } + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); - cur = lm_ggml_add(ctx0, cur, inpFF); - offload_func(cur); - lm_ggml_set_name(cur, "inpFF_+_result_w2"); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // input for next layer - inpL = cur; - } + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } - cur = inpL; + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // norm - { - cur = lm_ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - lm_ggml_set_name(cur, "rms_norm_2"); + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } - // cur = cur*norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - lm_ggml_set_name(cur, "result_norm"); - } + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - // lm_head - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); + // input for next layer + inpL = cur; + } - lm_ggml_build_forward_expand(gf, cur); + cur = inpL; - lm_ggml_free(ctx0); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - return gf; -} + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); -static struct lm_ggml_cgraph * llm_build_baichaun( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + lm_ggml_build_forward_expand(gf, cur); - const auto & kv_self = lctx.kv_self; + return gf; + } - LM_GGML_ASSERT(!!kv_self.ctx); + struct lm_ggml_cgraph * build_baichuan() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - const int n_gpu_layers = model.n_gpu_layers; + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } - auto & buf_compute = lctx.buf_compute; + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct lm_ggml_context * ctx0 = lm_ggml_init(params); + // self-attention + { + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case MODEL_7B: + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + break; + case MODEL_13B: + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + LM_GGML_ASSERT(false); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; + // apply ALiBi for 13B model + const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); + // input for next layer + inpL = cur; } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + cur = inpL; - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); -#ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + lm_ggml_build_forward_expand(gf, cur); - // KQ_pos - contains the positions - struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - lm_ggml_set_name(KQ_pos, "KQ_pos"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } + return gf; } - // shift the entire K-cache if needed - if (do_rope_shift) { - struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - lm_ggml_set_name(K_shift, "K_shift"); - lm_ggml_allocr_alloc(lctx.alloc, K_shift); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * tmp = - lm_ggml_rope_custom_inplace(ctx0, - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - lm_ggml_build_forward_expand(gf, tmp); - } - } + struct lm_ggml_cgraph * build_falcon() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - for (int il = 0; il < n_layer; ++il) { - lm_ggml_format_name(inpL, "layer_inp_%d", il); + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - offload_func_t offload_func = llama_nop; + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); -#ifdef LM_GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - struct lm_ggml_tensor * inpSA = inpL; + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - // norm - { - cur = lm_ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_0"); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - // cur = cur*attn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "attention_norm_0"); + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } - // self-attention - { - // compute Q and K and RoPE them - struct lm_ggml_tensor * tmpk = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - lm_ggml_set_name(tmpk, "tmpk"); - - struct lm_ggml_tensor * tmpq = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - lm_ggml_set_name(tmpq, "tmpq"); - - struct lm_ggml_tensor * Kcur; - struct lm_ggml_tensor * Qcur; - switch (model.type) { - case MODEL_7B: - Kcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = lm_ggml_rope_custom(ctx0, lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - break; - case MODEL_13B: - Kcur = lm_ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); - Qcur = lm_ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); - break; - default: - LM_GGML_ASSERT(false); - } - - offload_func_kq(Kcur); - lm_ggml_set_name(Kcur, "Kcur"); + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * attn_norm; - offload_func_kq(Qcur); - lm_ggml_set_name(Qcur, "Qcur"); + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); - // store key and value to memory + // self-attention { - // compute the transposed [n_tokens, n_embd] V matrix + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - lm_ggml_set_name(tmpv, "tmpv"); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - lm_ggml_set_name(Vcur, "Vcur"); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); + // using mode = 2 for neox mode + Qcur = lm_ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - lm_ggml_set_name(v, "v"); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // important: storing RoPE-ed version of K in the KV cache! - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - lm_ggml_set_name(K, "K"); - - // K * Q - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct lm_ggml_tensor * KQ_masked; - struct lm_ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); - break; - case MODEL_13B: - // TODO: replace with lm_ggml_add() - KQ_scaled_alibi = lm_ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - break; - default: - LM_GGML_ASSERT(false); - } + struct lm_ggml_tensor * ffn_inp = cur; - // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); - - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); - } - - struct lm_ggml_tensor * inpFF = lm_ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - lm_ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm + // feed forward { - cur = lm_ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "ffn_norm"); + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - lm_ggml_set_name(tmp, "result_w3"); - - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = lm_ggml_silu(ctx0, cur); - offload_func(cur); - lm_ggml_set_name(cur, "silu"); - - cur = lm_ggml_mul(ctx0, cur, tmp); - offload_func(cur); - lm_ggml_set_name(cur, "silu_x_result_w3"); - - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w2"); - } - - cur = lm_ggml_add(ctx0, cur, inpFF); - offload_func(cur); - lm_ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = lm_ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - lm_ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - lm_ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); - - lm_ggml_build_forward_expand(gf, cur); - - lm_ggml_free(ctx0); - - return gf; -} - -static struct lm_ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - LM_GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + cur = lm_ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); - // printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct lm_ggml_context * ctx0 = lm_ggml_init(params); - - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; - - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); + // input for next layer + inpL = cur; } - lm_ggml_set_name(inp_tokens, "inp_tokens"); - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif + cur = inpL; - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); - } - } + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + lm_ggml_build_forward_expand(gf, cur); -#ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS - - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } + return gf; } - for (int il = 0; il < n_layer; ++il) { - lm_ggml_format_name(inpL, "layer_inp_%d", il); + struct lm_ggml_cgraph * build_starcoder() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - offload_func_t offload_func = llama_nop; - -#ifdef LM_GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * pos; + struct lm_ggml_tensor * inpL; - struct lm_ggml_tensor * inpSA = inpL; + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - // norm - { - cur = lm_ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_0"); + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - // cur = cur*attn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "attention_norm_0"); - } + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - // self-attention - { - // compute Q and K - struct lm_ggml_tensor * tmpk = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - lm_ggml_set_name(tmpk, "tmpk"); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - struct lm_ggml_tensor * tmpq = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - lm_ggml_set_name(tmpq, "tmpq"); + pos = lm_ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); - struct lm_ggml_tensor * Kcur = lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - offload_func_kq(Kcur); - lm_ggml_set_name(Kcur, "Kcur"); + inpL = lm_ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); - struct lm_ggml_tensor * Qcur = lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - offload_func_kq(Qcur); - lm_ggml_set_name(Qcur, "Qcur"); + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - // store key and value to memory + // self-attention { - // compute the transposed [n_tokens, n_embd] V matrix + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - lm_ggml_set_name(tmpv, "tmpv"); + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - lm_ggml_set_name(Vcur, "Vcur"); + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - lm_ggml_set_name(v, "v"); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - lm_ggml_set_name(K, "K"); - - // K * Q - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_scaled_alibi = lm_ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); - -#if 1 - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); - } + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } - struct lm_ggml_tensor * inpFF = lm_ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - lm_ggml_set_name(inpFF, "inpFF"); + // add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - // norm + // FF { - cur = lm_ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - lm_ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - lm_ggml_set_name(cur, "ffn_norm"); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - lm_ggml_set_name(tmp, "result_w3"); - - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = lm_ggml_silu(ctx0, cur); - offload_func(cur); - lm_ggml_set_name(cur, "silu"); - - cur = lm_ggml_mul(ctx0, cur, tmp); - offload_func(cur); - lm_ggml_set_name(cur, "silu_x_result_w3"); - - cur = lm_ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_w2"); + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - cur = lm_ggml_add(ctx0, cur, inpFF); - offload_func(cur); - lm_ggml_set_name(cur, "inpFF_+_result_w2"); + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // input for next layer - inpL = cur; - } - - cur = inpL; + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // norm - { - cur = lm_ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - lm_ggml_set_name(cur, "rms_norm_2"); + lm_ggml_build_forward_expand(gf, cur); - // cur = cur*norm(broadcasted) - cur = lm_ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - lm_ggml_set_name(cur, "result_norm"); + return gf; } - // lm_head - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); + struct lm_ggml_cgraph * build_persimmon() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - lm_ggml_build_forward_expand(gf, cur); + const int64_t n_rot = n_embd_head / 2; - lm_ggml_free(ctx0); + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - return gf; -} + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "imp_embd", -1); -static struct lm_ggml_cgraph * llm_build_falcon( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - const auto & kv_self = lctx.kv_self; + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - LM_GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - - //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", - // kv_head, n_kv, n_tokens, n_ctx, lm_ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); - - auto & buf_compute = lctx.buf_compute; - - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct lm_ggml_context * ctx0 = lm_ggml_init(params); - - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif - - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS - - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - lm_ggml_set_name(KQ_pos, "KQ_pos"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - - // shift the entire K-cache if needed - if (do_rope_shift) { - struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - lm_ggml_set_name(K_shift, "K_shift"); - lm_ggml_allocr_alloc(lctx.alloc, K_shift); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * tmp = - lm_ggml_rope_custom_inplace(ctx0, - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - lm_ggml_build_forward_expand(gf, tmp); - } - } - - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * attn_norm; + struct lm_ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; - -#ifdef LM_GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS - - // self-attention - // TODO: refactor into common function (shared with LLaMA) - { - attn_norm = lm_ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); - - attn_norm = lm_ggml_add(ctx0, - lm_ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = lm_ggml_add(ctx0, - lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); - } else { // Falcon 7B - cur = attn_norm; - } - - // compute QKV - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. - - const size_t wsize = lm_ggml_type_size(cur->type); - - // TODO: these 2 lm_ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct lm_ggml_tensor * tmpq = lm_ggml_cont(ctx0, lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - offload_func_kq(tmpq); - - struct lm_ggml_tensor * tmpk = lm_ggml_cont(ctx0, lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); - - struct lm_ggml_tensor * tmpv = lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); - - // using mode = 2 for neox mode - struct lm_ggml_tensor * Qcur = lm_ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct lm_ggml_tensor * Kcur = lm_ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + // self attention { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - lm_ggml_set_name(Vcur, "Vcur"); - - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); - - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } - - struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - lm_ggml_set_name(K, "K"); - - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); - - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); - } - - struct lm_ggml_tensor * attn_out = cur; - - // feed forward - { - struct lm_ggml_tensor * inpFF = attn_norm; - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); - - cur = lm_ggml_gelu(ctx0, cur); - offload_func(cur); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - } + // split qkv + LM_GGML_ASSERT(n_head_kv == n_head); - cur = lm_ggml_add(ctx0, cur, attn_out); - offload_func(cur); - cur = lm_ggml_add(ctx0, cur, inpL); - offload_func(cur); + struct lm_ggml_tensor * tmpqkv = lm_ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); + cb(tmpqkv, "tmpqkv", il); - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = lm_ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - - cur = lm_ggml_add(ctx0, - lm_ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - lm_ggml_set_name(cur, "result_norm"); - } - - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); - - lm_ggml_build_forward_expand(gf, cur); - - lm_ggml_free(ctx0); - - return gf; -} - -static struct lm_ggml_cgraph * llm_build_starcoder( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - LM_GGML_ASSERT(!!kv_self.ctx); + struct lm_ggml_tensor * tmpqkv_perm = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + cb(tmpqkv_perm, "tmpqkv", il); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct lm_ggml_context * ctx0 = lm_ggml_init(params); + struct lm_ggml_tensor * tmpq = lm_ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + 0 + ); + cb(tmpq, "tmpq", il); - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + struct lm_ggml_tensor * tmpk = lm_ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + ); + cb(tmpk, "tmpk", il); + + // Q/K Layernorm + tmpq = llm_build_norm(ctx0, tmpq, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + cb(tmpq, "tmpq", il); + + tmpk = llm_build_norm(ctx0, tmpk, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + cb(tmpk, "tmpk", il); + + // RoPE the first n_rot of q/k, pass the other half, and concat. + struct lm_ggml_tensor * qrot = lm_ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + lm_ggml_element_size(tmpq) * n_embd_head, + lm_ggml_element_size(tmpq) * n_embd_head * n_head, + 0 + ); + cb(qrot, "qrot", il); + + struct lm_ggml_tensor * krot = lm_ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + lm_ggml_element_size(tmpk) * n_embd_head, + lm_ggml_element_size(tmpk) * n_embd_head * n_head, + 0 + ); + cb(krot, "krot", il); + + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct lm_ggml_tensor * qpass = lm_ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + lm_ggml_element_size(tmpq) * n_embd_head, + lm_ggml_element_size(tmpq) * n_embd_head * n_head, + lm_ggml_element_size(tmpq) * n_rot + ); + cb(qpass, "qpass", il); + + struct lm_ggml_tensor * kpass = lm_ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + lm_ggml_element_size(tmpk) * n_embd_head, + lm_ggml_element_size(tmpk) * n_embd_head * n_head, + lm_ggml_element_size(tmpk) * n_rot + ); + cb(kpass, "kpass", il); + + struct lm_ggml_tensor * qrotated = lm_ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(qrotated, "qrotated", il); - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * token; - struct lm_ggml_tensor * position; - struct lm_ggml_tensor * inpL; + struct lm_ggml_tensor * krotated = lm_ggml_rope_custom( + ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(krotated, "krotated", il); - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + // ggml currently only supports concatenation on dim=2 + // so we need to permute qrot, qpass, concat, then permute back. + qrotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); + cb(qrotated, "qrotated", il); - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); + krotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, krotated, 2, 1, 0, 3)); + cb(krotated, "krotated", il); - token = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif + qpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + cb(qpass, "qpass", il); - token = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); + kpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + cb(kpass, "kpass", il); - lm_ggml_allocr_alloc(lctx.alloc, token); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(token)); - } - } + struct lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, qrotated, qpass); + cb(Qcur, "Qcur", il); - { - // Compute position embeddings. - struct lm_ggml_tensor * inp_positions = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - lm_ggml_allocr_alloc(lctx.alloc, inp_positions); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < n_tokens; ++i) { - ((int32_t *) inp_positions->data)[i] = batch.pos[i]; - } - } - lm_ggml_set_name(inp_positions, "inp_positions"); + struct lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, krotated, kpass); + cb(Kcur, "Kcur", il); - position = lm_ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); - } + struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + cb(Q, "Q", il); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + Kcur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + cb(Kcur, "Kcur", il); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); + struct lm_ggml_tensor * Vcur = lm_ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + ); + cb(Vcur, "Vcur", il); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } + // TODO: not tested, could be broken + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - } - } - - inpL = lm_ggml_add(ctx0, token, position); - lm_ggml_set_name(inpL, "inpL"); - - for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - } - - { - // Self Attention - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - - struct lm_ggml_tensor * tmpq = lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct lm_ggml_tensor * tmpk = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct lm_ggml_tensor * tmpv = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); - struct lm_ggml_tensor * Qcur = tmpq; - struct lm_ggml_tensor * Kcur = tmpk; + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + // feed-forward network { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - lm_ggml_set_name(Vcur, "Vcur"); - - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - lm_ggml_set_name(k, "k"); - - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } - - struct lm_ggml_tensor * Q = - lm_ggml_permute(ctx0, - lm_ggml_cpy(ctx0, - Qcur, - lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - lm_ggml_set_name(K, "K"); - - // K * Q - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - lm_ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - lm_ggml_set_name(V, "V"); - - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - lm_ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - } - - // Projection - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - - // Add the input - cur = lm_ggml_add(ctx0, cur, inpL); - - struct lm_ggml_tensor * inpFF = cur; - - // FF - { - // Norm - { - cur = lm_ggml_norm(ctx0, inpFF, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - - // GELU activation - cur = lm_ggml_gelu(ctx0, cur); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - // Projection - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + inpL = cur; } - inpL = lm_ggml_add(ctx0, cur, inpFF); - } + cur = inpL; - // Output Norm - { - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - } - lm_ggml_set_name(cur, "result_norm"); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - lm_ggml_build_forward_expand(gf, cur); - lm_ggml_free(ctx0); + lm_ggml_build_forward_expand(gf, cur); - return gf; -} + return gf; + } -static struct lm_ggml_cgraph * llm_build_persimmon( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; + struct lm_ggml_cgraph * build_refact() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - const auto & kv_self = lctx.kv_self; + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - LM_GGML_ASSERT(!!kv_self.ctx); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - const auto & cparams = lctx.cparams; - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - const int n_gpu_layers = model.n_gpu_layers; + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * inpSA = inpL; + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + // self-attention + { + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - const bool do_rope_shift = lm_ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - auto & buf_compute = lctx.buf_compute; - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct lm_ggml_context * ctx0 = lm_ggml_init(params); + Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); - } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - } - } - struct lm_ggml_tensor * KQ_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - lm_ggml_set_name(KQ_pos, "KQ_pos"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - if (do_rope_shift) { - struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - lm_ggml_set_name(K_shift, "K_shift"); - lm_ggml_allocr_alloc(lctx.alloc, K_shift); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. - lm_ggml_rope_custom_inplace(ctx0, - lm_ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - lm_ggml_build_forward_expand(gf, tmp); - } - } - for (int il=0; il < n_layer; ++il) { - struct lm_ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; - { - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - cur = lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - cur = lm_ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - offload_func(cur); - lm_ggml_format_name(cur, "input_layernorm_%d", il); - } - // self attention - { - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); - - // split qkv - LM_GGML_ASSERT(n_head_kv == n_head); - lm_ggml_set_name(cur, format("qkv_%d", il).c_str()); - struct lm_ggml_tensor * tmpqkv = lm_ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - offload_func_kq(tmpqkv); - struct lm_ggml_tensor * tmpqkv_perm = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - offload_func_kq(tmpqkv_perm); - lm_ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - struct lm_ggml_tensor * tmpq = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - 0 - ); - offload_func_kq(tmpq); - struct lm_ggml_tensor * tmpk = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens - ); - offload_func_kq(tmpk); - // Q/K Layernorm - tmpq = lm_ggml_norm(ctx0, tmpq, norm_eps); - offload_func_kq(tmpq); - tmpq = lm_ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - offload_func_kq(tmpq); - tmpq = lm_ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - offload_func_kq(tmpq); - - tmpk = lm_ggml_norm(ctx0, tmpk, norm_eps); - offload_func_v(tmpk); - tmpk = lm_ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - offload_func_v(tmpk); - tmpk = lm_ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - offload_func_v(tmpk); - - // RoPE the first n_rot of q/k, pass the other half, and concat. - struct lm_ggml_tensor * qrot = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpq) * n_embd_head, - lm_ggml_element_size(tmpq) * n_embd_head * n_head, - 0 - ); - offload_func_kq(qrot); - lm_ggml_format_name(qrot, "qrot_%d", il); - struct lm_ggml_tensor * krot = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpk) * n_embd_head, - lm_ggml_element_size(tmpk) * n_embd_head * n_head, - 0 - ); - offload_func_kq(krot); - lm_ggml_format_name(krot, "krot_%d", il); - - // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct lm_ggml_tensor * qpass = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpq) * n_embd_head, - lm_ggml_element_size(tmpq) * n_embd_head * n_head, - lm_ggml_element_size(tmpq) * n_rot - ); - offload_func_kq(qpass); - lm_ggml_format_name(qpass, "qpass_%d", il); - struct lm_ggml_tensor * kpass = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - lm_ggml_element_size(tmpk) * n_embd_head, - lm_ggml_element_size(tmpk) * n_embd_head * n_head, - lm_ggml_element_size(tmpk) * n_rot - ); - offload_func_kq(kpass); - lm_ggml_format_name(kpass, "kpass_%d", il); + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - struct lm_ggml_tensor * qrotated = lm_ggml_rope_custom( - ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ); - offload_func_kq(qrotated); - struct lm_ggml_tensor * krotated = lm_ggml_rope_custom( - ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ); - offload_func_kq(krotated); - // ggml currently only supports concatenation on dim=2 - // so we need to permute qrot, qpass, concat, then permute back. - qrotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - offload_func_kq(qrotated); - krotated = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - offload_func_kq(krotated); - - qpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - offload_func_kq(qpass); - kpass = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - offload_func_kq(kpass); - - struct lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, qrotated, qpass); - offload_func_kq(Qcur); - struct lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, krotated, kpass); - offload_func_kq(Kcur); - - struct lm_ggml_tensor * Q = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - offload_func_kq(Q); - - Kcur = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - offload_func_kq(Kcur); - { - struct lm_ggml_tensor * tmpv = lm_ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - lm_ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - ); - offload_func_v(tmpv); - // store K, V in cache - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - lm_ggml_set_name(Vcur, "Vcur"); - - struct lm_ggml_tensor * k = lm_ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); - - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - lm_ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } - struct lm_ggml_tensor * K = lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - - offload_func_kq(K); - lm_ggml_format_name(K, "K_%d", il); - - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_kq(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); - - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); - - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - cur = lm_ggml_add(ctx0, cur, model.layers[il].bo); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); - } - - struct lm_ggml_tensor * inpFF = lm_ggml_add(ctx0, residual, cur); - offload_func(inpFF); - lm_ggml_set_name(inpFF, "inpFF"); - { - // MLP + // feed-forward network { - // Norm - cur = lm_ggml_norm(ctx0, inpFF, norm_eps); - offload_func(cur); - cur = lm_ggml_add(ctx0, - lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - lm_ggml_set_name(cur, "ffn_norm"); - offload_func(cur); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); - - cur = lm_ggml_add(ctx0, cur, model.layers[il].b3); - offload_func(cur); - lm_ggml_set_name(cur, "result_ffn_up"); - - cur = lm_ggml_sqr(ctx0, lm_ggml_relu(ctx0, cur)); - lm_ggml_set_name(cur, "result_ffn_act"); - offload_func(cur); - offload_func(cur->src[0]); - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - cur = lm_ggml_add(ctx0, - cur, - model.layers[il].b2); - offload_func(cur); - lm_ggml_set_name(cur, "outFF"); - } - cur = lm_ggml_add(ctx0, cur, inpFF); - offload_func(cur); - lm_ggml_set_name(cur, "inpFF_+_outFF"); - inpL = cur; - } - cur = inpL; - { - cur = lm_ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - cur = lm_ggml_mul(ctx0, cur, model.output_norm); - offload_func_nr(cur); - - cur = lm_ggml_add(ctx0, cur, model.output_norm_b); - // offload_func_nr(cur); - lm_ggml_set_name(cur, "result_norm"); - } - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); - lm_ggml_build_forward_expand(gf, cur); - lm_ggml_free(ctx0); - return gf; -} + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); -static struct lm_ggml_cgraph * llm_build_bloom( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + // input for next layer + inpL = cur; + } - const auto & kv_self = lctx.kv_self; + cur = inpL; - LM_GGML_ASSERT(!!kv_self.ctx); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + lm_ggml_build_forward_expand(gf, cur); - const float norm_eps = hparams.f_norm_eps; + return gf; + } - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + struct lm_ggml_cgraph * build_bloom() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - auto & buf_compute = lctx.buf_compute; + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - params.no_alloc = true; + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - struct lm_ggml_context * ctx0 = lm_ggml_init(params); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * token; - struct lm_ggml_tensor * inpL; + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - } - lm_ggml_set_name(inp_tokens, "inp_tokens"); + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - token = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - token = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - lm_ggml_allocr_alloc(lctx.alloc, token); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(token)); - } - } + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } + + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - } - // norm - { - inpL = lm_ggml_norm(ctx0, token, norm_eps); - inpL = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; } - lm_ggml_set_name(inpL, "inpL"); + struct lm_ggml_cgraph * build_mpt() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); - for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - } + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - { - // Self Attention - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - struct lm_ggml_tensor * tmpq = lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct lm_ggml_tensor * tmpk = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct lm_ggml_tensor * tmpv = lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - struct lm_ggml_tensor * Qcur = tmpq; - struct lm_ggml_tensor * Kcur = tmpk; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - // store key and value to memory + for (int il = 0; il < n_layer; ++il) { + struct lm_ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - lm_ggml_set_name(Vcur, "Vcur"); + cur = attn_norm; - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - lm_ggml_set_name(k, "k"); + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); + if (hparams.f_clamp_kqv > 0.0f) { + cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct lm_ggml_tensor * Q = - lm_ggml_permute(ctx0, - lm_ggml_cpy(ctx0, - Qcur, - lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - lm_ggml_set_name(Q, "Q"); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - lm_ggml_set_name(K, "K"); + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // K * Q - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - lm_ggml_set_name(KQ, "KQ"); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale_inplace(ctx0, KQ, KQ_scale); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } - struct lm_ggml_tensor * KQ_scaled_alibi = lm_ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + // Add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // KQ_masked = mask_past(KQ_scaled) - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - lm_ggml_set_name(KQ_masked, "KQ_masked"); + // feed forward + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } - // KQ = soft_max(KQ_masked) - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max_inplace(ctx0, KQ_masked); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cur = lm_ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - // split cached V into n_head heads - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - lm_ggml_set_name(V, "V"); + // input for next layer + inpL = cur; + } - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - lm_ggml_set_name(KQV, "KQV"); + cur = inpL; - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - lm_ggml_set_name(KQV_merged, "KQV_merged"); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - } + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // Projection - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + lm_ggml_build_forward_expand(gf, cur); - // Add the input - cur = lm_ggml_add(ctx0, cur, inpL); + return gf; + } +}; - struct lm_ggml_tensor * inpFF = cur; +// +// tensor offloading helpers +// +// TODO: will be removed with backend v2 + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; - // FF - { - // Norm - { - cur = lm_ggml_norm(ctx0, inpFF, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); +// TODO: will be removed with backend v2 +struct llm_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } } + } - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + + llm_offload_trie() { + root = new node; + } - // GELU activation - cur = lm_ggml_gelu(ctx0, cur); + llm_offload_trie(const std::unordered_map & map) { + root = new node; - // Projection - cur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + for (const auto & kv : map) { + add(kv.first, kv.second); } + } - inpL = lm_ggml_add(ctx0, cur, inpFF); + ~llm_offload_trie() { + delete root; } - // Output Norm - { - cur = lm_ggml_norm(ctx0, inpL, norm_eps); - cur = lm_ggml_add(ctx0, lm_ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + void add(const char * name, llm_offload_func_e func) { + node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + cur->children[c] = new node; + } + + cur = cur->children[c]; + } + + cur->func = func; } - lm_ggml_set_name(cur, "result_norm"); - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); + llm_offload_func_e find(const char * name) const { + const node * cur = root; - lm_ggml_build_forward_expand(gf, cur); + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; - lm_ggml_free(ctx0); + if (!c) { + break; + } - return gf; -} + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } -static struct lm_ggml_cgraph * llm_build_mpt( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + cur = cur->children[c]; + } - const auto & kv_self = lctx.kv_self; + return cur->func; + } - LM_GGML_ASSERT(!!kv_self.ctx); + node * root = nullptr; +}; - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); +// TODO: will be removed with backend v2 +static const std::unordered_map k_offload_map = { + //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + { "pos_embd", OFFLOAD_FUNC_NR }, + + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, + + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, + + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + + { "attn_norm", OFFLOAD_FUNC }, + { "attn_norm_2", OFFLOAD_FUNC }, + + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, + + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, + + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + + { "ffn_inp", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + + { "l_out", OFFLOAD_FUNC }, + + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; - const float norm_eps = hparams.f_norm_eps; - const float clamp_kqv = hparams.f_clamp_kqv; - const float max_alibi_bias = hparams.f_max_alibi_bias; +static llm_offload_trie k_offload_func_trie(k_offload_map); - const int n_gpu_layers = model.n_gpu_layers; +static struct lm_ggml_cgraph * llama_build_graph( + llama_context & lctx, + const llama_batch & batch) { + const auto & model = lctx.model; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = lm_ggml_allocr_is_measure(lctx.alloc); - auto & buf_compute = lctx.buf_compute; + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; - struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; +#ifdef LM_GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif - params.no_alloc = true; + int n_non_view = 0; // number of non-view tensors that have been processed by the callback - struct lm_ggml_context * ctx0 = lm_ggml_init(params); + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + // TODO: will be removed with backend v2 + llm_build_cb cb = [&](struct lm_ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + lm_ggml_format_name(cur, "%s-%d", name, il); + } else { + lm_ggml_set_name(cur, name); + } - lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + // + // allocate input tensors and set input data + // + // TODO: will be removed with backend v2 - struct lm_ggml_tensor * cur; - struct lm_ggml_tensor * inpL; + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - //int warmup = 0; - if (batch.token) { - struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.token) { + const int64_t n_tokens = cur->ne[0]; - lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); - //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; + memcpy(cur->data, batch.token, n_tokens*lm_ggml_element_size(cur)); + } + + alloc_inp_tokens = true; } - lm_ggml_set_name(inp_tokens, "inp_tokens"); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef LM_GGML_USE_MPI - LM_GGML_ASSERT(false && "not implemented"); -#endif + if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; - inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); + memcpy(cur->data, batch.embd, n_tokens*n_embd*lm_ggml_element_size(cur)); + } - lm_ggml_allocr_alloc(lctx.alloc, inpL); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); + alloc_inp_embd = true; } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + const int64_t n_tokens = cur->ne[0]; -#ifdef LM_GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; - } -#endif // LM_GGML_USE_CUBLAS + int32_t * data = (int32_t *) cur->data; - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - lm_ggml_set_name(KQ_mask, "KQ_mask"); - lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, lm_ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; } } + + alloc_inp_pos = true; } - } - for (int il = 0; il < n_layer; ++il) { - struct lm_ggml_tensor * attn_norm; + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - offload_func_t offload_func = llama_nop; + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + lm_ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } -#ifdef LM_GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = lm_ggml_cuda_assign_buffers_no_alloc; + alloc_inp_KQ_scale = true; } -#endif // LM_GGML_USE_CUBLAS - // self-attention - // TODO: refactor into common function (shared with LLaMA) - { - attn_norm = lm_ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - attn_norm = lm_ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - offload_func(attn_norm); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; - if (1) { - cur = attn_norm; + float * data = (float *) cur->data; + memset(data, 0, lm_ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } } - // compute QKV + alloc_inp_KQ_mask = true; + } + + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + lm_ggml_allocr_alloc(lctx.alloc, cur); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; - if (clamp_kqv > 0.0f) { - cur = lm_ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - offload_func_kq(cur); + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } } - const size_t wsize = lm_ggml_type_size(cur->type); + alloc_inp_K_shift = true; + } - struct lm_ggml_tensor * Qcur = lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); - offload_func_kq(Qcur); + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; + } - struct lm_ggml_tensor * Kcur = lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); - offload_func_kq(Kcur); + if (cur->op != LM_GGML_OP_NONE) { + n_non_view++; + } - struct lm_ggml_tensor * tmpv = lm_ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_kq(Kcur); + // + // offload layers + // + // TODO: will be removed with backend v2 - lm_ggml_set_name(Qcur, "Qcur"); - lm_ggml_set_name(Kcur, "Kcur"); +//#define LLAMA_OFFLOAD_DEBUG - { - struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, lm_ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - lm_ggml_set_name(Vcur, "Vcur"); - - struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - lm_ggml_set_name(k, "k"); - - struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*lm_ggml_element_size(kv_self.v), - (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); - offload_func_v(v); - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); - } + if (!do_offload) { + return; + } - struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - lm_ggml_set_name(Q, "Q"); - - struct lm_ggml_tensor * K = - lm_ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - lm_ggml_element_size(kv_self.k)*n_embd_gqa, - lm_ggml_element_size(kv_self.k)*n_embd_head, - lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - lm_ggml_set_name(K, "K"); - - struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - lm_ggml_set_name(KQ, "KQ"); - - struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - lm_ggml_set_name(KQ_scaled, "KQ_scaled"); - - // TODO: replace with lm_ggml_add() - struct lm_ggml_tensor * KQ_scaled_alibi = - lm_ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - offload_func_kq(KQ_scaled_alibi); - lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - lm_ggml_set_name(KQ_masked, "KQ_masked"); - - struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct lm_ggml_tensor * V = - lm_ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - lm_ggml_element_size(kv_self.v)*n_ctx, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - lm_ggml_set_name(V, "V"); - - struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - lm_ggml_set_name(KQV, "KQV"); - - struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - lm_ggml_set_name(KQV_merged, "KQV_merged"); - - cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - lm_ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - lm_ggml_set_name(cur, "result_wo"); - } - - // Add the input - cur = lm_ggml_add(ctx0, cur, inpL); - offload_func(cur); - - struct lm_ggml_tensor * attn_out = cur; - - // feed forward - { - // Norm - { - cur = lm_ggml_norm(ctx0, attn_out, norm_eps); - offload_func(cur); + const int n_layer = model.hparams.n_layer; - cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - } + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + // should we offload the final norm? yes if we are not computing embeddings + const bool offload_emb = lctx.embedding.empty(); - cur = lm_ggml_gelu(ctx0, cur); - offload_func(cur); - cur = lm_ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - } + static const std::unordered_map> k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, +#ifdef LM_GGML_USE_CUBLAS + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, +#endif // LM_GGML_USE_CUBLAS + }; - cur = lm_ggml_add(ctx0, cur, attn_out); - offload_func(cur); - // input for next layer - inpL = cur; - } + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(name); - cur = inpL; + if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + } +#endif - // norm - { - cur = lm_ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + return; + } - cur = lm_ggml_mul(ctx0, cur, model.output_norm); - lm_ggml_set_name(cur, "result_norm"); - } + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: LM_GGML_ASSERT(false); + } - cur = lm_ggml_mul_mat(ctx0, model.output, cur); - lm_ggml_set_name(cur, "result_output"); + offload_func_t func = lm_ggml_offload_nop; - lm_ggml_build_forward_expand(gf, cur); + // this is needed for compatibility with Metal for example +#ifdef LM_GGML_USE_CUBLAS + static offload_func_t lm_ggml_offload_gpu = lm_ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t lm_ggml_offload_gpu = lm_ggml_offload_nop; +#endif - lm_ggml_free(ctx0); + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = lm_ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = lm_ggml_offload_gpu; break; + default: LM_GGML_ASSERT(false); + } - return gf; -} + // apply offload function to the tensor + func(cur); -static struct lm_ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; +#ifdef LLAMA_OFFLOAD_DEBUG + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } +#endif + }; struct lm_ggml_cgraph * result = NULL; + struct llm_build_context llm(lctx, batch, cb, worst_case); + + llm.init(); + switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch); + result = llm.build_llama(); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch); + result = llm.build_baichuan(); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch); + result = llm.build_falcon(); } break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, batch); + result = llm.build_starcoder(); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch); + result = llm.build_persimmon(); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch); + result = llm.build_refact(); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch); + result = llm.build_bloom(); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch); + result = llm.build_mpt(); } break; default: LM_GGML_ASSERT(false); } + llm.free(); + + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } + } + return result; } @@ -5970,8 +5186,6 @@ static int llama_decode_internal( } } - lm_ggml_cuda_set_mul_mat_q(cparams.mul_mat_q); - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed if (!lctx.embedding.empty()) { embeddings->backend = LM_GGML_BACKEND_CPU; @@ -5991,11 +5205,14 @@ static int llama_decode_internal( } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || - model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || - model.arch == LLM_ARCH_MPT; + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || + model.arch == LLM_ARCH_BAICHUAN || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_MPT || + model.arch == LLM_ARCH_STARCODER; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (lm_ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; @@ -6022,11 +5239,20 @@ static int llama_decode_internal( #endif // update the kv ring buffer - lctx.kv_self.has_shift = false; - lctx.kv_self.head += n_tokens; - // Ensure kv cache head points to a valid index. - if (lctx.kv_self.head >= lctx.kv_self.size) { - lctx.kv_self.head = 0; + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + + kv_self.head += n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } } #ifdef LM_GGML_PERF @@ -6041,6 +5267,8 @@ static int llama_decode_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; @@ -7307,6 +6535,32 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } } +void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p <= 0.0f || !candidates->size) { + return; + } + + llama_sample_softmax(ctx, candidates); + + const int64_t t_start_sample_us = lm_ggml_time_us(); + + float scale = candidates->data[0].p; // scale by max prob + size_t i = 1; // first token always matches + + for (; i < candidates->size; ++i) { + if (candidates->data[i].p < p * scale && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + candidates->size = i; + + if (ctx) { + ctx->t_sample_us += lm_ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; @@ -7996,6 +7250,24 @@ struct no_init { no_init() { /* do nothing */ } }; +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void llama_convert_tensor_internal( struct lm_ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -8054,14 +7326,14 @@ static void llama_convert_tensor_internal( workers.clear(); } -#ifdef LM_GGML_USE_K_QUANTS static lm_ggml_type get_k_quant_type( - lm_ggml_type new_type, const lm_ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, - int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 + quantize_state_internal & qs, + lm_ggml_type new_type, const lm_ggml_tensor * tensor, llama_ftype ftype ) { const std::string name = lm_ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const auto tn = LLM_TN(model.arch); + const llm_arch arch = qs.model.arch; + const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; @@ -8069,7 +7341,7 @@ static lm_ggml_type get_k_quant_type( if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = LM_GGML_TYPE_Q8_0; } else if (new_type != LM_GGML_TYPE_Q8_0) { @@ -8078,46 +7350,46 @@ static lm_ggml_type get_k_quant_type( } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_attention_wv < 2 ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; + new_type = qs.i_attention_wv < 2 ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(*i_attention_wv, n_attention_wv)) new_type = LM_GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = LM_GGML_TYPE_Q5_K; + use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = LM_GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = LM_GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = LM_GGML_TYPE_Q6_K; - if (model.type == MODEL_70B) { + (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = LM_GGML_TYPE_Q6_K; + if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == LM_GGML_TYPE_Q3_K || new_type == LM_GGML_TYPE_Q4_K) new_type = LM_GGML_TYPE_Q5_K; } - ++*i_attention_wv; + ++qs.i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q5_K - : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? LM_GGML_TYPE_Q4_K + new_type = qs.i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = model.arch == LLM_ARCH_FALCON ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q5_K; + new_type = arch == LLM_ARCH_FALCON ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (model.arch == LLM_ARCH_FALCON) { - new_type = *i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q6_K : - use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; + if (arch == LLM_ARCH_FALCON) { + new_type = qs.i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q6_K : + use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; } else { - if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; + if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { new_type = LM_GGML_TYPE_Q5_K; } - ++*i_feed_forward_w2; + ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (model.arch != LLM_ARCH_FALCON) { + if (arch != LLM_ARCH_FALCON) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = LM_GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = LM_GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; @@ -8144,25 +7416,27 @@ static lm_ggml_type get_k_quant_type( int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, lm_ggml_type_name(new_type)); convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; } } if (convert_incompatible_tensor) { - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - new_type = LM_GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - new_type = LM_GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); + switch (new_type) { + case LM_GGML_TYPE_Q2_K: new_type = LM_GGML_TYPE_Q4_0; break; + case LM_GGML_TYPE_Q3_K: new_type = LM_GGML_TYPE_Q4_1; break; + case LM_GGML_TYPE_Q4_K: new_type = LM_GGML_TYPE_Q5_0; break; + case LM_GGML_TYPE_Q5_K: new_type = LM_GGML_TYPE_Q5_1; break; + case LM_GGML_TYPE_Q6_K: new_type = LM_GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", lm_ggml_type_name(new_type)); + ++qs.n_fallback; } return new_type; } -#endif static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { lm_ggml_type quantized_type; @@ -8177,7 +7451,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_F16: quantized_type = LM_GGML_TYPE_F16; break; case LLAMA_FTYPE_ALL_F32: quantized_type = LM_GGML_TYPE_F32; break; -#ifdef LM_GGML_USE_K_QUANTS // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = LM_GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: @@ -8188,7 +7461,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = LM_GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = LM_GGML_TYPE_Q6_K; break; -#endif + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -8215,6 +7488,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(ml, model); llm_load_hparams(ml, model); + struct quantize_state_internal qs(model, params); + if (params->only_copy) { ftype = model.ftype; } @@ -8227,10 +7502,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); -#ifdef LM_GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - for (int i = 0; i < ml.n_tensors; ++i) { struct lm_ggml_tensor * meta = ml.get_tensor_meta(i); @@ -8238,21 +7509,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { - ++n_attention_wv; + ++qs.n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - ++n_feed_forward_w2; + ++qs.n_feed_forward_w2; } } - if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); } - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; -#endif - size_t total_size_org = 0; size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); @@ -8316,11 +7583,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; -#ifdef LM_GGML_USE_K_QUANTS - new_type = get_k_quant_type( - new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 - ); -#endif + if (!params->pure) { + new_type = get_k_quant_type(qs, new_type, tensor, ftype); + } + // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. quantize = tensor->type != new_type; @@ -8445,6 +7711,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("\n"); } } + + if (qs.n_fallback > 0) { + LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", + __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + } } static int llama_apply_lora_from_file_internal( @@ -8609,14 +7880,14 @@ static int llama_apply_lora_from_file_internal( lm_ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = lm_ggml_offload_nop; + offload_func_t offload_func_force_inplace = lm_ggml_offload_nop; #ifdef LM_GGML_USE_CUBLAS if (dest_t->backend == LM_GGML_BACKEND_GPU || dest_t->backend == LM_GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != LM_GGML_TYPE_F16) { throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); } offload_func = lm_ggml_cuda_assign_buffers; offload_func_force_inplace = lm_ggml_cuda_assign_buffers_force_inplace; @@ -8751,8 +8022,14 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ LM_GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ LM_GGML_DEFAULT_N_THREADS, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, + /*.yarn_ext_factor =*/ -1.0f, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_orig_ctx =*/ 0, /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, @@ -8769,6 +8046,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, + /*.pure =*/ false, }; return result; @@ -8838,10 +8116,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_gpu_layers, - params.main_gpu, params.tensor_split, - params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; @@ -8867,13 +8142,35 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + } if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -9129,8 +8426,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } -void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) { - llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1); +void llama_kv_cache_clear(struct llama_context * ctx) { + llama_kv_cache_clear(ctx->kv_self); } void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { @@ -9576,7 +8873,7 @@ int llama_eval( llama_token * tokens, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); if (ret < 0) { @@ -9591,7 +8888,7 @@ int llama_eval_embd( float * embd, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; diff --git a/cpp/llama.h b/cpp/llama.h index fa6afd98..d8069999 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -106,6 +106,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -167,18 +175,24 @@ extern "C" { }; struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - uint32_t n_ctx; // text context, 0 = from model - uint32_t n_batch; // prompt processing maximum batch size - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + uint32_t seed; // RNG seed, -1 for random + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. - bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only @@ -191,6 +205,7 @@ extern "C" { bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // disable k-quant mixtures and quantize all tensors to the same type } llama_model_quantize_params; // grammar types @@ -333,17 +348,14 @@ extern "C" { LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), "avoid using this, it will be removed in the future, instead - count the tokens in user code"); - // Remove all tokens data of cells in [c0, c1) - // c0 < 0 : [0, c1] - // c1 < 0 : [c0, inf) - LLAMA_API void llama_kv_cache_tokens_rm( - struct llama_context * ctx, - int32_t c0, - int32_t c1); + // Clear the KV cache + LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) + // seq_id < 0 : match any sequence + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, @@ -600,6 +612,13 @@ extern "C" { float p, size_t min_keep); + /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 + LLAMA_API void llama_sample_min_p( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. LLAMA_API void llama_sample_tail_free( struct llama_context * ctx, @@ -658,6 +677,7 @@ extern "C" { float * mu); /// @details Selects the token with the highest probability. + /// Does not compute the token probabilities. Use llama_sample_softmax() instead. LLAMA_API llama_token llama_sample_token_greedy( struct llama_context * ctx, llama_token_data_array * candidates); diff --git a/cpp/log.h b/cpp/log.h index a5084d84..374404e2 100644 --- a/cpp/log.h +++ b/cpp/log.h @@ -97,38 +97,56 @@ #define LOG_TEE_TARGET stderr #endif -// NOTE: currently disabled as it produces too many log files +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + // Utility to obtain "pid" like unique process id and use it when creating log files. -//inline std::string log_get_pid() -//{ -// static std::string pid; -// if (pid.empty()) -// { -// // std::this_thread::get_id() is the most portable way of obtaining a "process id" -// // it's not the same as "pid" but is unique enough to solve multiple instances -// // trying to write to the same log. -// std::stringstream ss; -// ss << std::this_thread::get_id(); -// pid = ss.str(); -// } -// -// return pid; -//} +inline std::string log_get_pid() +{ + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. + std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } + + return pid; +} // Utility function for generating log file names with unique id based on thread id. // invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" // where the number is a runtime id of the current thread. -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) // INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) +inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) { + static bool _multilog = false; + + if (multilog != LogTriStateSame) + { + _multilog = multilog == LogTriStateTrue; + } + std::stringstream buf; buf << log_file_basename; - //buf << "."; - //buf << log_get_pid(); + if (_multilog) + { + buf << "."; + buf << log_get_pid(); + } buf << "."; buf << log_file_extension; @@ -213,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #define LOG_TEE_FLF_VAL ,"" #endif -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - // INTERNAL, DO NOT USE // USE LOG() INSTEAD // @@ -328,16 +337,23 @@ enum LogTriState #endif // INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) { - static bool _initialized{false}; - static bool _disabled{(filename.empty() && target == nullptr)}; + static bool _initialized = false; + static bool _append = false; + static bool _disabled = filename.empty() && target == nullptr; static std::string log_current_filename{filename}; static FILE *log_current_target{target}; static FILE *logfile = nullptr; if (change) { + if (append != LogTriStateSame) + { + _append = append == LogTriStateTrue; + return logfile; + } + if (disable == LogTriStateTrue) { // Disable primary target @@ -390,7 +406,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } } - logfile = fopen(filename.c_str(), "w"); + logfile = fopen(filename.c_str(), _append ? "a" : "w"); } if (!logfile) @@ -411,9 +427,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } // INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) { - return log_handler1_impl(change, disable, filename, target); + return log_handler1_impl(change, append, disable, filename, target); } // Disables logs entirely at runtime. @@ -424,7 +440,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri // INTERNAL, DO NOT USE inline FILE *log_disable_impl() { - return log_handler1_impl(true, LogTriStateTrue); + return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); } // Enables logs at runtime. @@ -433,19 +449,31 @@ inline FILE *log_disable_impl() // INTERNAL, DO NOT USE inline FILE *log_enable_impl() { - return log_handler1_impl(true, LogTriStateFalse); + return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); } // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) #define log_set_target(target) log_set_target_impl(target) // INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } // INTERNAL, DO NOT USE inline FILE *log_handler() { return log_handler1_impl(); } +// Enable or disable creating separate log files for each run. +// can ONLY be invoked BEFORE first log use. +#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") +// Enable or disable append mode for log file. +// can ONLY be invoked BEFORE first log use. +#define log_append(enable) log_append_impl(enable) +// INTERNAL, DO NOT USE +inline FILE *log_append_impl(bool enable) +{ + return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); +} + inline void log_test() { log_disable(); @@ -507,6 +535,18 @@ inline bool log_param_single_parse(const std::string & param) return true; } + if (param == "--log-new") + { + log_multilog(true); + return true; + } + + if (param == "--log-append") + { + log_append(true); + return true; + } + return false; } @@ -536,7 +576,9 @@ inline void log_print_usage() printf(" --log-disable Disable trace logs\n"); printf(" --log-enable Enable trace logs\n"); printf(" --log-file Specify a log filename (without extension)\n"); - printf(" Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ + printf(" --log-new Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"\n"); + printf(" --log-append Don't truncate the old log file.\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index 5258d4e8..1317024c 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) { void llama_sampling_reset(llama_sampling_context * ctx) { if (ctx->grammar != NULL) { llama_grammar_free(ctx->grammar); + ctx->grammar = NULL; } if (!ctx->parsed_grammar.rules.empty()) { @@ -89,10 +90,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, - params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, + params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); return std::string(result); @@ -110,6 +111,7 @@ llama_token llama_sampling_sample( const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; + const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; @@ -167,8 +169,12 @@ llama_token llama_sampling_sample( llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - if (temp <= 0) { - // greedy sampling + if (temp < 0.0) { + // greedy sampling, with probs + llama_sample_softmax(ctx_main, &cur_p); + id = cur_p.data[0].id; + } else if (temp == 0.0) { + // greedy sampling, no probs id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { @@ -186,6 +192,7 @@ llama_token llama_sampling_sample( llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); llama_sample_temp (ctx_main, &cur_p, temp); id = llama_sample_token(ctx_main, &cur_p); diff --git a/cpp/sampling.h b/cpp/sampling.h index 62ea6d4c..7c9b8dcf 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -14,6 +14,7 @@ typedef struct llama_sampling_params { int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 1b8054bb..8bf07c5e 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.2): + - llama-rn (0.3.0-rc.4): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1242,7 +1242,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: bd839b6cef0d2d94fd90bab43fed46efe1f4f961 + llama-rn: 0abcf4f4f58615499974d3a30876ddf8ca986012 RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/llama-rn.podspec b/llama-rn.podspec index f7c13dc9..30da99e8 100644 --- a/llama-rn.podspec +++ b/llama-rn.podspec @@ -2,7 +2,7 @@ require "json" package = JSON.parse(File.read(File.join(__dir__, "package.json"))) base_ld_flags = "-framework Accelerate -framework Foundation -framework Metal -framework MetalKit" -base_compiler_flags = "-fno-objc-arc -DLM_GGML_USE_ACCELERATE -DLM_GGML_USE_K_QUANTS -Wno-shorten-64-to-32" +base_compiler_flags = "-fno-objc-arc -DLM_GGML_USE_ACCELERATE -Wno-shorten-64-to-32" folly_compiler_flags = "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -Wno-comma" if ENV["RNLLAMA_DISABLE_METAL"] != "1" then @@ -11,7 +11,7 @@ end # Use base_optimizer_flags = "" for debug builds # base_optimizer_flags = "" -base_optimizer_flags = "-Ofast -DNDEBUG" +base_optimizer_flags = "-O3 -DNDEBUG" Pod::Spec.new do |s| s.name = "llama-rn" diff --git a/llama.cpp b/llama.cpp index 34b2a5e1..a75fa576 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 34b2a5e1ee4fe6295fb4420eb91131d743694c65 +Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 130694a5..7fd9bc8e 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -3,11 +3,6 @@ git submodule init git submodule update --recursive -cd llama.cpp -./scripts/build-info.sh > build-info.h -cd - - -cp ./llama.cpp/build-info.h ./cpp/build-info.h cp ./llama.cpp/ggml.h ./cpp/ggml.h cp ./llama.cpp/ggml.c ./cpp/ggml.c cp ./llama.cpp/ggml-metal.h ./cpp/ggml-metal.h @@ -17,10 +12,11 @@ cp ./llama.cpp/ggml-alloc.h ./cpp/ggml-alloc.h cp ./llama.cpp/ggml-alloc.c ./cpp/ggml-alloc.c cp ./llama.cpp/ggml-backend.h ./cpp/ggml-backend.h cp ./llama.cpp/ggml-backend.c ./cpp/ggml-backend.c +cp ./llama.cpp/ggml-impl.h ./cpp/ggml-impl.h cp ./llama.cpp/llama.h ./cpp/llama.h cp ./llama.cpp/llama.cpp ./cpp/llama.cpp -cp ./llama.cpp/k_quants.h ./cpp/k_quants.h -cp ./llama.cpp/k_quants.c ./cpp/k_quants.c +cp ./llama.cpp/ggml-quants.h ./cpp/ggml-quants.h +cp ./llama.cpp/ggml-quants.c ./cpp/ggml-quants.c cp ./llama.cpp/unicode.h ./cpp/unicode.h cp ./llama.cpp/common/log.h ./cpp/log.h cp ./llama.cpp/common/common.h ./cpp/common.h @@ -39,12 +35,13 @@ files=( "./cpp/ggml-metal.m" "./cpp/llama.h" "./cpp/llama.cpp" - "./cpp/k_quants.h" - "./cpp/k_quants.c" + "./cpp/ggml-quants.h" + "./cpp/ggml-quants.c" "./cpp/ggml-alloc.h" "./cpp/ggml-alloc.c" "./cpp/ggml-backend.h" "./cpp/ggml-backend.c" + "./cpp/ggml-impl.h" ) # Loop through each file and run the sed commands @@ -71,6 +68,7 @@ echo "Replacement completed successfully!" yarn example # Apply patch +patch -p0 -d ./cpp < ./scripts/common.cpp.patch patch -p0 -d ./cpp < ./scripts/log.h.patch patch -p0 -d ./cpp < ./scripts/llama.cpp.patch patch -p0 -d ./cpp < ./scripts/ggml-metal.m.patch diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch new file mode 100644 index 00000000..782ccbe5 --- /dev/null +++ b/scripts/common.cpp.patch @@ -0,0 +1,11 @@ +--- common.cpp.orig 2023-11-07 10:50:44 ++++ common.cpp 2023-11-07 10:50:46 +@@ -1225,8 +1225,6 @@ + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sparams; + +- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); +- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index add06b20..c1604689 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,11 +1,11 @@ ---- ggml-metal.m.orig 2023-10-27 12:54:53 -+++ ggml-metal.m 2023-10-27 12:54:54 +--- ggml-metal.m.orig 2023-11-02 10:42:43 ++++ ggml-metal.m 2023-11-02 10:43:38 @@ -209,7 +209,7 @@ } else { LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + NSString * sourcePath = [bundle pathForResource:@"ggml-metal-llama" ofType:@"metal"]; - LM_GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); - NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; - if (error) { + if (sourcePath == nil) { + LM_GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + sourcePath = @"ggml-metal.metal"; diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index e5e8b70f..649b1868 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,5 +1,5 @@ ---- llama.cpp.orig 2023-10-19 10:27:20 -+++ llama.cpp 2023-10-19 10:27:21 +--- llama.cpp.orig 2023-11-10 13:36:00 ++++ llama.cpp 2023-11-10 13:36:02 @@ -103,6 +103,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -737,16 +748,16 @@ +@@ -779,16 +790,16 @@ if (prefetch > 0) { // Advise the kernel to preload the mapped memory diff --git a/scripts/log.h.patch b/scripts/log.h.patch index f531adee..240348e1 100644 --- a/scripts/log.h.patch +++ b/scripts/log.h.patch @@ -1,6 +1,6 @@ ---- log.h.orig 2023-10-27 12:54:53 -+++ log.h 2023-10-27 12:54:54 -@@ -314,6 +314,19 @@ +--- log.h.orig 2023-11-02 10:45:07 ++++ log.h 2023-11-02 10:45:08 +@@ -323,6 +323,19 @@ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") #endif @@ -18,5 +18,5 @@ +#endif + // INTERNAL, DO NOT USE - inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) + inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) {