feat: sync llama.cpp (#22)
* feat: sync llama.cpp

* feat: update rn-llama.hpp

* fix: build of API changes

* feat: sync

* feat: add lora_scaled param

* fix(android): lora params
jhen0409 authored Oct 2, 2023
1 parent 9907c72 commit 8da7244
Showing 25 changed files with 3,640 additions and 1,900 deletions.
3 changes: 3 additions & 0 deletions android/src/main/java/com/rnllama/LlamaContext.java
@@ -56,6 +56,8 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true,
// String lora,
params.hasKey("lora") ? params.getString("lora") : "",
// float lora_scaled,
params.hasKey("lora_scaled") ? (float) params.getDouble("lora_scaled") : 1.0f,
// String lora_base,
params.hasKey("lora_base") ? params.getString("lora_base") : "",
// float rope_freq_base,
@@ -221,6 +223,7 @@ protected static native long initContext(
boolean use_mmap,
boolean memory_f16,
String lora,
float lora_scaled,
String lora_base,
float rope_freq_base,
float rope_freq_scale
11 changes: 7 additions & 4 deletions android/src/main/jni.cpp
@@ -131,6 +131,7 @@ Java_com_rnllama_LlamaContext_initContext(
jboolean use_mmap,
jboolean memory_f16,
jstring lora_str,
jfloat lora_scaled,
jstring lora_base_str,
jfloat rope_freq_base,
jfloat rope_freq_scale
@@ -160,10 +161,12 @@ Java_com_rnllama_LlamaContext_initContext(
defaultParams.memory_f16 = memory_f16;

const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
defaultParams.lora_adapter = lora_chars;

const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
defaultParams.lora_base = lora_base_chars;
if (!lora_chars) {
defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
defaultParams.lora_base = lora_base_chars;
defaultParams.use_mmap = false;
}

defaultParams.rope_freq_base = rope_freq_base;
defaultParams.rope_freq_scale = rope_freq_scale;
@@ -281,7 +284,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
llama->params.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
}

const int n_vocab = llama_n_vocab(llama->ctx);
const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
jsize logit_bias_len = env->GetArrayLength(logit_bias);

for (jsize i = 0; i < logit_bias_len; i++) {
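For context, a minimal sketch (not part of this diff) of the new lora_adapter shape the JNI code above now fills: a list of (path, scale) pairs instead of a single path string. The adapter path is a placeholder, and the llama_model_apply_lora_from_file call mentioned in the comment is assumed from upstream llama.cpp rather than shown in this commit.

#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

int main() {
    // Same shape as the gpt_params::lora_adapter field introduced in common.h below.
    std::vector<std::tuple<std::string, float>> lora_adapter;

    // Mirrors the JNI push_back above: each adapter carries its own user-defined scale.
    lora_adapter.push_back({"/path/to/lora.bin", 0.5f}); // placeholder path

    // llama.cpp's loader is expected to walk this list and apply each adapter with its
    // scale (e.g. via llama_model_apply_lora_from_file); that call is an assumption here.
    for (const auto & entry : lora_adapter) {
        std::printf("would apply LoRA %s with scale %.2f\n",
                    std::get<0>(entry).c_str(), std::get<1>(entry));
    }
    return 0;
}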
4 changes: 2 additions & 2 deletions cpp/build-info.h
@@ -1,8 +1,8 @@
#ifndef BUILD_INFO_H
#define BUILD_INFO_H

#define BUILD_NUMBER 1255
#define BUILD_COMMIT "7ddf185"
#define BUILD_NUMBER 1299
#define BUILD_COMMIT "f5ef5cf"
#define BUILD_COMPILER ""
#define BUILD_TARGET "unknown"

203 changes: 140 additions & 63 deletions cpp/common.cpp

Large diffs are not rendered by default.

31 changes: 21 additions & 10 deletions cpp/common.h
@@ -3,7 +3,6 @@
#pragma once

#include "llama.h"
#include "build-info.h"

#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
@@ -37,20 +36,23 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

// sampling parameters
int32_t top_k = 40; // <= 0 to use vocab size
@@ -84,8 +86,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -94,7 +96,6 @@ struct gpt_params {
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
@@ -108,30 +109,35 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = false; // insert new sequences for decoding on-the-fly

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string get_system_info(const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

void process_escapes(std::string& input);

//
// Model utils
//

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

//
@@ -141,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_param
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const struct llama_context * ctx,
const std::string & text,
bool add_bos);

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos);

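A minimal usage sketch (not part of this diff) of the two llama_tokenize overloads declared above; the new model-only overload allows tokenizing before any context exists. The model path is a placeholder and a real GGUF file is required to run this.

#include "common.h"
#include "llama.h"
#include <cstdio>
#include <tuple>
#include <vector>

int main() {
    gpt_params params;
    params.model = "model.gguf"; // placeholder; gpt_params' usual model-path field

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // Existing overload (now taking a const context) and the new model overload.
    std::vector<llama_token> toks_ctx   = llama_tokenize(ctx,   "Hello world", /*add_bos=*/true);
    std::vector<llama_token> toks_model = llama_tokenize(model, "Hello world", /*add_bos=*/true);
    std::printf("%zu tokens via ctx, %zu via model\n", toks_ctx.size(), toks_model.size());

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}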
@@ -182,7 +193,7 @@ std::string llama_detokenize_bpe(
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
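Also a minimal sketch (not part of this diff) of the model/context parameter split declared in this header: model parameters and context parameters are now derived separately from gpt_params. llama_load_model_from_file and llama_new_context_with_model are assumed from the llama.cpp API of this sync; "model.gguf" is a placeholder.

#include "common.h"
#include "llama.h"

int main() {
    gpt_params params;
    params.model           = "model.gguf"; // placeholder path
    params.n_threads_batch = 8;            // new field: threads for batch processing
    params.cont_batching   = true;         // new field: continuous batching
    // rope_freq_base/scale now default to 0.0f, i.e. take the model's own values.

    llama_model_params   mparams = llama_model_params_from_gpt_params(params);   // new helper
    llama_context_params cparams = llama_context_params_from_gpt_params(params);

    llama_model   * model = llama_load_model_from_file(params.model.c_str(), mparams); // assumed API
    llama_context * ctx   = llama_new_context_with_model(model, cparams);              // assumed API

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}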
10 changes: 8 additions & 2 deletions cpp/ggml-alloc.c
@@ -77,7 +77,7 @@ struct free_block {
size_t size;
};

#define MAX_FREE_BLOCKS 128
#define MAX_FREE_BLOCKS 256

struct lm_ggml_allocr {
void * data;
@@ -187,6 +187,7 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
}

tensor->data = addr;
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);

#ifdef LM_GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
@@ -218,7 +219,8 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_

size_t size = lm_ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);

#ifdef LM_GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
@@ -631,3 +633,7 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n(
size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) {
return lm_ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}

size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) {
return alloc->max_size;
}
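A minimal sketch (not part of this diff) of how the new lm_ggml_allocr_max_size() accessor can be used with a measuring allocator to size a buffer before committing real memory. lm_ggml_allocr_new_measure() and lm_ggml_allocr_free() are assumed from the existing ggml-alloc API and are not part of this commit.

#include "ggml.h"
#include "ggml-alloc.h"

// Returns an estimate of the memory needed to evaluate `graph`.
size_t measure_graph_size(struct lm_ggml_cgraph * graph) {
    // Measuring allocator: records sizes without touching real memory (assumed API).
    struct lm_ggml_allocr * measure = lm_ggml_allocr_new_measure(/*alignment=*/32);
    lm_ggml_allocr_alloc_graph(measure, graph);       // dry-run allocation of the graph
    size_t needed = lm_ggml_allocr_max_size(measure); // accessor added in this sync
    lm_ggml_allocr_free(measure);                     // assumed API
    return needed;
}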
1 change: 1 addition & 0 deletions cpp/ggml-alloc.h
@@ -19,6 +19,7 @@ LM_GGML_API bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor);
LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph);
LM_GGML_API size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc);


#ifdef __cplusplus
4 changes: 4 additions & 0 deletions cpp/ggml-metal.h
@@ -19,6 +19,8 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

@@ -33,6 +35,8 @@ struct lm_ggml_cgraph;
extern "C" {
#endif

void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data);

struct lm_ggml_metal_context;

// number of command buffers to use
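A minimal sketch (not part of this diff) of wiring up the new Metal log callback declared above. The lm_ggml_log_callback signature (level, text, user_data) is assumed to come from the ggml.h include added in this hunk.

#include "ggml.h"
#include "ggml-metal.h"
#include <cstdio>

static void metal_log(lm_ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    std::fprintf(stderr, "[ggml-metal] %s", text); // forward backend logs to stderr
}

int main() {
    // Register the callback before initializing the Metal backend.
    lm_ggml_metal_log_set_callback(metal_log, /*user_data=*/nullptr);
    return 0;
}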