feat: sync llama.cpp (#22)
* feat: sync llama.cpp

* feat: update rn-llama.hpp

* fix: build of API changes

* feat: sync

* feat: add lora_scaled param

* fix(android): lora params
jhen0409 authored Oct 2, 2023
1 parent 9907c72 commit 8da7244
Showing 25 changed files with 3,640 additions and 1,900 deletions.
3 changes: 3 additions & 0 deletions android/src/main/java/com/rnllama/LlamaContext.java
@@ -56,6 +56,8 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true,
// String lora,
params.hasKey("lora") ? params.getString("lora") : "",
// float lora_scaled,
params.hasKey("lora_scaled") ? (float) params.getDouble("lora_scaled") : 1.0f,
// String lora_base,
params.hasKey("lora_base") ? params.getString("lora_base") : "",
// float rope_freq_base,
@@ -221,6 +223,7 @@ protected static native long initContext(
boolean use_mmap,
boolean memory_f16,
String lora,
float lora_scaled,
String lora_base,
float rope_freq_base,
float rope_freq_scale
11 changes: 7 additions & 4 deletions android/src/main/jni.cpp
@@ -131,6 +131,7 @@ Java_com_rnllama_LlamaContext_initContext(
jboolean use_mmap,
jboolean memory_f16,
jstring lora_str,
jfloat lora_scaled,
jstring lora_base_str,
jfloat rope_freq_base,
jfloat rope_freq_scale
@@ -160,10 +161,12 @@ Java_com_rnllama_LlamaContext_initContext(
defaultParams.memory_f16 = memory_f16;

const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
defaultParams.lora_adapter = lora_chars;

const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
defaultParams.lora_base = lora_base_chars;
if (!lora_chars) {
defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
defaultParams.lora_base = lora_base_chars;
defaultParams.use_mmap = false;
}

defaultParams.rope_freq_base = rope_freq_base;
defaultParams.rope_freq_scale = rope_freq_scale;
@@ -281,7 +284,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
llama->params.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
}

const int n_vocab = llama_n_vocab(llama->ctx);
const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
jsize logit_bias_len = env->GetArrayLength(logit_bias);

for (jsize i = 0; i < logit_bias_len; i++) {
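For context, a minimal sketch (not part of this diff) of the new lora_adapter shape the JNI code above now fills: a list of (path, scale) pairs instead of a single path string. The adapter path is a placeholder, and the llama_model_apply_lora_from_file call mentioned in the comment is assumed from upstream llama.cpp rather than shown in this commit.

#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

int main() {
    // Same shape as the gpt_params::lora_adapter field introduced in common.h below.
    std::vector<std::tuple<std::string, float>> lora_adapter;

    // Mirrors the JNI push_back above: each adapter carries its own user-defined scale.
    lora_adapter.push_back({"/path/to/lora.bin", 0.5f}); // placeholder path

    // llama.cpp's loader is expected to walk this list and apply each adapter with its
    // scale (e.g. via llama_model_apply_lora_from_file); that call is an assumption here.
    for (const auto & entry : lora_adapter) {
        std::printf("would apply LoRA %s with scale %.2f\n",
                    std::get<0>(entry).c_str(), std::get<1>(entry));
    }
    return 0;
}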
4 changes: 2 additions & 2 deletions cpp/build-info.h
@@ -1,8 +1,8 @@
#ifndef BUILD_INFO_H
#define BUILD_INFO_H

#define BUILD_NUMBER 1255
#define BUILD_COMMIT "7ddf185"
#define BUILD_NUMBER 1299
#define BUILD_COMMIT "f5ef5cf"
#define BUILD_COMPILER ""
#define BUILD_TARGET "unknown"

203 changes: 140 additions & 63 deletions cpp/common.cpp

Large diffs are not rendered by default.

31 changes: 21 additions & 10 deletions cpp/common.h
@@ -3,7 +3,6 @@
#pragma once

#include "llama.h"
#include "build-info.h"

#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
@@ -37,20 +36,23 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

// sampling parameters
int32_t top_k = 40; // <= 0 to use vocab size
@@ -84,8 +86,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -94,7 +96,6 @@ struct gpt_params {
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
@@ -108,30 +109,35 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = false; // insert new sequences for decoding on-the-fly

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string get_system_info(const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

void process_escapes(std::string& input);

//
// Model utils
//

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

//
@@ -141,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_param
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const struct llama_context * ctx,
const std::string & text,
bool add_bos);

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos);

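A minimal usage sketch (not part of this diff) of the two llama_tokenize overloads declared above; the new model-only overload allows tokenizing before any context exists. The model path is a placeholder and a real GGUF file is required to run this.

#include "common.h"
#include "llama.h"
#include <cstdio>
#include <tuple>
#include <vector>

int main() {
    gpt_params params;
    params.model = "model.gguf"; // placeholder; gpt_params' usual model-path field

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // Existing overload (now taking a const context) and the new model overload.
    std::vector<llama_token> toks_ctx   = llama_tokenize(ctx,   "Hello world", /*add_bos=*/true);
    std::vector<llama_token> toks_model = llama_tokenize(model, "Hello world", /*add_bos=*/true);
    std::printf("%zu tokens via ctx, %zu via model\n", toks_ctx.size(), toks_model.size());

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}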
@@ -182,7 +193,7 @@ std::string llama_detokenize_bpe(
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
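Also a minimal sketch (not part of this diff) of the model/context parameter split declared in this header: model parameters and context parameters are now derived separately from gpt_params. llama_load_model_from_file and llama_new_context_with_model are assumed from the llama.cpp API of this sync; "model.gguf" is a placeholder.

#include "common.h"
#include "llama.h"

int main() {
    gpt_params params;
    params.model           = "model.gguf"; // placeholder path
    params.n_threads_batch = 8;            // new field: threads for batch processing
    params.cont_batching   = true;         // new field: continuous batching
    // rope_freq_base/scale now default to 0.0f, i.e. take the model's own values.

    llama_model_params   mparams = llama_model_params_from_gpt_params(params);   // new helper
    llama_context_params cparams = llama_context_params_from_gpt_params(params);

    llama_model   * model = llama_load_model_from_file(params.model.c_str(), mparams); // assumed API
    llama_context * ctx   = llama_new_context_with_model(model, cparams);              // assumed API

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}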
10 changes: 8 additions & 2 deletions cpp/ggml-alloc.c
@@ -77,7 +77,7 @@ struct free_block {
size_t size;
};

#define MAX_FREE_BLOCKS 128
#define MAX_FREE_BLOCKS 256

struct lm_ggml_allocr {
void * data;
@@ -187,6 +187,7 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
}

tensor->data = addr;
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);

#ifdef LM_GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
@@ -218,7 +219,8 @@ static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_

size_t size = lm_ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);

#ifdef LM_GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
@@ -631,3 +633,7 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n(
size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) {
return lm_ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}

size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) {
return alloc->max_size;
}
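A minimal sketch (not part of this diff) of how the new lm_ggml_allocr_max_size() accessor can be used with a measuring allocator to size a buffer before committing real memory. lm_ggml_allocr_new_measure() and lm_ggml_allocr_free() are assumed from the existing ggml-alloc API and are not part of this commit.

#include "ggml.h"
#include "ggml-alloc.h"

// Returns an estimate of the memory needed to evaluate `graph`.
size_t measure_graph_size(struct lm_ggml_cgraph * graph) {
    // Measuring allocator: records sizes without touching real memory (assumed API).
    struct lm_ggml_allocr * measure = lm_ggml_allocr_new_measure(/*alignment=*/32);
    lm_ggml_allocr_alloc_graph(measure, graph);       // dry-run allocation of the graph
    size_t needed = lm_ggml_allocr_max_size(measure); // accessor added in this sync
    lm_ggml_allocr_free(measure);                     // assumed API
    return needed;
}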
1 change: 1 addition & 0 deletions cpp/ggml-alloc.h
@@ -19,6 +19,7 @@ LM_GGML_API bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor);
LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph);
LM_GGML_API size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc);


#ifdef __cplusplus
4 changes: 4 additions & 0 deletions cpp/ggml-metal.h
@@ -19,6 +19,8 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

@@ -33,6 +35,8 @@ struct lm_ggml_cgraph;
extern "C" {
#endif

void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data);

struct lm_ggml_metal_context;

// number of command buffers to use
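A minimal sketch (not part of this diff) of wiring up the new Metal log callback declared above. The lm_ggml_log_callback signature (level, text, user_data) is assumed to come from the ggml.h include added in this hunk.

#include "ggml.h"
#include "ggml-metal.h"
#include <cstdio>

static void metal_log(lm_ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    std::fprintf(stderr, "[ggml-metal] %s", text); // forward backend logs to stderr
}

int main() {
    // Register the callback before initializing the Metal backend.
    lm_ggml_metal_log_set_callback(metal_log, /*user_data=*/nullptr);
    return 0;
}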