feat: sync llama.cpp
jhen0409 committed Oct 11, 2023
1 parent 95b4fb2 commit 81f9cb2
Showing 17 changed files with 2,486 additions and 412 deletions.
1 change: 1 addition & 0 deletions android/src/main/CMakeLists.txt
@@ -8,6 +8,7 @@ set(RNLLAMA_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../cpp)
set(
SOURCE_FILES
${RNLLAMA_LIB_DIR}/ggml-alloc.c
${RNLLAMA_LIB_DIR}/ggml-backend.c
${RNLLAMA_LIB_DIR}/ggml.c
${RNLLAMA_LIB_DIR}/k_quants.c
${RNLLAMA_LIB_DIR}/common.cpp
4 changes: 2 additions & 2 deletions cpp/build-info.h
@@ -1,8 +1,8 @@
#ifndef BUILD_INFO_H
#define BUILD_INFO_H

#define BUILD_NUMBER 1338
#define BUILD_COMMIT "1faaae8"
#define BUILD_NUMBER 1364
#define BUILD_COMMIT "9f6ede1"
#define BUILD_COMPILER ""
#define BUILD_TARGET "unknown"

4 changes: 2 additions & 2 deletions cpp/common.cpp
@@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// store the external file name in params
params.prompt_file = argv[i];
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
if (!params.prompt.empty() && params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n-predict") {
@@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
if (params.cfg_negative_prompt.back() == '\n') {
if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
params.cfg_negative_prompt.pop_back();
}
} else if (arg == "--cfg-scale") {
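
Note: both hunks above add the same guard. Calling std::string::back() on an empty string is undefined behavior, which could be hit when the prompt file or the CFG negative-prompt file is empty. A minimal sketch of the guarded pattern (the helper name is illustrative, not part of the patch):

    #include <string>

    // Strip a single trailing newline read from a prompt file.
    // Check empty() first: back()/pop_back() on an empty std::string is UB.
    static void strip_trailing_newline(std::string & s) {
        if (!s.empty() && s.back() == '\n') {
            s.pop_back();
        }
    }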
169 changes: 62 additions & 107 deletions cpp/ggml-alloc.c
@@ -1,30 +1,12 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif


#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
#define MAX_FREE_BLOCKS 256

struct lm_ggml_allocr {
struct lm_ggml_backend_buffer * buffer;
bool buffer_owned;
void * data;
size_t size;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,28 +102,20 @@ static void remove_allocated_tensor(struct lm_ggml_allocr * alloc, struct lm_ggm
}
#endif

static size_t lm_ggml_allocr_get_alloc_size(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) {
return lm_ggml_nbytes(tensor);

UNUSED(alloc);
}

// check if a tensor is allocated by this buffer
static bool lm_ggml_allocr_is_own(struct lm_ggml_allocr * alloc, const struct lm_ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
return tensor->buffer == alloc->buffer;
}

static bool lm_ggml_is_view(struct lm_ggml_tensor * t) {
return t->view_src != NULL;
}

void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) {
#ifdef LM_GGML_ALLOCATOR_DEBUG
LM_GGML_ASSERT(!lm_ggml_is_view(tensor)); // views generally get data pointer from one of their sources
LM_GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = lm_ggml_allocr_get_alloc_size(alloc, tensor);

size_t size = lm_ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
size = aligned_offset(NULL, size, alloc->alignment);

AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *

tensor->data = addr;
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
tensor->buffer = alloc->buffer;
lm_ggml_backend_buffer_init_tensor(alloc->buffer, tensor);

#ifdef LM_GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void lm_ggml_allocr_free_tensor(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor) {
void * ptr = tensor->data;

if (lm_ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
return;
}

size_t size = lm_ggml_allocr_get_alloc_size(alloc, tensor);
void * ptr = tensor->data;

size_t size = lm_ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);

lm_ggml_backend_buffer_free_tensor(alloc->buffer, tensor);

#ifdef LM_GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
alloc->free_blocks[0].size = alloc->size - align_offset;
alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
}

struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment) {
struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
struct lm_ggml_backend_buffer * buffer = lm_ggml_backend_cpu_buffer_from_ptr(NULL, data, size);

struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr));

*alloc = (struct lm_ggml_allocr){
/*.data = */ data,
/*.size = */ size,
/*.buffer = */ buffer,
/*.buffer_owned = */ true,
/*.base = */ lm_ggml_backend_buffer_get_base(buffer),
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alig
return alloc;
}

// OS specific functions to allocate and free uncommitted virtual memory
static void * alloc_vmem(size_t size) {
#if defined(_WIN32)
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) {
return NULL;
}
return ptr;
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}

static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}

// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 128GB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);

LM_GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}

static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}

struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment) {
struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
struct lm_ggml_allocr * alloc = lm_ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
alloc->measure = true;

void * base_addr;
size_t size;
return alloc;
}

alloc_measure_vmem(&base_addr, &size);
struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) {
struct lm_ggml_allocr * alloc = (struct lm_ggml_allocr *)malloc(sizeof(struct lm_ggml_allocr));

*alloc = (struct lm_ggml_allocr){
/*.data = */ base_addr,
/*.size = */ size,
/*.alignment = */ alignment,
/*.buffer = */ buffer,
/*.buffer_owned = */ false,
/*.base = */ lm_ggml_backend_buffer_get_base(buffer),
/*.alignment = */ lm_ggml_backend_buffer_get_alignment(buffer),
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ true,
/*.measure = */ false,
/*.parse_seq = */ {0},
/*.parse_seq_len = */ 0,
#ifdef LM_GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment) {
}

void lm_ggml_allocr_free(struct lm_ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
if (alloc->buffer_owned) {
lm_ggml_backend_buffer_free(alloc->buffer);
}
free(alloc);
}
@@ -437,20 +371,30 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) {
case LM_GGML_OP_ROPE:
case LM_GGML_OP_RMS_NORM:
case LM_GGML_OP_SOFT_MAX:
case LM_GGML_OP_CONT:
return true;

default:
return false;
}
}

static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view) {
assert(view->view_src != NULL && view->view_src->data != NULL);
view->backend = view->view_src->backend;
view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the lm_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(lm_ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
lm_ggml_backend_buffer_init_tensor(alloc->buffer, view);
}

static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (lm_ggml_is_view(node)) {
assert(node->view_src->data != NULL);
node->data = (char *)node->view_src->data + node->view_offs;
init_view(alloc, node);
} else {
// see if we can reuse a parent's buffer (inplace)
if (lm_ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
node->view_src = view_src;
view_src_hn->n_views += 1;
init_view(alloc, node);
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
node->view_src = parent;
p_hn->n_views += 1;
init_view(alloc, node);
return;
}
}
@@ -495,7 +443,7 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
}
}

static size_t lm_ggml_allocr_alloc_graph_tensors_n(
size_t lm_ggml_allocr_alloc_graph_n(
struct lm_ggml_allocr * alloc,
struct lm_ggml_cgraph ** graphs, int n_graphs,
struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n(
if (lm_ggml_is_view(node)) {
struct lm_ggml_tensor * view_src = node->view_src;
hash_get(ht, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet
init_view(alloc, node);
}
}

for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n(
break;
}
hash_get(ht, parent)->n_children += 1;
if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
init_view(alloc, parent);
}
}
}
}
@@ -631,7 +586,7 @@ static size_t lm_ggml_allocr_alloc_graph_tensors_n(
}

size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph) {
return lm_ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
return lm_ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
}

size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc) {
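
Note: in this sync the allocator no longer tracks a raw data/size pair or reserves uncommitted virtual memory for measurement; it wraps a lm_ggml_backend_buffer (owned when created via lm_ggml_allocr_new / lm_ggml_allocr_new_measure, borrowed when created via lm_ggml_allocr_new_from_buffer) and delegates size queries and tensor init/free to that buffer. The caller-side measure-then-allocate flow is unchanged; a hedged sketch, where build_graph() and tensor_alignment are placeholders for the host project's graph construction, not part of this diff:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-alloc.h"

    // placeholder: builds the compute graph in a no_alloc context (host project code)
    struct lm_ggml_cgraph * build_graph(struct lm_ggml_context * ctx);

    void plan_and_allocate(struct lm_ggml_context * ctx, size_t tensor_alignment) {
        // measure pass: only offsets/sizes are computed; the measure allocator is now
        // backed by a dummy CPU buffer instead of reserved virtual memory
        struct lm_ggml_allocr * measure = lm_ggml_allocr_new_measure(tensor_alignment);
        size_t mem_size = lm_ggml_allocr_alloc_graph(measure, build_graph(ctx)) + tensor_alignment;
        lm_ggml_allocr_free(measure);

        // real pass: rebuild the graph (measured tensors carry fake data pointers) and
        // back the allocator with caller-owned memory; lm_ggml_allocr_new wraps that
        // memory in a CPU backend buffer which the allocator owns and frees
        std::vector<uint8_t> buf(mem_size);
        struct lm_ggml_allocr * alloc = lm_ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
        lm_ggml_allocr_alloc_graph(alloc, build_graph(ctx));
        lm_ggml_allocr_free(alloc); // frees the wrapper buffer, not buf
    }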
16 changes: 11 additions & 5 deletions cpp/ggml-alloc.h
@@ -6,21 +6,27 @@
extern "C" {
#endif

struct lm_ggml_backend_buffer;

LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new(void * data, size_t size, size_t alignment);
LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_measure(size_t alignment);
LM_GGML_API struct lm_ggml_allocr * lm_ggml_allocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer);

// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph are optimized to execute out-of-order
LM_GGML_API void lm_ggml_allocr_set_parse_seq(struct lm_ggml_allocr * alloc, const int * list, int n);

LM_GGML_API void lm_ggml_allocr_free(struct lm_ggml_allocr * alloc);
LM_GGML_API bool lm_ggml_allocr_is_measure(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_reset(struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_alloc(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor);
LM_GGML_API void lm_ggml_allocr_free (struct lm_ggml_allocr * alloc);
LM_GGML_API bool lm_ggml_allocr_is_measure (struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_reset (struct lm_ggml_allocr * alloc);
LM_GGML_API void lm_ggml_allocr_alloc (struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * tensor);
LM_GGML_API size_t lm_ggml_allocr_alloc_graph(struct lm_ggml_allocr * alloc, struct lm_ggml_cgraph * graph);
LM_GGML_API size_t lm_ggml_allocr_max_size(struct lm_ggml_allocr * alloc);
LM_GGML_API size_t lm_ggml_allocr_max_size (struct lm_ggml_allocr * alloc);

LM_GGML_API size_t lm_ggml_allocr_alloc_graph_n(
struct lm_ggml_allocr * alloc,
struct lm_ggml_cgraph ** graphs, int n_graphs,
struct lm_ggml_tensor *** inputs, struct lm_ggml_tensor *** outputs);

#ifdef __cplusplus
}
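
Note: the header now also exposes lm_ggml_allocr_new_from_buffer, which wraps an existing backend buffer without taking ownership (buffer_owned == false), and exports the formerly static multi-graph path as lm_ggml_allocr_alloc_graph_n. A hedged sketch of the non-owning path, using only calls that appear in this diff; the fixed buffer size and the externally built graph gf are placeholders:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    void allocate_from_external_buffer(struct lm_ggml_cgraph * gf) {
        // caller-owned storage; in real code this would be sized by a measure pass
        std::vector<uint8_t> mem(16u * 1024 * 1024);

        // wrap the caller's memory in a CPU backend buffer, then hand it to the allocator
        struct lm_ggml_backend_buffer * buffer =
            lm_ggml_backend_cpu_buffer_from_ptr(NULL, mem.data(), mem.size());
        struct lm_ggml_allocr * alloc = lm_ggml_allocr_new_from_buffer(buffer);

        lm_ggml_allocr_alloc_graph(alloc, gf);

        // the allocator does not own the buffer here, so release both separately
        lm_ggml_allocr_free(alloc);
        lm_ggml_backend_buffer_free(buffer);
    }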