Commit

feat: Update llama.cpp
abetlen committed Oct 22, 2024
1 parent 7c4aead commit 7403e00
Showing 3 changed files with 139 additions and 33 deletions.
18 changes: 17 additions & 1 deletion CMakeLists.txt
@@ -55,6 +55,9 @@ if (LLAMA_BUILD)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
set(CMAKE_SKIP_RPATH FALSE)

# Enable building of the common library
set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)

# Building llama
if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
# Need to disable these llama.cpp flags on Apple x86_64,
@@ -106,7 +109,7 @@ if (LLAMA_BUILD)
# Building llava
add_subdirectory(vendor/llama.cpp/examples/llava)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
# Set CUDA_ARCHITECTURES to OFF on windows
# Set CUDA_ARCHITECTURES to OFF on Windows
if (WIN32)
set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
endif()
@@ -121,5 +124,18 @@ if (LLAMA_BUILD)
DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
)
endif()

# Fix for llava build: Add include directory for llama.h
# Move these commands after the add_subdirectory call
target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)

if (BUILD_SHARED_LIBS)
target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
endif()

target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
endif()
endif()
152 changes: 121 additions & 31 deletions llama_cpp/llama_cpp.py
@@ -464,6 +464,8 @@ class llama_token_data(ctypes.Structure):


# typedef struct llama_token_data_array {
# // TODO: consider SoA
# // NOTE: this pointer can be modified by the samplers
# llama_token_data * data;
# size_t size;
# int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -507,8 +509,11 @@ class llama_token_data_array(ctypes.Structure):
# // - token : the token ids of the input (used when embd is NULL)
# // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
# // - pos : the positions of the respective token in the sequence
# // (if set to NULL, the token position will be tracked automatically by llama_decode)
# // - seq_id : the sequence to which the respective token belongs
# // (if set to NULL, the sequence ID will be assumed to be 0)
# // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
# // (if set to NULL, only the logits for last token will be returned)
# //
# typedef struct llama_batch {
# int32_t n_tokens;
Expand All @@ -519,16 +524,6 @@ class llama_token_data_array(ctypes.Structure):
# int32_t * n_seq_id;
# llama_seq_id ** seq_id;
# int8_t * logits; // TODO: rename this to "output"


# // NOTE: helpers for smooth API transition - can be deprecated in the future
# // for future-proof code, use the above fields instead and ignore everything below
# //
# // pos[i] = all_pos_0 + i*all_pos_1
# //
# llama_pos all_pos_0; // used if pos == NULL
# llama_pos all_pos_1; // used if pos == NULL
# llama_seq_id all_seq_id; // used if seq_id == NULL
# } llama_batch;
class llama_batch(ctypes.Structure):
"""Input data for llama_decode
@@ -563,9 +558,6 @@ class llama_batch(ctypes.Structure):
("n_seq_id", ctypes.POINTER(ctypes.c_int32)),
("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))),
("logits", ctypes.POINTER(ctypes.c_int8)),
("all_pos_0", llama_pos),
("all_pos_1", llama_pos),
("all_seq_id", llama_seq_id),
]


@@ -1170,6 +1162,12 @@ def llama_supports_gpu_offload() -> bool:
...


# LLAMA_API bool llama_supports_rpc (void);
@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
def llama_supports_rpc() -> bool:
...


# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ctx(ctx: llama_context_p, /) -> int:
@@ -2255,30 +2253,26 @@ def llama_state_seq_load_file(
# //


# // Return batch for single sequence of tokens starting at pos_0
# // Return batch for single sequence of tokens
# // The sequence ID will be fixed to 0
# // The position of the tokens will be tracked automatically by llama_decode
# //
# // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
# //
# LLAMA_API struct llama_batch llama_batch_get_one(
# llama_token * tokens,
# int32_t n_tokens,
# llama_pos pos_0,
# llama_seq_id seq_id);
# int32_t n_tokens);
@ctypes_function(
"llama_batch_get_one",
[
llama_token_p,
ctypes.c_int,
llama_pos,
llama_seq_id,
ctypes.c_int32,
],
llama_batch,
)
def llama_batch_get_one(
tokens: CtypesArray[llama_token],
n_tokens: Union[ctypes.c_int, int],
pos_0: Union[llama_pos, int],
seq_id: llama_seq_id,
/,
) -> llama_batch:
"""Return batch for single sequence of tokens starting at pos_0
@@ -2616,6 +2610,13 @@ def llama_token_eos(model: llama_model_p, /) -> int:
...


# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
def llama_token_eot(model: llama_model_p, /) -> int:
"""end-of-turn"""
...


# LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
@ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token)
def llama_token_cls(model: llama_model_p, /) -> int:
@@ -2650,30 +2651,54 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool:


# // Codellama infill tokens
# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
@ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
def llama_token_prefix(model: llama_model_p) -> int:
"""codellama infill tokens"""
...


# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
def llama_token_middle(model: llama_model_p, /) -> int:
...


# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
def llama_token_suffix(model: llama_model_p, /) -> int:
...


# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
def llama_token_eot(model: llama_model_p, /) -> int:
# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token)
def llama_token_fim_pre(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token)
def llama_token_fim_suf(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token)
def llama_token_fim_mid(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token)
def llama_token_fim_pad(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token)
def llama_token_fim_rep(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token)
def llama_token_fim_sep(model: llama_model_p, /) -> int:
...
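
Taken together, the new llama_token_fim_* getters replace the deprecated prefix/middle/suffix helpers. A hedged sketch of assembling a fill-in-the-middle prompt from them; model is an already-loaded llama_model handle, and the exact token layout is model-dependent and shown only for illustration:

import llama_cpp

def build_fim_tokens(model, prefix_ids, suffix_ids):
    # Common FIM layout: <fim_pre> prefix <fim_suf> suffix <fim_mid>
    # (illustrative only; consult the model's chat/infill template)
    return (
        [llama_cpp.llama_token_fim_pre(model)]
        + list(prefix_ids)
        + [llama_cpp.llama_token_fim_suf(model)]
        + list(suffix_ids)
        + [llama_cpp.llama_token_fim_mid(model)]
    )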

# //
# // Tokenization
@@ -2786,6 +2811,23 @@ def llama_token_to_piece(
...


# # // check if token0 is contained as a prefix in token1
# # LLAMA_API bool llama_token_is_prefix(
# # const struct llama_model * model,
# # llama_token token0,
# # llama_token token1);
# @ctypes_function(
# "llama_token_is_prefix",
# [llama_model_p_ctypes, llama_token, llama_token],
# ctypes.c_bool,
# )
# def llama_token_is_prefix(
# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], /
# ) -> bool:
# """Check if token0 is contained as a prefix in token1"""
# ...


# /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
# /// @param text The char pointer must be large enough to hold the resulting text.
# /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -3099,20 +3141,22 @@ def llama_sampler_chain_remove(

# // available samplers:
#
# LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
@ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
def llama_sampler_init_greedy() -> llama_sampler_p:
...


# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
@ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
...


# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
def llama_sampler_init_softmax() -> llama_sampler_p:
...
@@ -3188,6 +3232,19 @@ def llama_sampler_init_temp_ext(
...


# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
@ctypes_function(
"llama_sampler_init_xtc",
[ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32],
llama_sampler_p_ctypes,
)
def llama_sampler_init_xtc(
p: float, t: float, min_keep: int, seed: int, /
) -> llama_sampler_p:
...
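
As a usage note, a hedged sketch of placing the new XTC sampler in a sampler chain; the parameter values are illustrative, not recommendations, and the chain helpers are assumed to be available from the same module:

import llama_cpp

chain = llama_cpp.llama_sampler_chain_init(
    llama_cpp.llama_sampler_chain_default_params()
)
# Illustrative values: apply XTC with probability 0.5, threshold 0.1,
# keep at least 1 candidate, fixed seed for reproducibility.
llama_cpp.llama_sampler_chain_add(
    chain, llama_cpp.llama_sampler_init_xtc(0.5, 0.1, 1, 1234)
)
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))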


# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3301,6 +3358,39 @@ def llama_sampler_init_logit_bias(
...


# // this sampler is meant to be used for fill-in-the-middle infilling
# // it's supposed to be used after top_k + top_p sampling
# //
# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
# // 2. combine probs of tokens that have the same prefix
# //
# // example:
# //
# // - before:
# // "hel": 0.5
# // "hell": 0.2
# // "hello": 0.1
# // "dummy": 0.1
# //
# // - after:
# // "hel": 0.8
# // "dummy": 0.1
# //
# // 3. discard non-EOG tokens with low prob
# // 4. if no tokens are left -> pick EOT
# //
# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
@ctypes_function(
"llama_sampler_init_infill",
[llama_model_p_ctypes],
llama_sampler_p_ctypes,
)
def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p:
"""This sampler is meant to be used for fill-in-the-middle infilling.
"""
...
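
Following the comment above (run top-k and top-p first, then infill), a hedged sketch of a sampler chain that uses it; model and ctx are assumed to be existing llama_model and llama_context handles:

import llama_cpp

chain = llama_cpp.llama_sampler_chain_init(
    llama_cpp.llama_sampler_chain_default_params()
)
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.95, 1))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_infill(model))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# Sample the next token from the logits of the last evaluated position.
tok = llama_cpp.llama_sampler_sample(chain, ctx, -1)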


# // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
# LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
@ctypes_function(
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 168 files
