diff --git a/cpp/common.cpp b/cpp/common.cpp index df53149d..9efc83e8 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -1330,6 +1330,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "--no-warmup") { + params.warmup = false; + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1452,6 +1456,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); options.push_back({ "server infill", " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); @@ -1635,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); diff --git a/cpp/ggml-aarch64.c b/cpp/ggml-aarch64.c index cffe3576..b6b05a44 100644 --- a/cpp/ggml-aarch64.c +++ b/cpp/ggml-aarch64.c @@ -384,8 +384,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + if (lm_ggml_sve_cnt_b == QK8_0) { + LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -496,8 +496,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + if (lm_ggml_sve_cnt_b == QK8_0) { + LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -614,7 +614,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (lm_ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -680,12 +680,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void return; } else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) { - LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (lm_ggml_cpu_has_neon()) { - LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) && + LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } @@ -745,8 +745,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + if (lm_ggml_sve_cnt_b == QK8_0) { + LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1266,8 +1266,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + if (lm_ggml_sve_cnt_b == QK8_0) { + LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1728,7 +1728,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (lm_ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2139,12 +2139,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void return; } else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) { - LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) && + LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (lm_ggml_cpu_has_neon()) { - LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) && + LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } diff --git a/cpp/ggml-common.h b/cpp/ggml-common.h index 87065fc5..617e25a5 100644 --- a/cpp/ggml-common.h +++ b/cpp/ggml-common.h @@ -19,7 +19,11 @@ typedef half2 lm_ggml_half2; #define LM_GGML_COMMON_DECL #elif defined(LM_GGML_COMMON_DECL_CUDA) +#if defined(LM_GGML_COMMON_DECL_MUSA) +#include +#else #include +#endif #include typedef half lm_ggml_half; @@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + #define LM_GGML_TABLE_END() }; #define LM_GGML_COMMON_IMPL -#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP) +#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP) || defined(LM_GGML_COMMON_IMPL_MUSA) #include #define LM_GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = { diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index bdd736c4..7a58aec0 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) { /** * Converts float32 to brain16. * - * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. - * Subnormals shall be flushed to zero, and NANs will be quiet. + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. * This code should vectorize nicely if using modern compilers. */ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) { @@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) { h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } - if (!(u.i & 0x7f800000)) { /* subnormal */ - h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ - return h; - } h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; return h; } @@ -146,6 +143,7 @@ extern "C" { #if defined(__ARM_FEATURE_SVE) #include +#include #endif // 16-bit float diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index 9377cbe2..96eec350 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (lm_ggml_sve_cnt_b == QK8_0) { const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); @@ -4190,15 +4190,18 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8; - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += sumi*LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d); } @@ -4474,15 +4477,18 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F); const int v1 = (x[ib].qs[j] >> 4); - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s); } @@ -4823,18 +4829,21 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi; } @@ -5194,7 +5203,8 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; @@ -5203,9 +5213,11 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s); } @@ -5291,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (lm_ggml_sve_cnt_b == QK8_0) { svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); @@ -6437,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void // compute mask for subtraction vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); m <<= 1; vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); m <<= 1; vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); m <<= 1; vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); m <<= 1; // load Q8 and take product with Q3 @@ -7708,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl); m <<= 1; vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl); m <<= 1; vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); diff --git a/cpp/ggml-quants.h b/cpp/ggml-quants.h index d3c3461a..2ac316a9 100644 --- a/cpp/ggml-quants.h +++ b/cpp/ggml-quants.h @@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +#if defined(__ARM_FEATURE_SVE) +extern int lm_ggml_sve_cnt_b; +#endif + #ifdef __cplusplus } #endif diff --git a/cpp/ggml.c b/cpp/ggml.c index b600701d..84740ab1 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -37,6 +37,9 @@ #include #endif +#if defined(__ARM_FEATURE_SVE) +int lm_ggml_sve_cnt_b = 0; +#endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef LM_GGML_USE_LLAMAFILE #endif @@ -141,7 +144,51 @@ typedef pthread_t lm_ggml_thread_t; #include -#if defined(__linux__) +#if defined(__ANDROID__) +#include +#include +#include + +struct backtrace_state { + void ** current; + void ** end; +}; + +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { + struct backtrace_state * state = (struct backtrace_state *)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = (void*)pc; + } + } + return _URC_NO_REASON; +} + +static void lm_ggml_print_backtrace_symbols(void) { + const int max = 100; + void* buffer[max]; + + struct backtrace_state state = {buffer, buffer + max}; + _Unwind_Backtrace(unwind_callback, &state); + + int count = state.current - buffer; + + for (int idx = 0; idx < count; ++idx) { + const void * addr = buffer[idx]; + const char * symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); + } +} +#elif defined(__linux__) && defined(__GLIBC__) #include static void lm_ggml_print_backtrace_symbols(void) { // void * trace[100]; @@ -436,9 +483,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) { } } +void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) { + for (int i = 0; i < n; i++) { + y[i] = lm_ggml_compute_fp32_to_bf16(x[i]); + } +} + void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) { int i = 0; #if defined(__AVX512BF16__) + // subnormals are flushed to zero on this platform for (; i + 32 <= n; i += 32) { _mm512_storeu_si512( (__m512i *)(y + i), @@ -918,7 +972,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row, .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row, - .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row, + .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref, .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16, .vec_dot_type = LM_GGML_TYPE_BF16, .nrows = 1, @@ -2258,7 +2312,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } @@ -3507,6 +3561,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) { LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer); +#if defined(__ARM_FEATURE_SVE) + if (!lm_ggml_sve_cnt_b) { + lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); + } +#endif + LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__); lm_ggml_critical_section_end(); @@ -14746,7 +14806,7 @@ static void lm_ggml_compute_forward_pool_1d_sk_p0( const struct lm_ggml_tensor * src = dst->src[0]; - assert(src->type == LM_GGML_TYPE_F32); + assert(src->type == LM_GGML_TYPE_F32 || src->type == LM_GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14759,10 +14819,8 @@ static void lm_ggml_compute_forward_pool_1d_sk_p0( const int64_t rs = dst->ne[0]; while (cdata < data_end) { - const float * const srow = (const float *)cdata; - + const void * srow = (const void *)cdata; int j = 0; - for (int64_t i = 0; i < rs; ++i) { switch (op) { case LM_GGML_OP_POOL_AVG: drow[i] = 0; break; @@ -14770,10 +14828,11 @@ static void lm_ggml_compute_forward_pool_1d_sk_p0( case LM_GGML_OP_POOL_COUNT: LM_GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { + const float srow_j = (src->type == LM_GGML_TYPE_F32) ? ((const float*)srow)[j] : LM_GGML_FP16_TO_FP32(((const lm_ggml_fp16_t*)srow)[j]); switch (op) { - case LM_GGML_OP_POOL_AVG: drow[i] += srow[j]; break; - case LM_GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ABORT("fatal error"); + case LM_GGML_OP_POOL_AVG: drow[i] += srow_j; break; + case LM_GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ABORT("fatal error"); } ++j; } @@ -14814,7 +14873,7 @@ static void lm_ggml_compute_forward_pool_2d( const struct lm_ggml_tensor * src = dst->src[0]; - LM_GGML_ASSERT(src->type == LM_GGML_TYPE_F32); + assert(src->type == LM_GGML_TYPE_F32 || src->type == LM_GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14857,14 +14916,15 @@ static void lm_ggml_compute_forward_pool_2d( for (int ky = 0; ky < k1; ++ky) { if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; + const float srow_j = (src->type == LM_GGML_TYPE_F32) ? ((const float*)srow)[j] : LM_GGML_FP16_TO_FP32(((const lm_ggml_fp16_t*)srow)[j]); switch (op) { - case LM_GGML_OP_POOL_AVG: *out += srow[j]; break; - case LM_GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case LM_GGML_OP_POOL_COUNT: LM_GGML_ABORT("fatal error"); + case LM_GGML_OP_POOL_AVG: *out += srow_j; break; + case LM_GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case LM_GGML_OP_POOL_COUNT: LM_GGML_ABORT("fatal error"); } } } @@ -18078,7 +18138,6 @@ static void lm_ggml_build_forward_impl(struct lm_ggml_cgraph * cgraph, struct lm } const int n0 = cgraph->n_nodes; - UNUSED(n0); lm_ggml_visit_parents(cgraph, tensor); @@ -20607,7 +20666,7 @@ size_t lm_ggml_quantize_chunk( case LM_GGML_TYPE_BF16: { size_t elemsize = sizeof(lm_ggml_bf16_t); - lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n); + lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n); result = n * elemsize; } break; case LM_GGML_TYPE_F32: diff --git a/cpp/ggml.h b/cpp/ggml.h index 313af008..e4733134 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -349,6 +349,7 @@ extern "C" { LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float); LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16 LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t); + LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t); LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t); struct lm_ggml_object; @@ -1455,7 +1456,6 @@ extern "C" { // if mode & 2 == 1, GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions - // c is freq factors (e.g. phi3-128k), (optional) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1472,6 +1472,7 @@ extern "C" { int mode); // custom RoPE + // c is freq factors (e.g. phi3-128k), (optional) LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, diff --git a/cpp/llama-vocab.cpp b/cpp/llama-vocab.cpp index 791af186..1a0e1a09 100644 --- a/cpp/llama-vocab.cpp +++ b/cpp/llama-vocab.cpp @@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { return token != -1 && ( token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) + token == llama_token_eot_impl(vocab) || + token == llama_token_eom_impl(vocab) ); } @@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { return vocab.special_eot_id; } +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + int32_t llama_tokenize_impl( const struct llama_vocab & vocab, const char * text, diff --git a/cpp/llama-vocab.h b/cpp/llama-vocab.h index 30b565d5..7adfc16d 100644 --- a/cpp/llama-vocab.h +++ b/cpp/llama-vocab.h @@ -45,6 +45,7 @@ struct llama_vocab { id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token + id special_eom_id = -1; // tokenizer flags bool tokenizer_add_space_prefix = false; @@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); llama_token llama_token_eot_impl (const struct llama_vocab & vocab); +llama_token llama_token_eom_impl (const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, diff --git a/cpp/llama.cpp b/cpp/llama.cpp index d95f48c5..1d99f941 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -101,7 +101,6 @@ #endif // bump if necessary -#define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -134,17 +133,14 @@ static std::string trim(const std::string & str) { } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); } - s = std::move(result); } static bool is_float_close(float a, float b, float abs_tol) { @@ -374,6 +370,7 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -471,6 +468,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -2463,6 +2461,7 @@ struct llama_layer { // long rope factors struct lm_ggml_tensor * rope_long = nullptr; struct lm_ggml_tensor * rope_short = nullptr; + struct lm_ggml_tensor * rope_freqs = nullptr; // bitnet scale struct lm_ggml_tensor * wq_scale; @@ -2944,7 +2943,7 @@ static bool llama_kv_cache_init( // TODO: find a nicer way to add other recurrent model architectures cache.recurrent = model.arch == LLM_ARCH_MAMBA; - cache.v_trans = !cparams.flash_attn; + cache.v_trans = !cache.recurrent && !cparams.flash_attn; cache.head = 0; cache.size = kv_size; @@ -3578,6 +3577,15 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map; +// TODO: update when needed or think of some clever automatic way to do this +static size_t llama_model_max_nodes(const llama_model & /*model*/) { + //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B + // return 32768; + //} + + return 8192; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -4971,6 +4979,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -5587,6 +5596,7 @@ static void llm_load_vocab( { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, + { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, }; for (const auto & it : special_token_types) { @@ -5639,6 +5649,17 @@ static void llm_load_vocab( } } } + + // find EOM token: "<|eom_id|>" + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID + // for now, we apply this workaround to find the EOM token based on its text + if (vocab.special_eom_id == -1) { + const auto & t = vocab.token_to_id.find("<|eom_id|>"); + if (t != vocab.token_to_id.end()) { + vocab.special_eom_id = t->second; + } + } } // build special tokens cache @@ -6062,6 +6083,8 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + if (n_expert == 0) { layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -8407,7 +8430,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_k_shift() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); LM_GGML_ASSERT(kv_self.size == n_ctx); @@ -8438,7 +8461,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_s_copy() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); LM_GGML_ASSERT(kv_self.recurrent); @@ -8461,7 +8484,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_defrag(const std::vector & ids) { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); for (uint32_t i = 0; i < ids.size(); ++i) { const uint32_t id = ids[i]; @@ -8539,6 +8562,10 @@ struct llm_build_context { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { return model.layers[il].rope_long; } @@ -8702,7 +8729,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_llama() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -8733,6 +8760,9 @@ struct llm_build_context { // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct lm_ggml_tensor * rope_factors = build_rope_factors(il); + // compute Q and K and RoPE them struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -8756,14 +8786,14 @@ struct llm_build_context { } Qcur = lm_ggml_rope_ext( - ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = lm_ggml_rope_ext( - ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8845,7 +8875,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_baichuan() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8960,7 +8990,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_xverse() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9063,7 +9093,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_falcon() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9183,7 +9213,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_grok() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9340,7 +9370,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_dbrx() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9466,7 +9496,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_starcoder() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9570,7 +9600,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_refact() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9664,7 +9694,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_bert() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9858,7 +9888,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_bloom() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9959,7 +9989,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_mpt() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10249,7 +10279,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_qwen() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10361,7 +10391,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_qwen2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10473,7 +10503,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_qwen2moe() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -10619,7 +10649,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_phi2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10740,7 +10770,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_phi3() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10972,7 +11002,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_gpt2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11077,7 +11107,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_codeshell() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11188,7 +11218,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_orion() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11306,7 +11336,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_internlm2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11427,7 +11457,7 @@ struct llm_build_context { // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 // based on the original build_llama() function struct lm_ggml_cgraph * build_minicpm() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11571,7 +11601,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_gemma() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11679,7 +11709,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_gemma2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11729,6 +11759,7 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; default: LM_GGML_ABORT("fatal error"); @@ -11814,7 +11845,7 @@ struct llm_build_context { struct lm_ggml_cgraph * build_starcoder2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11933,7 +11964,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_mamba() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t d_model = n_embd; const int64_t d_conv = hparams.ssm_d_conv; @@ -12082,7 +12113,7 @@ struct llm_build_context { struct lm_ggml_cgraph * build_command_r() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12236,7 +12267,7 @@ struct llm_build_context { // * removed bias // * removed MoE struct lm_ggml_cgraph * build_olmo() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12360,7 +12391,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_openelm() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12485,7 +12516,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_gptneox() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12627,7 +12658,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_arctic() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12759,7 +12790,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_deepseek2() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12987,7 +13018,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_bitnet() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13127,7 +13158,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_t5() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -13444,7 +13475,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_jais() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -13536,7 +13567,7 @@ struct llm_build_context { } struct lm_ggml_cgraph * build_chatglm() { - struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -14881,9 +14912,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // each move requires 6*n_layer tensors (see build_defrag) // - source view, destination view, copy operation // - x2 for keys and values - //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -16773,8 +16804,10 @@ struct llama_context * llama_new_context_with_model( } } + const size_t max_nodes = llama_model_max_nodes(*model); + // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); + ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*max_nodes + lm_ggml_graph_overhead_custom(max_nodes, false)); // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = @@ -16787,7 +16820,7 @@ struct llama_context * llama_new_context_with_model( // currently this is only implemented in the CUDA backend pipeline_parallel = false; #endif - ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); + ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, lm_ggml_backend_sched_get_n_copies(ctx->sched)); @@ -17294,18 +17327,18 @@ void llama_kv_cache_update(struct llama_context * ctx) { } // deprecated -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t llama_get_state_size(struct llama_context * ctx) { return llama_state_get_size(ctx); } // deprecated size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - return llama_state_get_data(ctx, dst); + return llama_state_get_data(ctx, dst, -1); } // deprecated size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - return llama_state_set_data(ctx, src); + return llama_state_set_data(ctx, src, -1); } // deprecated @@ -17318,302 +17351,284 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi return llama_state_save_file(ctx, path_session, tokens, n_token_count); } -// Returns the *maximum* size of the state -size_t llama_state_get_size(const struct llama_context * ctx) { - const auto & cparams = ctx->cparams; - const auto & hparams = ctx->model.hparams; - - // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. - // for reference, std::mt19937(1337) serializes to 6701 bytes. - const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_n_outputs = sizeof(size_t); - // assume worst case for outputs although only currently set ones are serialized - const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t); - const size_t s_logits_size = sizeof(size_t); - const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0; - const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0; - const size_t s_kv_buf_size = sizeof(size_t); - const size_t s_kv_head = sizeof(uint32_t); - const size_t s_kv_size = sizeof(uint32_t); - const size_t s_kv_used = sizeof(uint32_t); - const size_t s_v_trans = sizeof(uint32_t); - const size_t s_kv = ctx->kv_self.total_size(); - const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id); - const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; - - const size_t s_total = ( - + s_rng_size - + s_rng - + s_n_outputs - + s_output_pos - + s_logits_size - + s_logits - + s_embedding_size - + s_embedding - + s_kv_buf_size - + s_kv_head - + s_kv_size - + s_kv_used - + s_v_trans - + s_kv - + s_kv_cells - ); - - // on session change it is very likely that the state size has changed - so we need to update this function - static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?"); - - return s_total; -} - -// llama_context_data -struct llama_data_context { +// TODO: replace all non-fatal assertions with returned errors or exceptions +struct llama_data_write { virtual void write(const void * src, size_t size) = 0; virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; + virtual ~llama_data_write() = default; -struct llama_data_buffer_context : llama_data_context { - uint8_t * ptr; - size_t size_written = 0; + void write_string(const std::string & str) { + uint32_t str_size = str.size(); - llama_data_buffer_context(uint8_t * p) : ptr(p) {} - - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); } - size_t get_size_written() override { - return size_written; + void write_model_info(const struct llama_context * ctx) { + std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical } -}; -struct llama_data_file_context : llama_data_context { - llama_file * file; - size_t size_written = 0; + void write_rng(const std::mt19937 & rng) { + std::ostringstream rng_ss; + rng_ss << rng; - llama_data_file_context(llama_file * f) : file(f) {} + const std::string & rng_str = rng_ss.str(); - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; + write_string(rng_str); } - size_t get_size_written() override { - return size_written; - } -}; + void write_output_ids(const struct llama_context * ctx) { + const uint32_t n_outputs = ctx->n_outputs; -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_file_context data_ctx(&file); - * llama_state_get_data(ctx, &data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_buffer_context data_ctx(&buf.data()); - * llama_state_get_data(ctx, &data_ctx); - * -*/ -static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { - llama_synchronize(ctx); + std::vector output_pos; - // copy rng - { - std::ostringstream rng_ss; - rng_ss << ctx->sampling.rng; + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; - const std::string & rng_str = rng_ss.str(); - const size_t rng_size = rng_str.size(); + LM_GGML_ASSERT(n_outputs <= ctx->output_size); - LM_GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); + output_pos.resize(n_outputs); - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(rng_str.data(), rng_size); + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + LM_GGML_ASSERT((uint32_t) pos < n_outputs); + output_pos[pos] = i; + } + } + + write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + write(output_pos.data(), n_outputs * sizeof(int32_t)); + } } - // copy outputs - { - // Can't use ctx->n_outputs because it's not for the - // entire last batch when n_ubatch is smaller than n_batch - size_t n_outputs = 0; + void write_logits(const struct llama_context * ctx) { + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab); - // copy output ids - { - std::vector output_pos; + write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + write(ctx->logits, logits_size * sizeof(float)); + } + } + + void write_embeddings(const struct llama_context * ctx) { + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + + write(&embeddings_size, sizeof(embeddings_size)); + + if (embeddings_size) { + write(ctx->embd, embeddings_size * sizeof(float)); + } + } - const size_t n_batch = ctx->cparams.n_batch; - const auto & output_ids = ctx->output_ids; + void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { + + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = kv_self.cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - output_pos.resize(ctx->output_size); + write(&pos, sizeof(pos)); + write(&n_seq_id, sizeof(n_seq_id)); - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - if ((size_t) pos >= n_outputs) { - n_outputs = pos + 1; + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + write(&seq_id, sizeof(seq_id)); } - LM_GGML_ASSERT((size_t) pos < ctx->output_size); - output_pos[pos] = i; } } + } + } - data_ctx->write(&n_outputs, sizeof(n_outputs)); + void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + const struct llama_hparams & hparams = ctx->model.hparams; - if (n_outputs) { - data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } + const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; - // copy logits - { - const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab); + write(&v_trans, sizeof(v_trans)); + write(&n_layer, sizeof(n_layer)); - data_ctx->write(&logits_size, sizeof(logits_size)); + std::vector tmp_buf; - if (logits_size) { - data_ctx->write(ctx->logits, logits_size * sizeof(float)); - } - } + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - // copy embeddings - { - const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd); + // Write key type + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + write(&k_type_i, sizeof(k_type_i)); - data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + // Write row size of key + const uint64_t k_size_row = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + write(&k_size_row, sizeof(k_size_row)); - if (embeddings_size) { - data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * k_size_row); + lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); + write(tmp_buf.data(), tmp_buf.size()); } } - } - // copy kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - // NOTE: kv_size and kv_buf_size are mostly used for sanity checks - const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); - const uint32_t kv_size = kv_self.size; - const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head; - const uint32_t kv_used = kv_self.used; - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; - - data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); - data_ctx->write(&kv_head, sizeof(kv_head)); - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_used, sizeof(kv_used)); - data_ctx->write(&v_trans, sizeof(v_trans)); - - if (kv_buf_size) { - const size_t pre_kv_buf_size = data_ctx->get_size_written(); - - std::vector tmp_buf; - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - const size_t k_size = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); - - tmp_buf.resize(k_size); - lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + // Write row size of value + const uint64_t v_size_row = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + write(&v_size_row, sizeof(v_size_row)); - tmp_buf.resize(v_size); - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); - continue; + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * v_size_row); + lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); + write(tmp_buf.data(), tmp_buf.size()); } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = kv_self.size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - // v is not contiguous, copy row by row - const size_t v_row_size = lm_ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = lm_ggml_row_size(kv_self.v_l[il]->type, kv_size); + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - tmp_buf.resize(v_row_size); - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // Write element size + const uint32_t v_size_el = lm_ggml_type_size(kv_self.v_l[il]->type); + write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + tmp_buf.resize(range_size * v_size_el); + lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); + write(tmp_buf.data(), tmp_buf.size()); + } } } - LM_GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size); } + } - for (uint32_t i = 0; i < kv_head; ++i) { - const auto & cell = kv_self.cells[i]; - - const llama_pos pos = cell.pos; - const size_t seq_id_size = cell.seq_id.size(); - - data_ctx->write(&pos, sizeof(pos)); - data_ctx->write(&seq_id_size, sizeof(seq_id_size)); + void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; - for (auto seq_id : cell.seq_id) { - data_ctx->write(&seq_id, sizeof(seq_id)); + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = kv_self.size; + for (uint32_t i = 0; i < kv_self.size; ++i) { + const auto & cell = kv_self.cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == kv_self.size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != kv_self.size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = kv_self.size; + } } } - } -} + if (cell_range_begin != kv_self.size) { + cell_ranges.emplace_back(cell_range_begin, kv_self.size); + } -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) { - llama_data_buffer_context data_ctx(dst); - llama_state_get_data_internal(ctx, &data_ctx); + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + LM_GGML_ASSERT(cell_count == cell_count_check); - return data_ctx.get_size_written(); -} + write(&cell_count, sizeof(cell_count)); -// Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { - llama_synchronize(ctx); + write_kv_cache_meta(kv_self, cell_ranges, seq_id); + write_kv_cache_data(ctx, cell_ranges); + } +}; - const uint8_t * inp = src; +struct llama_data_read { + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + virtual size_t get_size_read() = 0; + virtual ~llama_data_read() = default; - // set rng - { - size_t rng_size; - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); + void read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); + + str.assign((const char *) read(str_size), str_size); + } - LM_GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); + // validate model information + void read_model_info(const struct llama_context * ctx) { + std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + std::string arch_str; + read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } - std::string rng_str((const char *)inp, rng_size); inp += rng_size; + void read_rng(std::mt19937 & rng) { + std::string rng_str; + read_string(rng_str); std::istringstream rng_ss(rng_str); - rng_ss >> ctx->sampling.rng; + rng_ss >> rng; - LM_GGML_ASSERT(!rng_ss.fail()); + if (rng_ss.fail()) { + throw std::runtime_error("failed to load RNG state"); + } } - // set output ids - { - size_t n_outputs; + void read_output_ids(struct llama_context * ctx) { std::vector output_pos; - memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs); + uint32_t n_outputs; + read_to(&n_outputs, sizeof(n_outputs)); - LM_GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs)); + if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } if (n_outputs) { output_pos.resize(n_outputs); - memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t)); - inp += n_outputs * sizeof(int32_t); + read_to(output_pos.data(), n_outputs * sizeof(int32_t)); for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { int32_t id = output_pos[i]; - LM_GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); + if ((uint32_t) id >= ctx->cparams.n_batch) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + } ctx->output_ids[id] = i; } @@ -17621,128 +17636,434 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { } } - // set logits - { - size_t logits_size; + void read_logits(struct llama_context * ctx) { + uint64_t logits_size; + read_to(&logits_size, sizeof(logits_size)); - memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - - LM_GGML_ASSERT(ctx->logits_size >= logits_size); + if (ctx->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } if (logits_size) { - memcpy(ctx->logits, inp, logits_size * sizeof(float)); - inp += logits_size * sizeof(float); + read_to(ctx->logits, logits_size * sizeof(float)); } } - // set embeddings - { - size_t embeddings_size; + void read_embeddings(struct llama_context * ctx) { + uint64_t embeddings_size; + read_to(&embeddings_size, sizeof(embeddings_size)); - memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - - LM_GGML_ASSERT(ctx->embd_size >= embeddings_size); + if (ctx->embd_size < embeddings_size) { + throw std::runtime_error("embeddings buffer too small"); + } if (embeddings_size) { - memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); - inp += embeddings_size * sizeof(float); + read_to(ctx->embd, embeddings_size * sizeof(float)); } } - // set kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; + bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { + struct llama_kv_cache & kv_self = ctx->kv_self; - const uint32_t n_layer = hparams.n_layer; + if (dest_seq_id != -1) { + // single sequence - size_t kv_buf_size; - uint32_t kv_head; - uint32_t kv_size; - uint32_t kv_used; - uint32_t v_trans; + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + + llama_batch batch = llama_batch_init(cell_count, 0, 1); + batch.n_tokens = cell_count; + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = dest_seq_id; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + llama_batch_free(batch); + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + LM_GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); + LM_GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); + LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + LM_GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); + LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + + // Cleanup + llama_batch_free(batch); + } else { + // whole KV cache restore + + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + llama_kv_cache_clear(kv_self); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = kv_self.cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + read_to(&seq_id, sizeof(seq_id)); + + if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + return false; + } - memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); - memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); - memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans); + cell.seq_id.insert(seq_id); + } + } + + kv_self.head = 0; + kv_self.used = cell_count; + } - LM_GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition + return true; + } - if (kv_self.size != kv_size) { - // the KV cache needs to be big enough to load all the KV cells from the saved state - LM_GGML_ASSERT(kv_self.size >= kv_head); + bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { + const struct llama_hparams & hparams = ctx->model.hparams; + struct llama_kv_cache & kv_self = ctx->kv_self; + uint32_t v_trans; + uint32_t n_layer; + read_to(&v_trans, sizeof(v_trans)); + read_to(&n_layer, sizeof(n_layer)); - LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n", - __func__, kv_head, kv_size, kv_self.size); + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + return false; + } + if (kv_self.v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; } - llama_kv_cache_clear(ctx); + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } - if (kv_buf_size) { - const size_t pre_kv_buf_size = inp - src; + // Read row size of key + uint64_t k_size_row_ref; + read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } - LM_GGML_ASSERT(kv_self.total_size() >= kv_buf_size); + if (cell_count) { + // Read and set the keys for the whole cell range + lm_ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); + } + } - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - const size_t k_size = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } - lm_ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); - inp += k_size; + // Read row size of value + uint64_t v_size_row_ref; + read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + if (cell_count) { + // Read and set the values for the whole cell range + lm_ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size); - inp += v_size; - continue; + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; } - // v is not contiguous, copy row by row - const size_t v_row_size = lm_ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = lm_ggml_row_size(kv_self.v_l[il]->type, kv_self.size); + // Read element size of value + uint32_t v_size_el_ref; + read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = lm_ggml_type_size(kv_self.v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); - inp += v_row_size; + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; + lm_ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } } } - LM_GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size); } + return true; + } - ctx->kv_self.head = kv_head; - ctx->kv_self.used = kv_used; + void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + uint32_t cell_count; + read_to(&cell_count, sizeof(cell_count)); - for (uint32_t i = 0; i < kv_head; ++i) { - llama_pos pos; - size_t seq_id_size; + bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); - memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos); - memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size); + if (!res) { + if (seq_id == -1) { + llama_kv_cache_clear(ctx); + } else { + llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } + } +}; - ctx->kv_self.cells[i].pos = pos; +struct llama_data_write_dummy : llama_data_write { + size_t size_written = 0; - llama_seq_id seq_id; + llama_data_write_dummy() {} - for (size_t j = 0; j < seq_id_size; ++j) { - memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id); - ctx->kv_self.cells[i].seq_id.insert(seq_id); - } + // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context + + void write(const void * /* src */, size_t size) override { + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_write_buffer : llama_data_write { + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; + + llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + void write(const void * src, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_read_buffer : llama_data_read { + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; + + llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; + } + + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); + } + + size_t get_size_read() override { + return size_read; + } +}; + +struct llama_data_write_file : llama_data_write { + llama_file * file; + size_t size_written = 0; + + llama_data_write_file(llama_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_read_file : llama_data_read { + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; + + llama_data_read_file(llama_file * f) : file(f) {} + + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; } - const size_t nread = inp - src; - const size_t max_size = llama_state_get_size(ctx); + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } + + size_t get_size_read() override { + return size_read; + } +}; + +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_write_file data_ctx(&file); + * llama_state_get_data_internal(ctx, data_ctx); + * + * buffer context: + * std::vector buf(max_size, 0); + * llama_data_write_buffer data_ctx(buf.data(), max_size); + * llama_state_get_data_internal(ctx, data_ctx); + * +*/ +static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) { + llama_synchronize(ctx); + + data_ctx.write_model_info(ctx); + + data_ctx.write_rng(ctx->sampling.rng); + + // copy outputs + data_ctx.write_output_ids(ctx); + data_ctx.write_logits(ctx); + data_ctx.write_embeddings(ctx); - LM_GGML_ASSERT(nread <= max_size); + data_ctx.write_kv_cache(ctx); - return nread; + return data_ctx.get_size_written(); +} + +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} + +// Returns the *actual* size of the state. +// Intended to be used when saving to state to a buffer. +size_t llama_state_get_size(struct llama_context * ctx) { + llama_data_write_dummy data_ctx; + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} + +static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) { + llama_synchronize(ctx); + + data_ctx.read_model_info(ctx); + + // set rng + data_ctx.read_rng(ctx->sampling.rng); + + // set outputs + data_ctx.read_output_ids(ctx); + data_ctx.read_logits(ctx); + data_ctx.read_embeddings(ctx); + + data_ctx.read_kv_cache(ctx); + + return data_ctx.get_size_read(); +} + +// Sets the state reading from the specified source address +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_set_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } } static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { @@ -17754,15 +18075,7 @@ static bool llama_state_load_file_internal(struct llama_context * ctx, const cha const uint32_t version = file.read_u32(); if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); - - if (session_hparams != ctx->model.hparams) { - LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__); + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); return false; } } @@ -17772,7 +18085,7 @@ static bool llama_state_load_file_internal(struct llama_context * ctx, const cha const uint32_t n_token_count = file.read_u32(); if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); return false; } @@ -17783,19 +18096,15 @@ static bool llama_state_load_file_internal(struct llama_context * ctx, const cha // restore the context state { const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_state_get_size(ctx); - if (n_state_size_cur > n_state_size_max) { - LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); + llama_data_read_file data_ctx(&file); + const size_t n_read = llama_state_set_data_internal(ctx, data_ctx); + + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); return false; } - - std::vector state_data(n_state_size_max); - file.read_raw(state_data.data(), n_state_size_cur); - - llama_state_set_data(ctx, state_data.data()); } - return true; } @@ -17803,7 +18112,7 @@ bool llama_state_load_file(struct llama_context * ctx, const char * path_session try { return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what()); return false; } } @@ -17814,15 +18123,13 @@ static bool llama_state_save_file_internal(struct llama_context * ctx, const cha file.write_u32(LLAMA_SESSION_MAGIC); file.write_u32(LLAMA_SESSION_VERSION); - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - // save the prompt file.write_u32((uint32_t) n_token_count); file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_file_context data_ctx(&file); - llama_state_get_data_internal(ctx, &data_ctx); + llama_data_write_file data_ctx(&file); + llama_state_get_data_internal(ctx, data_ctx); return true; } @@ -17831,401 +18138,50 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session try { return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error saving session file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what()); return false; } } -size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) { - // save the size of size_t as a uint32_t for safety check - const size_t size_t_size_size = sizeof(uint32_t); - - // other values - const size_t s_cell_count_size = sizeof(uint32_t); - const size_t s_layer_count_size = sizeof(uint32_t); - const size_t n_embd_v_gqa_size = sizeof(uint32_t); - - size_t s_cell_count = 0; - size_t s_cell_data_size = 0; - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; - if (cell.seq_id.count(seq_id) > 0) { - ++s_cell_count; - s_cell_data_size += sizeof(llama_pos); - } - } - - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // types of keys and values - s_cell_data_size += sizeof(int32_t) * 2; - // k_size_row and v_size_el values of layer - s_cell_data_size += sizeof(size_t) * 2; - - // keys - const size_t k_size_row = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - s_cell_data_size += k_size_row * s_cell_count; - - // values (transposed) - const size_t v_size_el = lm_ggml_type_size(kv_self.v_l[il]->type); - s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa; - } - - const size_t s_total = ( - size_t_size_size + - s_cell_count_size + - s_layer_count_size + - n_embd_v_gqa_size + - s_cell_data_size - ); - - return s_total; -} - -static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) { +static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - const auto & kv_self = ctx->kv_self; - LM_GGML_ASSERT(!kv_self.recurrent); // not implemented - - // Save the size of size_t as a uint32_t for safety check - const uint32_t size_t_size = sizeof(size_t); - data_ctx.write(&size_t_size, sizeof(size_t_size)); - - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id - { - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; - if (cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv_self.size) { - cell_range_begin = i; - } - } - else { - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; - } - } - } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - LM_GGML_ASSERT(cell_count == cell_count_check); - } - - // Write the cell count - data_ctx.write(&cell_count, sizeof(cell_count)); - - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; - - // Write the layer count - data_ctx.write(&n_layer, sizeof(n_layer)); - - // Write n_embd_v_gqa (reference value) - { - const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_k_s(); - data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - } - - // Iterate the ranges and write all the pos (this is the token position in the prompt) - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - data_ctx.write(&cell.pos, sizeof(cell.pos)); - } - } - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - std::vector tmp_buf; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - data_ctx.write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const size_t k_size_row = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - data_ctx.write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } - - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const size_t v_size_row = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - data_ctx.write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } - } else { - // For the values, they are transposed, so we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const size_t v_size_el = lm_ggml_type_size(kv_self.v_l[il]->type); - data_ctx.write(&v_size_el, sizeof(v_size_el)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - tmp_buf.resize(range_size * v_size_el); - lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } - } - } + data_ctx.write_kv_cache(ctx, seq_id); return data_ctx.get_size_written(); } -size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) { - llama_data_buffer_context data_ctx(dst); +size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { + llama_data_write_dummy data_ctx; return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); } -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { - llama_synchronize(ctx); - - auto & kv_self = ctx->kv_self; - LM_GGML_ASSERT(!kv_self.recurrent); // not implemented - - // Wipe the slot - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - - const uint8_t * inp = src; - - // Read size of size_t - uint32_t size_t_size; - memcpy(&size_t_size, inp, sizeof(size_t_size)); - inp += sizeof(size_t_size); - if (size_t_size != sizeof(size_t)) { - LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__); +size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what()); return 0; } +} - // Read the cell count - uint32_t cell_count; - memcpy(&cell_count, inp, sizeof(cell_count)); - inp += sizeof(cell_count); - - // Read the layer count - uint32_t n_layer_ref; - memcpy(&n_layer_ref, inp, sizeof(n_layer_ref)); - inp += sizeof(n_layer_ref); - - // Read n_embd_v_gqa - uint32_t n_embd_v_gqa_ref; - memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref)); - inp += sizeof(n_embd_v_gqa_ref); +static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { + llama_synchronize(ctx); - // Sanity check model compatibility - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; + data_ctx.read_kv_cache(ctx, dest_seq_id); - if (n_layer != n_layer_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref); - return 0; - } + return data_ctx.get_size_read(); +} - if (hparams.n_embd_v_gqa() != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa(), n_embd_v_gqa_ref); +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what()); return 0; } - - // Allocate the new cells for the slot - if (cell_count) { - llama_batch batch = llama_batch_init(cell_count, 0, 1); - batch.n_tokens = cell_count; - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - memcpy(&pos, inp, sizeof(pos)); - inp += sizeof(pos); - - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = dest_seq_id; - } - if (!llama_kv_cache_find_slot(kv_self, batch)) { - llama_batch_free(batch); - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return 0; - } - - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - LM_GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - LM_GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - LM_GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - LM_GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); - - // Cleanup - llama_batch_free(batch); - } - - const uint32_t kv_size = kv_self.size; - const uint32_t kv_head = kv_self.head; - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref)); - inp += sizeof(k_type_i_ref); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return 0; - } - - // Read row size of key - size_t k_size_row_ref; - memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref)); - inp += sizeof(k_size_row_ref); - const size_t k_size_row = lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il); - return 0; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - lm_ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row); - inp += cell_count * k_size_row; - } - } - - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } - - // Read row size of value - size_t v_size_row_ref; - memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref)); - inp += sizeof(v_size_row_ref); - const size_t v_size_row = lm_ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il); - return 0; - } - - if (cell_count) { - // Read and set the values for the whole cell range - lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row); - inp += cell_count * v_size_row; - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } - - // Read element size of value - size_t v_size_el_ref; - memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref)); - inp += sizeof(v_size_el_ref); - const size_t v_size_el = lm_ggml_type_size(kv_self.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il); - return 0; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_head + j * kv_size) * v_size_el; - lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el); - inp += cell_count * v_size_el; - } - } - } - } - - const size_t nread = inp - src; - - return nread; } static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { @@ -18235,11 +18191,11 @@ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, con file.write_u32(LLAMA_STATE_SEQ_VERSION); // save the prompt - file.write_u32((uint32_t)n_token_count); + file.write_u32((uint32_t) n_token_count); file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_file_context data_ctx(&file); + llama_data_write_file data_ctx(&file); llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); const size_t res = file.tell(); @@ -18277,9 +18233,8 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con // restore the context state { const size_t state_size = file.size - file.tell(); - std::vector state_data(state_size); - file.read_raw(state_data.data(), state_size); - const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id); + llama_data_read_file data_ctx(&file); + const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -18295,7 +18250,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa try { return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what()); return 0; } } @@ -18304,7 +18259,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa try { return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what()); return 0; } } diff --git a/cpp/llama.h b/cpp/llama.h index b2526864..fe1e5818 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -33,17 +33,15 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF -#define LLAMA_MAX_RNG_STATE (64*1024) - #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 7 +#define LLAMA_SESSION_VERSION 8 #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ -#define LLAMA_STATE_SEQ_VERSION 1 +#define LLAMA_STATE_SEQ_VERSION 2 #ifdef __cplusplus extern "C" { @@ -691,10 +689,11 @@ extern "C" { // State / sessions // - // Returns the maximum size in bytes of the state (rng, logits, embedding - // and kv_cache) - will often be smaller after compacting tokens - LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx); - LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx), + // Returns the *actual* size in bytes of the state + // (rng, logits, embedding and kv_cache) + // Only use when saving the state, not when restoring it, otherwise the size may be too small. + LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); + LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), "use llama_state_get_size instead"); // Copies the state to the specified destination address. @@ -702,7 +701,8 @@ extern "C" { // Returns the number of bytes copied LLAMA_API size_t llama_state_get_data( struct llama_context * ctx, - uint8_t * dst); + uint8_t * dst, + size_t size); LLAMA_API DEPRECATED(size_t llama_copy_state_data( struct llama_context * ctx, uint8_t * dst), @@ -712,7 +712,8 @@ extern "C" { // Returns the number of bytes read LLAMA_API size_t llama_state_set_data( struct llama_context * ctx, - const uint8_t * src); + const uint8_t * src, + size_t size); LLAMA_API DEPRECATED(size_t llama_set_state_data( struct llama_context * ctx, const uint8_t * src), @@ -754,6 +755,7 @@ extern "C" { LLAMA_API size_t llama_state_seq_get_data( struct llama_context * ctx, uint8_t * dst, + size_t size, llama_seq_id seq_id); // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence @@ -763,6 +765,7 @@ extern "C" { LLAMA_API size_t llama_state_seq_set_data( struct llama_context * ctx, const uint8_t * src, + size_t size, llama_seq_id dest_seq_id); LLAMA_API size_t llama_state_seq_save_file( diff --git a/llama.cpp b/llama.cpp index 2b1f616b..bc0f887e 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 2b1f616b208a4a21c4ee7a7eb85d822ff1d787af +Subproject commit bc0f887e159c0d78c28121e2c8b5c58094170875 diff --git a/scripts/common.cpp.patch b/scripts/common.cpp.patch index e7631a72..3c793cd1 100644 --- a/scripts/common.cpp.patch +++ b/scripts/common.cpp.patch @@ -1,15 +1,15 @@ ---- common.cpp.orig 2024-05-29 09:16:58 -+++ common.cpp 2024-05-29 09:16:59 -@@ -51,6 +51,12 @@ - #include +--- common.cpp.orig ++++ common.cpp +@@ -52,6 +52,12 @@ #include #endif -+ + +// build info +int LLAMA_BUILD_NUMBER = 0; +char const *LLAMA_COMMIT = "unknown"; +char const *LLAMA_COMPILER = "unknown"; +char const *LLAMA_BUILD_TARGET = "unknown"; - ++ #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data + #endif diff --git a/scripts/common.h.patch b/scripts/common.h.patch index f8c9be2a..8e33f282 100644 --- a/scripts/common.h.patch +++ b/scripts/common.h.patch @@ -1,6 +1,6 @@ ---- common.h.orig 2024-05-29 09:16:58 -+++ common.h 2024-05-29 09:16:59 -@@ -41,6 +41,17 @@ +--- common.h.orig ++++ common.h +@@ -41,6 +41,17 @@ extern char const * LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index 1592f387..00d9061a 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2024-05-29 09:16:58 -+++ ggml-metal.m 2024-05-29 09:16:59 -@@ -336,7 +336,7 @@ +--- ggml-metal.m.orig ++++ ggml-metal.m +@@ -336,7 +336,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { const bool try_metallib = true; #endif diff --git a/scripts/ggml.c.patch b/scripts/ggml.c.patch index 7cf3efe7..bd535d50 100644 --- a/scripts/ggml.c.patch +++ b/scripts/ggml.c.patch @@ -1,7 +1,7 @@ ---- ggml.c.orig 2024-07-27 14:24:52 -+++ ggml.c 2024-07-27 14:25:09 -@@ -144,9 +144,9 @@ - #if defined(__linux__) +--- ggml.c.orig ++++ ggml.c +@@ -191,9 +191,9 @@ static void lm_ggml_print_backtrace_symbols(void) { + #elif defined(__linux__) && defined(__GLIBC__) #include static void lm_ggml_print_backtrace_symbols(void) { - void * trace[100]; diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 1c84810e..94535298 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,6 +1,6 @@ ---- llama.cpp.orig 2024-07-27 14:12:00 -+++ llama.cpp 2024-07-27 14:12:02 -@@ -105,6 +105,17 @@ +--- llama.cpp.orig ++++ llama.cpp +@@ -104,6 +104,17 @@ #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -18,7 +18,7 @@ // // helpers // -@@ -1691,16 +1702,16 @@ +@@ -1689,16 +1700,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory diff --git a/scripts/log.h.patch b/scripts/log.h.patch index e9f0b516..82b3753e 100644 --- a/scripts/log.h.patch +++ b/scripts/log.h.patch @@ -1,6 +1,6 @@ ---- log.h.orig 2024-03-20 12:34:08 -+++ log.h 2024-03-20 12:34:16 -@@ -323,6 +323,19 @@ +--- log.h.orig ++++ log.h +@@ -323,6 +323,19 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") #endif