From 08f35c46a6360af40b89638fa5db248f64502ab8 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Sat, 16 Sep 2023 00:36:47 +0800
Subject: [PATCH] support-mqa-directly

---
 convert-starcoder-hf-to-gguf.py | 21 +--------------------
 llama.cpp                       |  8 ++++----
 2 files changed, 5 insertions(+), 24 deletions(-)

diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py
index 1c56dca9f50f7..fcdf86b3fc4dc 100755
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -109,7 +109,7 @@ def parse_args() -> argparse.Namespace:
 gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-gguf_writer.add_head_count_kv(hparams["n_head"])
+gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 
@@ -209,25 +209,6 @@ def parse_args() -> argparse.Namespace:
 
         data = data.squeeze().numpy()
 
-        # TODO: implement MQA directly, instead of duplicate into MHA.
-        if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"):
-            print("Duplicate K,V heads to use MHA instead of MQA for", name)
-
-            embed_dim = hparams["n_embd"]
-            head_dim = embed_dim // hparams["n_head"]
-
-            # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-            q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
-            # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-            if len(k.shape) == 2:
-                k = np.tile(k, (hparams["n_head"], 1))
-                v = np.tile(v, (hparams["n_head"], 1))
-            elif len(k.shape) == 1:
-                k = np.tile(k, (hparams["n_head"]))
-                v = np.tile(v, (hparams["n_head"]))
-            # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-            data = np.concatenate((q, k, v), axis=0)
-
         # map tensor names
         new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
         if new_name is None:
diff --git a/llama.cpp b/llama.cpp
index 20eb5d5befc88..a4ced9e08b599 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2265,8 +2265,8 @@ static void llm_load_tensors(
                         layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
-                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
-                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {3*n_embd}, backend_split);
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
 
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
                         layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
@@ -3538,8 +3538,8 @@ static struct ggml_cgraph * llm_build_starcoder(
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
 
             struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
 
             struct ggml_tensor * Qcur = tmpq;
             struct ggml_tensor * Kcur = tmpk;
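
Note (not part of the patch): the converter used to tile the single StarCoder K/V head n_head times so the model could be loaded as plain MHA. With this change it writes head_count_kv = 1 and leaves the packed c_attn tensor as-is, and llama.cpp reads the QKV projection as n_embd rows for Q plus n_embd_gqa rows each for K and V, where n_embd_gqa = head_dim * n_head_kv (one head's worth under MQA). Below is a minimal NumPy sketch of that row layout, mirroring the ggml_view_2d offsets in the patched llm_build_starcoder; the sizes are toy values chosen only for illustration.

# Sketch only: Q/K/V row layout of the packed c_attn tensor under MQA.
# Toy sizes; variable names mirror the patch.
import numpy as np

n_embd     = 8                       # hidden size (toy value)
n_head     = 4                       # query heads
n_head_kv  = 1                       # MQA: one shared K/V head
head_dim   = n_embd // n_head
n_embd_gqa = head_dim * n_head_kv    # == head_dim for MQA

# c_attn packs rows as [Q | K | V]: n_embd + 2 * n_embd_gqa rows in total
rows   = n_embd + 2 * n_embd_gqa
c_attn = np.arange(rows * n_embd, dtype=np.float32).reshape(rows, n_embd)

# Same offsets as the patched views: Q at 0, K at n_embd, V at n_embd + n_embd_gqa
q = c_attn[:n_embd]                        # (n_embd,     n_embd)
k = c_attn[n_embd:n_embd + n_embd_gqa]     # (n_embd_gqa, n_embd)
v = c_attn[n_embd + n_embd_gqa:]           # (n_embd_gqa, n_embd)

assert q.shape == (n_embd, n_embd)
assert k.shape == v.shape == (n_embd_gqa, n_embd)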