diff --git a/cpp/build-info.h b/cpp/build-info.h index e6c577c1..a6ba2c9e 100644 --- a/cpp/build-info.h +++ b/cpp/build-info.h @@ -1,8 +1,8 @@ #ifndef BUILD_INFO_H #define BUILD_INFO_H -#define BUILD_NUMBER 1317 -#define BUILD_COMMIT "79f34ab" +#define BUILD_NUMBER 1338 +#define BUILD_COMMIT "1faaae8" #define BUILD_COMPILER "" #define BUILD_TARGET "unknown" diff --git a/cpp/common.cpp b/cpp/common.cpp index bf702819..532184ec 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } + // store the external file name in params + params.prompt_file = argv[i]; std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); @@ -361,7 +363,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back({argv[i], 1.0f}); + params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); params.use_mmap = false; } else if (arg == "--lora-scaled") { if (++i >= argc) { @@ -373,7 +375,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); + params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -616,6 +618,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { process_escapes(params.prompt); process_escapes(params.input_prefix); process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + process_escapes(antiprompt); + } } return true; @@ -1017,10 +1022,11 @@ llama_token llama_sample_token( id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling - llama_sample_top_k (ctx, &cur_p, top_k, 1); - llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); - llama_sample_typical (ctx, &cur_p, typical_p, 1); - llama_sample_top_p (ctx, &cur_p, top_p, 1); + size_t min_keep = std::max(1, params.n_probs); + llama_sample_top_k (ctx, &cur_p, top_k, min_keep); + llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); + llama_sample_typical (ctx, &cur_p, typical_p, min_keep); + llama_sample_top_p (ctx, &cur_p, top_p, min_keep); llama_sample_temp(ctx, &cur_p, temp); { diff --git a/cpp/common.h b/cpp/common.h index e095c56e..c8021527 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -79,6 +79,7 @@ struct gpt_params { std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with diff --git a/cpp/ggml.c b/cpp/ggml.c index 7bd73551..b1e441e0 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -1032,8 +1032,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10) >> 4) << (j + 0); - qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j 
+ qk/2); } memcpy(&y[i].qh, &qh, sizeof(qh)); @@ -1080,8 +1080,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10) >> 4) << (j + 0); - qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); } memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); @@ -4081,12 +4081,16 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_TRANSPOSE_1D", "CONV_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", + "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4112,7 +4116,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 68, "LM_GGML_OP_COUNT != 68"); +static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -4163,12 +4167,16 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_transpose_1d(x)", "conv_2d(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", + "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4194,7 +4202,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 68, "LM_GGML_OP_COUNT != 68"); +static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); @@ -4223,7 +4231,10 @@ static void lm_ggml_setup_op_has_task_pass(void) { p[LM_GGML_OP_DIAG_MASK_INF ] = true; p[LM_GGML_OP_DIAG_MASK_ZERO ] = true; p[LM_GGML_OP_CONV_1D ] = true; + p[LM_GGML_OP_CONV_1D_STAGE_0 ] = true; + p[LM_GGML_OP_CONV_1D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_2D ] = true; + p[LM_GGML_OP_CONV_TRANSPOSE_1D ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_2D ] = true; p[LM_GGML_OP_FLASH_ATTN_BACK ] = true; p[LM_GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -6746,7 +6757,6 @@ struct lm_ggml_tensor * lm_ggml_cont_4d( return result; } - // lm_ggml_reshape struct lm_ggml_tensor * lm_ggml_reshape( @@ -7504,14 +7514,17 @@ static int64_t lm_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - int s0, - int p0, - int d0) { - LM_GGML_ASSERT(lm_ggml_is_matrix(b)); +// im2col: [N, IC, IL] => [N, OL, IC*K] +// a: [OC,IC, K] +// b: [N, IC, IL] +// result: [N, OL, IC*K] +static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_0( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int p0, + int d0) { LM_GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; @@ -7520,16 +7533,54 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( is_node = true; } + const int64_t OL = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - a->ne[2], 1, 1, + a->ne[1] * a->ne[0], + OL, + b->ne[2], + 1, }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 2, ne); + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); int32_t 
params[] = { s0, p0, d0 }; lm_ggml_set_op_params(result, params, sizeof(params)); - result->op = LM_GGML_OP_CONV_1D; + result->op = LM_GGML_OP_CONV_1D_STAGE_0; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// lm_ggml_conv_1d_stage_1 + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// a: [OC, IC, K] +// b: [N, OL, IC * K] +// result: [N, OC, OL] +static struct lm_ggml_tensor * lm_ggml_conv_1d_stage_1( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + LM_GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + a->ne[2], + b->ne[2], + 1, + }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + + result->op = LM_GGML_OP_CONV_1D_STAGE_1; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7537,6 +7588,53 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( return result; } +// lm_ggml_conv_1d + +LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int p0, + int d0) { + struct lm_ggml_tensor * result = lm_ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); + result = lm_ggml_conv_1d_stage_1(ctx, a, result); + return result; +} + +// LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d( +// struct lm_ggml_context * ctx, +// struct lm_ggml_tensor * a, +// struct lm_ggml_tensor * b, +// int s0, +// int p0, +// int d0) { +// LM_GGML_ASSERT(lm_ggml_is_matrix(b)); +// LM_GGML_ASSERT(a->ne[1] == b->ne[1]); +// bool is_node = false; + +// if (a->grad || b->grad) { +// LM_GGML_ASSERT(false); // TODO: implement backward +// is_node = true; +// } + +// const int64_t ne[4] = { +// lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), +// a->ne[2], 1, 1, +// }; +// struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 2, ne); + +// int32_t params[] = { s0, p0, d0 }; +// lm_ggml_set_op_params(result, params, sizeof(params)); + +// result->op = LM_GGML_OP_CONV_1D; +// result->grad = is_node ? 
lm_ggml_dup_tensor(ctx, result) : NULL; +// result->src[0] = a; +// result->src[1] = b; + +// return result; +// } + // lm_ggml_conv_1d_ph struct lm_ggml_tensor* lm_ggml_conv_1d_ph( @@ -7548,6 +7646,50 @@ struct lm_ggml_tensor* lm_ggml_conv_1d_ph( return lm_ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } +// lm_ggml_conv_transpose_1d + +static int64_t lm_ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int p0, + int d0) { + LM_GGML_ASSERT(lm_ggml_is_matrix(b)); + LM_GGML_ASSERT(a->ne[2] == b->ne[1]); + LM_GGML_ASSERT(a->ne[3] == 1); + + LM_GGML_ASSERT(p0 == 0); + LM_GGML_ASSERT(d0 == 1); + + bool is_node = false; + + if (a->grad || b->grad) { + LM_GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + lm_ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + lm_ggml_set_op_params(result, params, sizeof(params)); + + result->op = LM_GGML_OP_CONV_TRANSPOSE_1D; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // lm_ggml_conv_2d struct lm_ggml_tensor * lm_ggml_conv_2d( @@ -12940,7 +13082,7 @@ static void lm_ggml_compute_forward_alibi_f32( return; } - const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); @@ -12961,7 +13103,6 @@ static void lm_ggml_compute_forward_alibi_f32( //const int nb3 = src0->nb[3]; LM_GGML_ASSERT(nb0 == sizeof(float)); - LM_GGML_ASSERT(ne1 + n_past == ne0); LM_GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) @@ -13687,7 +13828,7 @@ static void lm_ggml_compute_forward_rope_back( // lm_ggml_compute_forward_conv_1d -static void lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32( +static void lm_ggml_compute_forward_conv_1d_f16_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -13705,42 +13846,33 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = lm_ggml_up32(ne01); + // size of the convolution row - the kernel size unrolled across all input channels + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - LM_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const lm_ggml_fp16_t * 
const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - lm_ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + lm_ggml_fp16_t * dst_data = wdata; - // prepare source data (src1) - { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - lm_ggml_fp16_t * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = LM_GGML_FP32_TO_FP16(src[idx0]); + } } } } @@ -13753,7 +13885,7 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32( } // total rows in dst - const int nr = ne02; + const int nr = ne2; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13762,23 +13894,22 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - lm_ggml_vec_dot_f16(ew0, &v, - (lm_ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (lm_ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + lm_ggml_vec_dot_f16(ew0, dst_data + i0, + (lm_ggml_fp16_t *) ((char *) src0->data + i1*nb02), + (lm_ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); } } } } -static void lm_ggml_compute_forward_conv_1d_s1_ph_f32( +static void lm_ggml_compute_forward_conv_1d_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -13796,42 +13927,32 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = lm_ggml_up32(ne01); + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - LM_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes LM_GGML_ASSERT(nb00 == sizeof(float)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - float * const wdata = (float *) params->wdata + 0; + float * const wdata = (float *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < 
ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } } } } @@ -13853,35 +13974,242 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - lm_ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + lm_ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, + lm_ggml_fp16_t * A, + lm_ggml_fp16_t * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB + // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(lm_ggml_fp16_t) * K); + // if (blck_size > 0) { + // blck_0 = 4; + // blck_1 = blck_size / blck_0; + // if (blck_1 < 0) { + // blck_1 = 1; + // } + // // blck_0 = (int64_t)sqrt(blck_size); + // // blck_1 = blck_0; + // } + // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + lm_ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } } } } } -static void lm_ggml_compute_forward_conv_1d_s1_ph( +// src0: kernel [OC, IC, K] +// src1: signal [N, IC, IL] +// dst: result [N, OL, IC*K] +static void lm_ggml_compute_forward_conv_1d_stage_0_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { - switch (src0->type) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( 
dst->type == LM_GGML_TYPE_F16); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; + + const int64_t K = ne00; + + const int64_t OL = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == LM_GGML_TASK_INIT) { + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; + + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = LM_GGML_FP32_TO_FP16(src_data[iil]); + } + } + } + } + } + } +} + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// src0: [OC, IC, K] +// src1: [N, OL, IC * K] +// result: [N, OC, OL] +static void lm_ggml_compute_forward_conv_1d_stage_1_f16( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == LM_GGML_TASK_INIT) { + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne12; + const int OL = ne11; + + const int OC = ne02; + const int IC = ne01; + const int K = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OL; + int64_t k = IC * K; + + // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + for (int i = 0; i < N; i++) { + lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] + lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void lm_ggml_compute_forward_conv_1d( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch(src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +static void lm_ggml_compute_forward_conv_1d_stage_0( + const struct lm_ggml_compute_params 
* params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch(src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); } break; default: { @@ -13890,7 +14218,26 @@ static void lm_ggml_compute_forward_conv_1d_s1_ph( } } -static void lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32( +static void lm_ggml_compute_forward_conv_1d_stage_1( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch(src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +// lm_ggml_compute_forward_conv_transpose_1d + +static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -13907,43 +14254,38 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = lm_ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - LM_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - lm_ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + lm_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } } - // prepare source data (src1) + // permute source data (src1) from (L x Cin) to (Cin x L) { - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + nk; + lm_ggml_fp16_t * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - lm_ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = LM_GGML_FP32_TO_FP16(src[i10]); } } } @@ -13955,8 +14297,10 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13965,23 +14309,26 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + lm_ggml_fp16_t * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; 
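// ------------------------------------------------------------------------
// A plain-float reference for the transposed 1D convolution that the new
// lm_ggml_compute_forward_conv_transpose_1d_* kernels below implement with
// permuted F16/F32 work buffers and lm_ggml_vec_dot_* calls. This is only an
// illustrative sketch: the function name, the [Cout][Cin][K] / [Cin][IL]
// layouts and the lack of threading are assumptions made for readability,
// not ggml's actual memory layout. It assumes p0 == 0 and d0 == 1, matching
// the asserts in lm_ggml_conv_transpose_1d, so OL = (IL - 1)*s0 + K.
#include <vector>

static std::vector<float> conv_transpose_1d_ref(
        const std::vector<float> & w,  // kernel, laid out as [Cout][Cin][K]
        const std::vector<float> & x,  // signal, laid out as [Cin][IL]
        int Cout, int Cin, int K, int IL, int s0) {
    const int OL = (IL - 1)*s0 + K;                 // output length for p0 = 0, d0 = 1
    std::vector<float> y((size_t) Cout*OL, 0.0f);   // result, laid out as [Cout][OL]

    for (int oc = 0; oc < Cout; ++oc) {
        for (int il = 0; il < IL; ++il) {
            for (int k = 0; k < K; ++k) {
                // dot over input channels -- the role played by the
                // lm_ggml_vec_dot_* call over ne02 in the kernels below
                float acc = 0.0f;
                for (int ic = 0; ic < Cin; ++ic) {
                    acc += w[((size_t) oc*Cin + ic)*K + k] * x[(size_t) ic*IL + il];
                }
                // scatter-accumulate into the strided output position,
                // mirroring dst_data[i10*s0 + i00] += v below
                y[(size_t) oc*OL + il*s0 + k] += acc;
            }
        }
    }
    return y;
}
// ggml reaches the same result by first permuting the kernel and the source so
// that the input-channel dimension is innermost, which collapses the channel
// loop into a single vector dot product per kernel tap.
// ------------------------------------------------------------------------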
- lm_ggml_vec_dot_f16(ew0, &v, - (lm_ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (lm_ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + lm_ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f16(ne02, &v, + (lm_ggml_fp16_t *) wdata_src + i1n, + (lm_ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( +static void lm_ggml_compute_forward_conv_transpose_1d_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -13998,29 +14345,24 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = lm_ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - LM_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes LM_GGML_ASSERT(nb00 == sizeof(float)); LM_GGML_ASSERT(nb10 == sizeof(float)); if (params->type == LM_GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { float * const wdata = (float *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; + float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; } } } @@ -14028,13 +14370,13 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( // prepare source data (src1) { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + dst_data[i10*ne11 + i11] = src[i10]; } } } @@ -14046,8 +14388,10 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14056,23 +14400,26 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - lm_ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + lm_ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + 
dst_data[i10*s0 + i00] += v; } } } } -static void lm_ggml_compute_forward_conv_1d_s2_ph( +static void lm_ggml_compute_forward_conv_transpose_1d( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, @@ -14080,11 +14427,11 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph( switch (src0->type) { case LM_GGML_TYPE_F16: { - lm_ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + lm_ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); } break; default: { @@ -14093,27 +14440,6 @@ static void lm_ggml_compute_forward_conv_1d_s2_ph( } } -// lm_ggml_compute_forward_conv_1d - -static void lm_ggml_compute_forward_conv_1d( - const struct lm_ggml_compute_params * params, - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - LM_GGML_ASSERT(d0 == 1); // dilation not supported - LM_GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported - if (s0 == 1) { - lm_ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); - } else if (s0 == 2) { - lm_ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); - } else { - LM_GGML_ASSERT(false); // only stride 1 and 2 supported - } -} - // lm_ggml_compute_forward_conv_2d static void lm_ggml_compute_forward_conv_2d_f16_f32( @@ -14156,20 +14482,22 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i12*nb12); - lm_ggml_fp16_t * dst_data = wdata; - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - LM_GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int i13 = 0; i13 < ne13; i13++) { + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); + lm_ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + LM_GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -16452,6 +16780,18 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; + case LM_GGML_OP_CONV_1D_STAGE_0: + { + lm_ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case LM_GGML_OP_CONV_1D_STAGE_1: + { + lm_ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case 
LM_GGML_OP_CONV_TRANSPOSE_1D: + { + lm_ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; case LM_GGML_OP_CONV_2D: { lm_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17377,10 +17717,22 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_CONV_1D_STAGE_0: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; + case LM_GGML_OP_CONV_1D_STAGE_1: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_CONV_2D: { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { LM_GGML_ASSERT(false); // TODO: not implemented @@ -18222,21 +18574,68 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th LM_GGML_ASSERT(node->src[1]->ne[2] == 1); LM_GGML_ASSERT(node->src[1]->ne[3] == 1); + const int64_t ne00 = node->src[0]->ne[0]; + const int64_t ne01 = node->src[0]->ne[1]; + const int64_t ne02 = node->src[0]->ne[2]; + + const int64_t ne10 = node->src[1]->ne[0]; + const int64_t ne11 = node->src[1]->ne[1]; + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t nk = ne00; + const int64_t ew0 = nk * ne01; + + UNUSED(ne02); + UNUSED(ne10); + UNUSED(ne11); + size_t cur = 0; - const int nk = node->src[0]->ne[0]; if (node->src[0]->type == LM_GGML_TYPE_F16 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(lm_ggml_fp16_t)*( - nk*lm_ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == LM_GGML_TYPE_F32) { + cur = sizeof(lm_ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == LM_GGML_TYPE_F32 && + node->src[1]->type == LM_GGML_TYPE_F32) { + cur = sizeof(float)*(ne0*ne1*ew0); + } else { + LM_GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case LM_GGML_OP_CONV_1D_STAGE_0: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_1D_STAGE_1: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + + LM_GGML_ASSERT(node->src[0]->ne[3] == 1); + LM_GGML_ASSERT(node->src[1]->ne[2] == 1); + LM_GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + size_t cur = 0; + if (node->src[0]->type == LM_GGML_TYPE_F16 && + node->src[1]->type == LM_GGML_TYPE_F32) { + cur += sizeof(lm_ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(lm_ggml_fp16_t)*ne10*ne11; } else if (node->src[0]->type == LM_GGML_TYPE_F32 && - node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*lm_ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == LM_GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; } else { LM_GGML_ASSERT(false); } @@ -19362,7 +19761,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - break; + return LM_GGML_OPT_CANCEL; } } // lm_ggml_graph_reset (gf); @@ -19371,9 +19770,6 @@ static enum 
lm_ggml_opt_result lm_ggml_opt_adam( lm_ggml_opt_acc_grad(np, ps, g, accum_norm); fx += lm_ggml_get_f32_1d(f, 0); } - if (cancel) { - return LM_GGML_OPT_DID_NOT_CONVERGE; - } fx *= accum_norm; opt->adam.fx_prev = fx; @@ -19399,9 +19795,6 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { - if (cancel) { - break; - } opt->iter = iter0 + t + 1; LM_GGML_PRINT_DEBUG ("=== iter %d ===\n", t); @@ -19459,7 +19852,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - break; + return LM_GGML_OPT_CANCEL;; } } // lm_ggml_graph_reset (gf); @@ -19468,9 +19861,6 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( lm_ggml_opt_acc_grad(np, ps, g, accum_norm); fx += lm_ggml_get_f32_1d(f, 0); } - if (cancel) { - break; - } fx *= accum_norm; opt->loss_after = fx; @@ -19589,7 +19979,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( finit = *fx; dgtest = params->lbfgs.ftol*dginit; - while (!*cancel) { + while (true) { lm_ggml_vec_cpy_f32(nx, x, xp); lm_ggml_vec_mad_f32(nx, x, d, *step); @@ -19605,7 +19995,7 @@ static enum lm_ggml_opt_result linesearch_backtracking( float sched = 0; callback(callback_data, accum_step, &sched, cancel); if (*cancel) { - break; + return LM_GGML_OPT_CANCEL; } } // lm_ggml_graph_reset (gf); @@ -19614,9 +20004,6 @@ static enum lm_ggml_opt_result linesearch_backtracking( lm_ggml_opt_acc_grad(np, ps, g, accum_norm); *fx += lm_ggml_get_f32_1d(f, 0); } - if (*cancel) { - break; - } *fx *= accum_norm; } @@ -19749,7 +20136,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( float sched = 0; callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - break; + return LM_GGML_OPT_CANCEL; } } // lm_ggml_graph_reset (gf); @@ -19758,9 +20145,6 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( lm_ggml_opt_acc_grad(np, ps, g, accum_norm); fx += lm_ggml_get_f32_1d(f, 0); } - if (cancel) { - return LM_GGML_OPT_DID_NOT_CONVERGE; - } fx *= accum_norm; opt->loss_before = fx; @@ -19820,8 +20204,8 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs( lm_ggml_vec_cpy_f32(nx, gp, g); ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); - if (!cancel) { - break; + if (cancel) { + return LM_GGML_OPT_CANCEL; } if (ls < 0) { diff --git a/cpp/ggml.h b/cpp/ggml.h index a7f679e4..9ba353ac 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -401,10 +401,14 @@ extern "C" { LM_GGML_OP_CLAMP, LM_GGML_OP_CONV_1D, LM_GGML_OP_CONV_2D, + LM_GGML_OP_CONV_TRANSPOSE_1D, LM_GGML_OP_CONV_TRANSPOSE_2D, LM_GGML_OP_POOL_1D, LM_GGML_OP_POOL_2D, + LM_GGML_OP_CONV_1D_STAGE_0, // internal + LM_GGML_OP_CONV_1D_STAGE_1, // internal + LM_GGML_OP_UPSCALE, // nearest interpolate LM_GGML_OP_FLASH_ATTN, @@ -1386,6 +1390,14 @@ extern "C" { int s, int d); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int p0, + int d0); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -1759,6 +1771,7 @@ extern "C" { LM_GGML_OPT_NO_CONTEXT, LM_GGML_OPT_INVALID_WOLFE, LM_GGML_OPT_FAIL, + LM_GGML_OPT_CANCEL, LM_GGML_LINESEARCH_FAIL = -128, LM_GGML_LINESEARCH_MINIMUM_STEP, diff --git a/cpp/k_quants.c b/cpp/k_quants.c index 9ed6cf6b..57548b90 100644 --- a/cpp/k_quants.c +++ b/cpp/k_quants.c @@ -69,7 +69,6 @@ inline static int32_t 
vaddvq_s32(int32x4_t v) { // 2-6 bit quantization in super-blocks // - // // ===================== Helper functions // @@ -348,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const float q4scale = 15.f; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 073f266c..4b89e86e 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -125,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std:: } s = std::move(result); } + +static bool is_float_close(float a, float b, float abs_tol) { + // Check for non-negative tolerance + if (abs_tol < 0.0) { + throw std::invalid_argument("Tolerance must be non-negative"); + } + + // Exact equality check + if (a == b) { + return true; + } + + // Check for infinities + if (std::isinf(a) || std::isinf(b)) { + return false; + } + + // Regular comparison using the provided absolute tolerance + return std::fabs(b - a) <= abs_tol; +} + #ifdef LM_GGML_USE_CPU_HBM #include #endif @@ -165,6 +186,7 @@ enum llm_arch { LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, LLM_ARCH_STARCODER, + LLM_ARCH_REFACT, LLM_ARCH_UNKNOWN, }; @@ -177,6 +199,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_REFACT, "refact" }, }; enum llm_kv { @@ -397,6 +420,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_REFACT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -950,7 +990,24 @@ struct llama_hparams { float rope_freq_scale_train; bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; } uint32_t n_gqa() const { @@ -1025,6 +1082,9 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. 
llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. uint32_t head = 0; uint32_t size = 0; @@ -1282,6 +1342,8 @@ static bool llama_kv_cache_init( // find an empty slot of size "n_tokens" in the cache // updates the cache head +// Note: On success, it's important that cache.head points +// to the first cell of the slot. static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_batch & batch) { @@ -1297,8 +1359,8 @@ static bool llama_kv_cache_find_slot( while (true) { if (cache.head + n_tokens > n_ctx) { + n_tested += n_ctx - cache.head; cache.head = 0; - n_tested += n_ctx - cache.head; continue; } @@ -1349,6 +1411,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } + + // Searching for a free slot can start here since we know it will be empty. + cache.head = uint32_t(c0); } static void llama_kv_cache_seq_rm( @@ -1356,6 +1421,8 @@ static void llama_kv_cache_seq_rm( llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = cache.size; + if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); @@ -1364,9 +1431,13 @@ static void llama_kv_cache_seq_rm( cache.cells[i].seq_id.erase(seq_id); if (cache.cells[i].seq_id.empty()) { cache.cells[i].pos = -1; + if (new_head == cache.size) new_head = i; } } } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cache.size) cache.head = new_head; } static void llama_kv_cache_seq_cp( @@ -1378,6 +1449,8 @@ static void llama_kv_cache_seq_cp( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + cache.head = 0; + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].seq_id.insert(seq_id_dst); @@ -1386,12 +1459,18 @@ static void llama_kv_cache_seq_cp( } static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { + uint32_t new_head = cache.size; + for (uint32_t i = 0; i < cache.size; ++i) { if (!cache.cells[i].has_seq_id(seq_id)) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); + if (new_head == cache.size) new_head = i; } } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cache.size) cache.head = new_head; } static void llama_kv_cache_seq_shift( @@ -1400,6 +1479,8 @@ static void llama_kv_cache_seq_shift( llama_pos p0, llama_pos p1, llama_pos delta) { + uint32_t new_head = cache.size; + if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); @@ -1409,12 +1490,17 @@ static void llama_kv_cache_seq_shift( if (cache.cells[i].pos < 0) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); + if (new_head == cache.size) new_head = i; } else { cache.has_shift = true; cache.cells[i].delta = delta; } } } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + cache.head = new_head != cache.size ? 
new_head : 0; } // @@ -1927,6 +2013,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_REFACT: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -2164,6 +2258,7 @@ static void llm_load_tensors( const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_REFACT: { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); @@ -3357,6 +3452,353 @@ static struct lm_ggml_cgraph * llm_build_baichaun( return gf; } +static struct lm_ggml_cgraph * llm_build_refact( + llama_context & lctx, + const llama_batch & batch) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + LM_GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = lm_ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + + // printf("n_kv = %d\n", n_kv); + + auto & buf_compute = lctx.buf_compute; + + struct lm_ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct lm_ggml_context * ctx0 = lm_ggml_init(params); + + lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; + + if (batch.token) { + struct lm_ggml_tensor * inp_tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + + lm_ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, batch.token, n_tokens*lm_ggml_element_size(inp_tokens)); + } + lm_ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = lm_ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef LM_GGML_USE_MPI + LM_GGML_ASSERT(false && "not implemented"); +#endif + + inpL = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_tokens); + + lm_ggml_allocr_alloc(lctx.alloc, inpL); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, batch.embd, n_tokens * n_embd * lm_ggml_element_size(inpL)); + } + } + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef LM_GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = lm_ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = lm_ggml_cuda_assign_buffers_no_alloc; + } + if 
(n_gpu_layers > n_layer + 2) { + offload_func_kq = lm_ggml_cuda_assign_buffers_no_alloc; + } +#endif // LM_GGML_USE_CUBLAS + + // KQ_scale + struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); + lm_ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + lm_ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); + } + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + lm_ggml_set_name(KQ_mask, "KQ_mask"); + lm_ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, lm_ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + for (int il = 0; il < n_layer; ++il) { + lm_ggml_format_name(inpL, "layer_inp_%d", il); + + offload_func_t offload_func = llama_nop; + +#ifdef LM_GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = lm_ggml_cuda_assign_buffers_no_alloc; + } +#endif // LM_GGML_USE_CUBLAS + + struct lm_ggml_tensor * inpSA = inpL; + + // norm + { + cur = lm_ggml_rms_norm(ctx0, inpL, norm_rms_eps); + offload_func(cur); + lm_ggml_set_name(cur, "rms_norm_0"); + + // cur = cur*attn_norm(broadcasted) + cur = lm_ggml_mul(ctx0, cur, model.layers[il].attn_norm); + offload_func(cur); + lm_ggml_set_name(cur, "attention_norm_0"); + } + + // self-attention + { + // compute Q and K + struct lm_ggml_tensor * tmpk = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + offload_func_kq(tmpk); + lm_ggml_set_name(tmpk, "tmpk"); + + struct lm_ggml_tensor * tmpq = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + lm_ggml_set_name(tmpq, "tmpq"); + + struct lm_ggml_tensor * Kcur = lm_ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); + offload_func_kq(Kcur); + lm_ggml_set_name(Kcur, "Kcur"); + + struct lm_ggml_tensor * Qcur = lm_ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + offload_func_kq(Qcur); + lm_ggml_set_name(Qcur, "Qcur"); + + // store key and value to memory + { + // compute the transposed [n_tokens, n_embd] V matrix + + struct lm_ggml_tensor * tmpv = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + offload_func_v(tmpv); + lm_ggml_set_name(tmpv, "tmpv"); + + struct lm_ggml_tensor * Vcur = lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); + offload_func_v(Vcur); + lm_ggml_set_name(Vcur, "Vcur"); + + struct lm_ggml_tensor * k = lm_ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (lm_ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + offload_func_kq(k); + lm_ggml_set_name(k, "k"); + + struct lm_ggml_tensor * v = lm_ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*lm_ggml_element_size(kv_self.v), + (il*n_ctx)*lm_ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*lm_ggml_element_size(kv_self.v)); + offload_func_v(v); + lm_ggml_set_name(v, "v"); + + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Kcur, k)); + lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, Vcur, v)); + } + + struct lm_ggml_tensor * Q = lm_ggml_permute(ctx0, 
Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + lm_ggml_set_name(Q, "Q"); + + struct lm_ggml_tensor * K = + lm_ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_kv, n_head_kv, + lm_ggml_element_size(kv_self.k)*n_embd_gqa, + lm_ggml_element_size(kv_self.k)*n_embd_head, + lm_ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + lm_ggml_set_name(K, "K"); + + // K * Q + struct lm_ggml_tensor * KQ = lm_ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + lm_ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_kv, n_tokens, n_head, 1] + struct lm_ggml_tensor * KQ_scaled = lm_ggml_scale(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + lm_ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct lm_ggml_tensor * KQ_scaled_alibi = lm_ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); + lm_ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + + struct lm_ggml_tensor * KQ_masked = lm_ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + offload_func_kq(KQ_masked); + lm_ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct lm_ggml_tensor * KQ_soft_max = lm_ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + lm_ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct lm_ggml_tensor * V = + lm_ggml_view_3d(ctx0, kv_self.v, + n_kv, n_embd_head, n_head_kv, + lm_ggml_element_size(kv_self.v)*n_ctx, + lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + lm_ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + lm_ggml_set_name(V, "V"); + +#if 1 + struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + lm_ggml_set_name(KQV, "KQV"); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? 
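// ------------------------------------------------------------------------
// Aside on positions: this Refact graph applies no RoPE. The positional
// signal comes from the lm_ggml_alibi(ctx0, KQ_scaled, 0, n_head, 8) call
// above, which adds a per-head linear bias to the attention scores before
// the mask and the softmax. A minimal sketch of that bias for one head
// follows; it is illustrative only -- alibi_slope/alibi_bias are made-up
// names, and the closed form m_h = 2^(-max_bias*h/n_head) assumes n_head is
// a power of two (ggml interpolates extra slopes otherwise).
#include <cmath>

static float alibi_slope(int head /* 1-based */, int n_head, float max_bias) {
    // lower-numbered heads get steeper slopes and therefore attend more locally
    return std::pow(2.0f, -max_bias * (float) head / (float) n_head);
}

static float alibi_bias(int head, int n_head, int q_pos, int k_pos) {
    // bias added to the raw score of query position q_pos attending to key
    // position k_pos; max_bias = 8 matches the last argument of lm_ggml_alibi above
    return alibi_slope(head, n_head, 8.0f) * (float) (k_pos - q_pos);
}
// Because softmax is shift-invariant within a query row, adding m_h*k_pos (as
// ggml does) and adding m_h*(k_pos - q_pos) (as in the ALiBi paper) give the
// same attention weights.
// ------------------------------------------------------------------------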
+            struct lm_ggml_tensor * V_cont = lm_ggml_cpy(ctx0, V, lm_ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct lm_ggml_tensor * KQV = lm_ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct lm_ggml_tensor * KQV_merged = lm_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            lm_ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = lm_ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            lm_ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = lm_ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            lm_ggml_set_name(cur, "result_wo");
+        }
+
+        struct lm_ggml_tensor * inpFF = lm_ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        lm_ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = lm_ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                lm_ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                lm_ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct lm_ggml_tensor * tmp = lm_ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            lm_ggml_set_name(tmp, "result_w3");
+
+            cur = lm_ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            lm_ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = lm_ggml_silu(ctx0, cur);
+            offload_func(cur);
+            lm_ggml_set_name(cur, "silu");
+
+            cur = lm_ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            lm_ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = lm_ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            lm_ggml_set_name(cur, "result_w2");
+        }
+
+        cur = lm_ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        lm_ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = lm_ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        lm_ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = lm_ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        lm_ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = lm_ggml_mul_mat(ctx0, model.output, cur);
+    lm_ggml_set_name(cur, "result_output");
+
+    lm_ggml_build_forward_expand(gf, cur);
+
+    lm_ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct lm_ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_batch & batch) {
@@ -3997,6 +4439,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm_build_refact(lctx, batch);
+            } break;
         default:
             LM_GGML_ASSERT(false);
     }
@@ -4075,10 +4521,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }
 
-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4130,7 +4572,8 @@ static int llama_decode_internal(
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON;
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (lm_ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4163,8 +4606,12 @@ static int llama_decode_internal(
 #endif
 
     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }
 
 #ifdef LM_GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -7801,14 +8248,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
 
     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp
index 85a3443f..996bf1b3 100644
--- a/cpp/rn-llama.hpp
+++ b/cpp/rn-llama.hpp
@@ -359,98 +359,20 @@ struct llama_rn_context
         return result;
     }
 
-        // out of user input, sample next token
-        const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
-        const float top_p = params.top_p;
-        const float tfs_z = params.tfs_z;
-        const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
-        const float repeat_penalty = params.repeat_penalty;
-        const float alpha_presence = params.presence_penalty;
-        const float alpha_frequency = params.frequency_penalty;
-        const int mirostat = params.mirostat;
-        const float mirostat_tau = params.mirostat_tau;
-        const float mirostat_eta = params.mirostat_eta;
-        const bool penalize_nl = params.penalize_nl;
-        const int32_t n_probs = params.n_probs;
-        {
-            auto *logits = llama_get_logits(ctx);
-            auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-            // Apply params.logit_bias map
-            for (const auto &it : params.logit_bias)
-            {
-                logits[it.first] += it.second;
-            }
-
+        // out of user input, sample next token
         std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
-            {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-            }
+        candidates.reserve(llama_n_vocab(model));
 
-            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
-
-            // Apply penalties
-            float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
-            llama_sample_repetition_penalty(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-            if (!penalize_nl)
-            {
-                logits[llama_token_nl(ctx)] = nl_logit;
-            }
+        result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
 
-            if (grammar != nullptr) {
-                llama_sample_grammar(ctx, &candidates_p, grammar);
-            }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-            if (temp <= 0)
+        const int32_t n_probs = params.n_probs;
+        if (params.temp <= 0 && n_probs > 0)
         {
-                // Greedy sampling
-                result.tok = llama_sample_token_greedy(ctx, &candidates_p);
-                if (n_probs > 0)
-                {
-                    llama_sample_softmax(ctx, &candidates_p);
-                }
-            }
-            else
-            {
-                if (mirostat == 1)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    const int mirostat_m = 100;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                }
-                else if (mirostat == 2)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                }
-                else
-                {
-                    // Temperature sampling
-                    size_t min_keep = std::max(1, n_probs);
-                    llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
-                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
-                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
-                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token(ctx, &candidates_p);
-                }
-            }
-
-            if (grammar != nullptr) {
-                llama_grammar_accept_token(ctx, grammar, result.tok);
+            // For llama_sample_token_greedy we need to sort candidates
+            llama_sample_softmax(ctx, &candidates_p);
         }
 
         for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
diff --git a/llama.cpp b/llama.cpp
index 79f34abd..1faaae8c 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 79f34abddb72ac5ddbf118f3d87520b611a10a7d
+Subproject commit 1faaae8c2bdc4a21302e367e0754c3fe74a8113e
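Aside (illustrative, not part of the patch): in the cpp/rn-llama.hpp hunk above, the greedy path (params.temp <= 0) still calls llama_sample_softmax() whenever n_probs > 0, because llama_sample_token_greedy() only picks the arg-max and leaves the candidate array unsorted and without normalized probabilities, so the top-n_probs values reported to the caller would be meaningless without that extra pass. A minimal standalone sketch of the same idea is below; top_probs() is a hypothetical helper written for this note, not part of the llama.cpp API.

    #include <algorithm>
    #include <cmath>
    #include <functional>
    #include <vector>

    // Illustrative helper: softmax over raw logits, then return the k largest
    // probabilities in descending order -- the same reason the greedy path
    // above keeps llama_sample_softmax() when n_probs > 0.
    static std::vector<float> top_probs(std::vector<float> logits, size_t k) {
        if (logits.empty()) return logits;
        const float max_logit = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float & l : logits) { l = std::exp(l - max_logit); sum += l; } // stable exp
        for (float & l : logits) { l /= sum; }                              // normalize to probabilities
        std::sort(logits.begin(), logits.end(), std::greater<float>());     // like a sorted candidate array
        if (k < logits.size()) logits.resize(k);
        return logits;
    }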
diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch
index 99a90400..98ed383f 100644
--- a/scripts/llama.cpp.patch
+++ b/scripts/llama.cpp.patch
@@ -1,6 +1,6 @@
---- llama.cpp.orig 2023-10-04 08:41:11
-+++ llama.cpp 2023-10-04 08:41:12
-@@ -649,16 +649,16 @@
+--- llama.cpp.orig 2023-10-07 10:01:30
++++ llama.cpp 2023-10-07 10:01:32
+@@ -689,16 +689,16 @@
   if (prefetch > 0) {
          // Advise the kernel to preload the mapped memory
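Aside (illustrative, not part of the patch): the llama_decode_internal() hunk earlier in this diff turns kv_self.head into a ring-buffer cursor -- it advances by n_tokens and wraps back to 0 once it reaches kv_self.size, so the next free-slot search always starts from a valid index. A minimal sketch of that update rule, with KvCache as a hypothetical stand-in for the real llama_kv_cache struct:

    #include <cstdint>

    // Hypothetical stand-in for the two cache fields the patch touches.
    struct KvCache {
        uint32_t head = 0; // index where the next batch of tokens will be written
        uint32_t size = 0; // total number of cells in the cache
    };

    // Mirror of the "Ensure kv cache head points to a valid index" change:
    // advance the write cursor and wrap it once it runs past the end.
    static void advance_head(KvCache & kv, uint32_t n_tokens) {
        kv.head += n_tokens;
        if (kv.head >= kv.size) {
            kv.head = 0; // wrap around; the next slot search starts from the beginning
        }
    }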