diff --git a/android/src/main/CMakeLists.txt b/android/src/main/CMakeLists.txt
index cc41ebf4..e38f9116 100644
--- a/android/src/main/CMakeLists.txt
+++ b/android/src/main/CMakeLists.txt
@@ -13,6 +13,7 @@ set(
     ${RNLLAMA_LIB_DIR}/k_quants.c
     ${RNLLAMA_LIB_DIR}/common.cpp
     ${RNLLAMA_LIB_DIR}/grammar-parser.cpp
+    ${RNLLAMA_LIB_DIR}/sampling.cpp
     ${RNLLAMA_LIB_DIR}/llama.cpp
     ${RNLLAMA_LIB_DIR}/rn-llama.hpp
     ${CMAKE_SOURCE_DIR}/jni.cpp
diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp
index 434b2d3a..fa42043d 100644
--- a/android/src/main/jni.cpp
+++ b/android/src/main/jni.cpp
@@ -299,7 +299,6 @@ Java_com_rnllama_LlamaContext_doCompletion(
     llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
     llama->params.grammar = env->GetStringUTFChars(grammar, nullptr);
-    llama->params.temp = temperature;
 
     int max_threads = std::thread::hardware_concurrency();
     // Use 2 threads by default on 4-core devices, 4 threads on more cores
@@ -307,32 +306,26 @@ Java_com_rnllama_LlamaContext_doCompletion(
     llama->params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
 
     llama->params.n_predict = n_predict;
-    llama->params.n_probs = n_probs;
-    llama->params.repeat_last_n = repeat_last_n;
-    llama->params.repeat_penalty = repeat_penalty;
-    llama->params.presence_penalty = presence_penalty;
-    llama->params.frequency_penalty = frequency_penalty;
-    llama->params.mirostat = mirostat;
-    llama->params.mirostat_tau = mirostat_tau;
-    llama->params.mirostat_eta = mirostat_eta;
-    llama->params.top_k = top_k;
-    llama->params.top_p = top_p;
-    llama->params.tfs_z = tfs_z;
-    llama->params.typical_p = typical_p;
     llama->params.ignore_eos = ignore_eos;
 
-    llama->params.antiprompt.clear();
-    int stop_len = env->GetArrayLength(stop);
-    for (int i = 0; i < stop_len; i++) {
-        jstring stop_str = (jstring) env->GetObjectArrayElement(stop, i);
-        const char *stop_chars = env->GetStringUTFChars(stop_str, nullptr);
-        llama->params.antiprompt.push_back(stop_chars);
-        env->ReleaseStringUTFChars(stop_str, stop_chars);
-    }
-
-    llama->params.logit_bias.clear();
+    auto & sparams = llama->params.sampling_params;
+    sparams.temp = temperature;
+    sparams.repeat_last_n = repeat_last_n;
+    sparams.repeat_penalty = repeat_penalty;
+    sparams.presence_penalty = presence_penalty;
+    sparams.frequency_penalty = frequency_penalty;
+    sparams.mirostat = mirostat;
+    sparams.mirostat_tau = mirostat_tau;
+    sparams.mirostat_eta = mirostat_eta;
+    sparams.top_k = top_k;
+    sparams.top_p = top_p;
+    sparams.tfs_z = tfs_z;
+    sparams.typical_p = typical_p;
+    sparams.n_probs = n_probs;
+
+    sparams.logit_bias.clear();
     if (ignore_eos) {
-        llama->params.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
     }
 
     const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
@@ -346,9 +339,9 @@ Java_com_rnllama_LlamaContext_doCompletion(
             llama_token tok = static_cast<llama_token>(doubleArray[0]);
             if (tok >= 0 && tok < n_vocab) {
                 if (doubleArray[1] != 0) {  // If the second element is not false (0)
-                    llama->params.logit_bias[tok] = doubleArray[1];
+                    sparams.logit_bias[tok] = doubleArray[1];
                 } else {
-                    llama->params.logit_bias[tok] = -INFINITY;
+                    sparams.logit_bias[tok] = -INFINITY;
                 }
             }
 
@@ -357,6 +350,15 @@ Java_com_rnllama_LlamaContext_doCompletion(
         env->DeleteLocalRef(el);
     }
 
+    llama->params.antiprompt.clear();
+    int stop_len = env->GetArrayLength(stop);
+    for (int i = 0; i < stop_len; i++) {
+        jstring stop_str = (jstring) env->GetObjectArrayElement(stop, i);
+        const char *stop_chars = env->GetStringUTFChars(stop_str, nullptr);
+        llama->params.antiprompt.push_back(stop_chars);
+        env->ReleaseStringUTFChars(stop_str, stop_chars);
+    }
+
     if (!llama->loadGrammar()) {
         auto result = createWriteableMap(env);
         putString(env, result, "error", "Failed to load grammar");
@@ -408,7 +410,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
             auto tokenResult = createWriteableMap(env);
             putString(env, tokenResult, "token", to_send.c_str());
 
-            if (llama->params.n_probs > 0) {
+            if (llama->params.sampling_params.n_probs > 0) {
                 const std::vector<llama_token> to_send_toks = llama_tokenize(llama->ctx, to_send, false);
                 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
                 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
diff --git a/cpp/build-info.h b/cpp/build-info.h
index 9495f1fe..93f808c2 100644
--- a/cpp/build-info.h
+++ b/cpp/build-info.h
@@ -1,8 +1,8 @@
 #ifndef BUILD_INFO_H
 #define BUILD_INFO_H
 
-#define BUILD_NUMBER 1364
-#define BUILD_COMMIT "9f6ede1"
+#define BUILD_NUMBER 1378
+#define BUILD_COMMIT "1e0e873"
 
 #define BUILD_COMPILER ""
 #define BUILD_TARGET "unknown"
diff --git a/cpp/common.cpp b/cpp/common.cpp
index 93e307da..077cc959 100644
--- a/cpp/common.cpp
+++ b/cpp/common.cpp
@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
         } else if (arg == "--typical") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat-last-n") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
         } else if (arg == "--mirostat-lr") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
         } else if (arg == "--mirostat-ent") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "--cfg-negative-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
         } else if (arg == "--cfg-negative-prompt-file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+                sparams.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -383,6 +384,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--mmproj") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mmproj = argv[i];
+        } else if (arg == "--image") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.image = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -512,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -524,7 +537,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::string value_str;
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                 } else {
                     throw std::exception();
                 }
@@ -627,6 +640,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -659,19 +674,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
TOKEN_ID(+/-)BIAS\n"); printf(" modifies the likelihood of token appearing in the completion,\n"); printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); @@ -682,7 +697,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" negative prompt to use for guidance. (default: empty)\n"); printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); - printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); @@ -690,7 +705,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); - printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); + printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); @@ -700,6 +715,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); + printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -840,7 +857,7 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; } { @@ -932,127 +949,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & last_tokens, - std::vector & candidates, - int idx) { - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
-    const float repeat_penalty = params.repeat_penalty;
-    const float alpha_presence = params.presence_penalty;
-    const float alpha_frequency = params.frequency_penalty;
-    const int mirostat = params.mirostat;
-    const float mirostat_tau = params.mirostat_tau;
-    const float mirostat_eta = params.mirostat_eta;
-    const bool penalize_nl = params.penalize_nl;
-
-    llama_token id = 0;
-
-    float * logits = llama_get_logits_ith(ctx, idx);
-
-    // Apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    candidates.clear();
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
-
-    if (ctx_guidance) {
-        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
-    }
-
-    // apply penalties
-    if (!last_tokens.empty()) {
-        const float nl_logit = logits[llama_token_nl(ctx)];
-        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
-
-        llama_sample_repetition_penalty(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    if (grammar != NULL) {
-        llama_sample_grammar(ctx, &cur_p, grammar);
-    }
-
-    if (temp <= 0) {
-        // Greedy sampling
-        id = llama_sample_token_greedy(ctx, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-        } else if (mirostat == 2) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-        } else {
-            // Temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-            llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
-            llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
-            llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
-            llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
-            llama_sample_temp(ctx, &cur_p, temp);
-
-            {
-                const int n_top = 10;
-                LOG("top %d candidates:\n", n_top);
-
-                for (int i = 0; i < n_top; i++) {
-                    const llama_token id = cur_p.data[i].id;
-                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
-                }
-            }
-
-            id = llama_sample_token(ctx, &cur_p);
-
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-    // printf("`%d`", candidates_p.size);
-
-    if (grammar != NULL) {
-        llama_grammar_accept_token(ctx, grammar, id);
-    }
-
-    return id;
-}
-
 //
 // YAML utils
 //
@@ -1204,6 +1100,8 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); fprintf(stream, "build_number: %d\n", BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); @@ -1250,21 +1148,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str()); - fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale); + dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); - const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); @@ -1277,7 +1175,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); fprintf(stream, "logit_bias:\n"); - for (std::pair lb : params.logit_bias) { + for (std::pair lb : sparams.logit_bias) { if (ignore_eos && lb.first == logit_bias_eos->first) { continue; } @@ -1301,30 +1199,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? 
"true" : "false"); fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); - fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false"); + fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { @@ -1342,15 +1240,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", params.temp); + fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); - fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z); + fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", params.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p); - fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p); + fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? 
"true" : "false"); } diff --git a/cpp/common.h b/cpp/common.h index c8021527..36fd4416 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -4,6 +4,8 @@ #include "llama.h" +#include "sampling.h" + #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" @@ -49,31 +51,12 @@ struct gpt_params { int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - // sampling parameters - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - - std::unordered_map logit_bias; // logit bias for specific tokens - - // Classifier-Free Guidance - // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + // // sampling parameters + struct llama_sampling_params sampling_params; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -115,13 +98,16 @@ struct gpt_params { bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens bool instruct = false; // instruction mode (used for Alpaca models) - bool penalize_nl = true; // consider newlines as a repeatable token bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool infill = false; // use infill mode + + // multimodal models (see examples/llava) + std::string mmproj = ""; // path to multimodal projector + std::string image = ""; // path to an image file }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); @@ -180,36 +166,6 @@ std::string llama_detokenize_bpe( llama_context * ctx, const std::vector & tokens); -// -// Sampling utils -// - -// this is a common sampling function used across the examples for convenience -// it can serve as a starting point for implementing your own sampling function -// -// required: -// - ctx: context to use for sampling -// - params: sampling parameters -// -// optional: -// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL -// - grammar: grammar to use for sampling, ignore if NULL -// - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits_ith(ctx, idx) 
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sample_token(
-        struct llama_context * ctx,
-        struct llama_context * ctx_guidance,
-        struct llama_grammar * grammar,
-        const struct gpt_params & params,
-        const std::vector<llama_token> & last_tokens,
-        std::vector<llama_token_data> & candidates,
-        int idx = 0);
-
 //
 // YAML utils
 //
diff --git a/cpp/ggml.c b/cpp/ggml.c
index 9a247103..9fc1bfd9 100644
--- a/cpp/ggml.c
+++ b/cpp/ggml.c
@@ -14428,7 +14428,7 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32(
     int64_t t0 = lm_ggml_perf_time_us();
     UNUSED(t0);
 
-    LM_GGML_TENSOR_BINARY_OP_LOCALS
+    LM_GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp
index 996bf1b3..5f7ef230 100644
--- a/cpp/rn-llama.hpp
+++ b/cpp/rn-llama.hpp
@@ -144,6 +144,7 @@ struct llama_rn_context
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     gpt_params params;
+    llama_sampling_context ctx_sampling;
 
     grammar_parser::parse_state parsed_grammar;
     llama_grammar *grammar = nullptr;
@@ -191,6 +192,7 @@ struct llama_rn_context
         if (grammar != nullptr) {
             llama_grammar_free(grammar);
             grammar = nullptr;
+            ctx_sampling = llama_sampling_context_init(params, NULL);
         }
     }
 
@@ -221,8 +223,8 @@ struct llama_rn_context
             grammar_parser::print_grammar(stderr, parsed_grammar);
 
             {
-                auto it = params.logit_bias.find(llama_token_eos(ctx));
-                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
                     LOG_WARNING("EOS token is disabled, which will cause most grammars to fail");
                 }
             }
@@ -231,6 +233,7 @@ struct llama_rn_context
                 grammar_rules.data(), grammar_rules.size(),
                 parsed_grammar.symbol_ids.at("root"));
         }
+        ctx_sampling = llama_sampling_context_init(params, grammar);
         return true;
     }
 
@@ -271,9 +274,6 @@ struct llama_rn_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
 
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
 
         if (n_past == num_prompt_tokens)
@@ -281,6 +281,9 @@ struct llama_rn_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
             n_past,
             tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
@@ -364,12 +367,12 @@ struct llama_rn_context
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_n_vocab(model));
 
-        result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
+        result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        const int32_t n_probs = params.n_probs;
-        if (params.temp <= 0 && n_probs > 0)
+        const int32_t n_probs = params.sampling_params.n_probs;
+        if (params.sampling_params.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sample_softmax(ctx, &candidates_p);
@@ -443,7 +446,7 @@ struct llama_rn_context
            const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; - if (params.n_probs > 0) + if (params.sampling_params.n_probs > 0) { generated_token_probs.push_back(token_with_probs); } diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp new file mode 100644 index 00000000..8ce41945 --- /dev/null +++ b/cpp/sampling.cpp @@ -0,0 +1,166 @@ +#include "sampling.h" + +llama_sampling_context::~llama_sampling_context() { + for (auto & it : sequence_contexts) { + if (it.second.grammar != NULL) { + llama_grammar_free(it.second.grammar); + it.second.grammar = NULL; + } + } +} + +llama_sampling_context llama_sampling_context_init( + const struct gpt_params & params, + llama_grammar * grammar) { + llama_sampling_context result; + + result.params = params.sampling_params; + result.grammar = grammar; + return result; +} + +// Note: Creates the context if it doesn't exist, so this always return something. +llama_sampler_sequence_context & llama_sampling_get_sequence_context( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq) { + const auto it = ctx_sampling.sequence_contexts.find(seq); + if (it != ctx_sampling.sequence_contexts.end()) { + return it->second; + } + llama_sampler_sequence_context new_ctx = { + 2.0f * ctx_sampling.params.mirostat_tau, + ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL, + }; + return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second; +} + +bool llama_sampling_context_reset( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq) { + const auto it = ctx_sampling.sequence_contexts.find(seq); + if (it == ctx_sampling.sequence_contexts.end()) return false; + if (it->second.grammar != NULL) { + llama_grammar_free(it->second.grammar); + it->second.grammar = NULL; + } + ctx_sampling.sequence_contexts.erase(it); + return true; +} + +llama_token llama_sampling_sample( + struct llama_context * ctx, + struct llama_context * ctx_guidance, + struct llama_sampling_context & ctx_sampling, + const std::vector & last_tokens, + std::vector & candidates, + const int idx, + llama_seq_id seq) { + const int n_ctx = llama_n_ctx(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + const llama_sampling_params & params = ctx_sampling.params; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
+    const float repeat_penalty = params.repeat_penalty;
+    const float alpha_presence = params.presence_penalty;
+    const float alpha_frequency = params.frequency_penalty;
+    const int mirostat = params.mirostat;
+    const float mirostat_tau = params.mirostat_tau;
+    const float mirostat_eta = params.mirostat_eta;
+    const bool penalize_nl = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits_ith(ctx, idx);
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
+
+    if (ctx_seq.grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            const int mirostat_m = 100;
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
+        } else if (mirostat == 2) {
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
+        } else {
+            // Temperature sampling
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
+            llama_sample_temp(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    (void)id; // To avoid a warning that id is unused when logging is disabled.
+ LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); + } + } + + id = llama_sample_token(ctx, &cur_p); + + LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); + } + } + + if (ctx_seq.grammar != NULL) { + llama_grammar_accept_token(ctx, ctx_seq.grammar, id); + } + + return id; +} diff --git a/cpp/sampling.h b/cpp/sampling.h new file mode 100644 index 00000000..0aab5d03 --- /dev/null +++ b/cpp/sampling.h @@ -0,0 +1,108 @@ +#pragma once + +#include "llama.h" + +#include +#include +#include + +// sampling parameters +typedef struct llama_sampling_params { + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + float repeat_penalty = 1.10f; // 1.0 = disabled + int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float frequency_penalty = 0.00f; // 0.0 = disabled + float presence_penalty = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + + bool penalize_nl = true; // consider newlines as a repeatable token + + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // How strong is guidance + + std::unordered_map logit_bias; // logit bias for specific tokens + +} llama_sampling_params; + +// per-sequence sampler context +typedef struct llama_sampler_sequence_context { + float mirostat_mu; // mirostat sampler state + llama_grammar * grammar; +} llama_sampler_sequence_context; + +// general sampler context +typedef struct llama_sampling_context { + ~llama_sampling_context(); + + // parameters that will be used for sampling and when creating + // new llama_sampler_sequence_context instances + llama_sampling_params params; + + // map of sequence ids to sampler contexts + std::unordered_map sequence_contexts; + + // when non-NULL, new instances of llama_sampler_sequence_context + // will get a copy of the grammar here + // note: only the pointer is stored here, it is not a copy of + // the grammar and shouldn't be freed + llama_grammar * grammar; +} llama_sampling_context; + +#include "common.h" + +// Create a new sampling context instance. +llama_sampling_context llama_sampling_context_init( + const struct gpt_params & params, + llama_grammar * grammar = NULL); + +// Fetches the sampler context for the specified sequence id (defaults to 0). +// If the context for that sequence id doesn't already exist, it will be created with +// default values based on the parameters in the ctx_sampling argument. +llama_sampler_sequence_context & llama_sampling_get_sequence_context( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq = 0); + +// Reset the sampler context for the supplied sequence id (defaults to 0). +// This is necessary to reuse a sequence id or free memory used by sequences +// that are no longer required. 
+bool llama_sampling_context_reset(
+        llama_sampling_context & ctx_sampling,
+        const llama_seq_id seq = 0);
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+// Note: When using multiple sequences, it is the caller's responsibility to call
+//       llama_sampling_context_reset when a sequence ends
+//
+// required:
+//  - ctx:          context to use for sampling
+//  - ctx_sampling: sampling-specific context
+//
+// optional:
+//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+//  - last_tokens:  needed for repetition penalty, ignore if empty
+//  - idx:          sample from llama_get_logits_ith(ctx, idx)
+//  - seq:          sequence id to associate sampler state with
+//
+// returns:
+//  - token:      sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sampling_sample(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_sampling_context & ctx_sampling,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        const int idx = 0,
+        llama_seq_id seq = 0);
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index b4ef6817..bcfbed7b 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -8,7 +8,7 @@ PODS:
     - hermes-engine/Pre-built (= 0.72.3)
   - hermes-engine/Pre-built (0.72.3)
   - libevent (2.1.12)
-  - llama-rn (0.2.0):
+  - llama-rn (0.3.0-rc.0):
    - RCT-Folly
    - RCTRequired
    - RCTTypeSafety
@@ -1242,7 +1242,7 @@ SPEC CHECKSUMS:
  glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b
  hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322
  libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913
- llama-rn: 38a0f48bb799df21706bc5552929475114ddf9cb
+ llama-rn: 181274aa4c46da201545cdf45ccf0300e9bc0363
  RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1
  RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18
  RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3
diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm
index a117f121..2ce63f1c 100644
--- a/ios/RNLlamaContext.mm
+++ b/ios/RNLlamaContext.mm
@@ -137,8 +137,6 @@ - (NSDictionary *)completion:(NSDictionary *)params
         llama->params.grammar = [params[@"grammar"] UTF8String];
     }
 
-    if (params[@"temperature"]) llama->params.temp = [params[@"temperature"] doubleValue];
-
     if (params[@"n_threads"]) {
         int nThreads = params[@"n_threads"] ? [params[@"n_threads"] intValue] : llama->params.n_threads;
         const int maxThreads = (int) [[NSProcessInfo processInfo] processorCount];
@@ -147,22 +145,27 @@ - (NSDictionary *)completion:(NSDictionary *)params
         llama->params.n_threads = nThreads > 0 ? nThreads : defaultNThreads;
     }
     if (params[@"n_predict"]) llama->params.n_predict = [params[@"n_predict"] intValue];
 
-    if (params[@"n_probs"]) llama->params.n_probs = [params[@"n_probs"] intValue];
-
-    if (params[@"repeat_last_n"]) llama->params.repeat_last_n = [params[@"repeat_last_n"] intValue];
-    if (params[@"repeat_penalty"]) llama->params.repeat_penalty = [params[@"repeat_penalty"] doubleValue];
-    if (params[@"presence_penalty"]) llama->params.presence_penalty = [params[@"presence_penalty"] doubleValue];
-    if (params[@"frequency_penalty"]) llama->params.frequency_penalty = [params[@"frequency_penalty"] doubleValue];
+    auto & sparams = llama->params.sampling_params;
+
+    if (params[@"temperature"]) sparams.temp = [params[@"temperature"] doubleValue];
+
+    if (params[@"n_probs"]) sparams.n_probs = [params[@"n_probs"] intValue];
+
+    if (params[@"repeat_last_n"]) sparams.repeat_last_n = [params[@"repeat_last_n"] intValue];
+    if (params[@"repeat_penalty"]) sparams.repeat_penalty = [params[@"repeat_penalty"] doubleValue];
+    if (params[@"presence_penalty"]) sparams.presence_penalty = [params[@"presence_penalty"] doubleValue];
+    if (params[@"frequency_penalty"]) sparams.frequency_penalty = [params[@"frequency_penalty"] doubleValue];
 
-    if (params[@"mirostat"]) llama->params.mirostat = [params[@"mirostat"] intValue];
-    if (params[@"mirostat_tau"]) llama->params.mirostat_tau = [params[@"mirostat_tau"] doubleValue];
-    if (params[@"mirostat_eta"]) llama->params.mirostat_eta = [params[@"mirostat_eta"] doubleValue];
+    if (params[@"mirostat"]) sparams.mirostat = [params[@"mirostat"] intValue];
+    if (params[@"mirostat_tau"]) sparams.mirostat_tau = [params[@"mirostat_tau"] doubleValue];
+    if (params[@"mirostat_eta"]) sparams.mirostat_eta = [params[@"mirostat_eta"] doubleValue];
 
-    if (params[@"top_k"]) llama->params.top_k = [params[@"top_k"] intValue];
-    if (params[@"top_p"]) llama->params.top_p = [params[@"top_p"] doubleValue];
-    if (params[@"tfs_z"]) llama->params.tfs_z = [params[@"tfs_z"] doubleValue];
+    if (params[@"top_k"]) sparams.top_k = [params[@"top_k"] intValue];
+    if (params[@"top_p"]) sparams.top_p = [params[@"top_p"] doubleValue];
+    if (params[@"tfs_z"]) sparams.tfs_z = [params[@"tfs_z"] doubleValue];
 
-    if (params[@"typical_p"]) llama->params.typical_p = [params[@"typical_p"] doubleValue];
+    if (params[@"typical_p"]) sparams.typical_p = [params[@"typical_p"] doubleValue];
 
     llama->params.antiprompt.clear();
     if (params[@"stop"]) {
@@ -172,9 +175,9 @@ - (NSDictionary *)completion:(NSDictionary *)params
         }
     }
 
-    llama->params.logit_bias.clear();
+    sparams.logit_bias.clear();
     if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) {
-        llama->params.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY;
     }
 
     if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) {
@@ -185,9 +188,9 @@ - (NSDictionary *)completion:(NSDictionary *)params
                 llama_token tok = [el[0] intValue];
                 if (tok >= 0 && tok < n_vocab) {
                     if ([el[1] isKindOfClass:[NSNumber class]]) {
-                        llama->params.logit_bias[tok] = [el[1] doubleValue];
+                        sparams.logit_bias[tok] = [el[1] doubleValue];
                     } else if ([el[1] isKindOfClass:[NSNumber class]] && ![el[1] boolValue]) {
-                        llama->params.logit_bias[tok] = -INFINITY;
+                        sparams.logit_bias[tok] = -INFINITY;
                     }
                 }
             }
@@ -243,7 +246,7 @@ - (NSDictionary *)completion:(NSDictionary *)params
             NSMutableDictionary *tokenResult = [[NSMutableDictionary alloc] init];
             tokenResult[@"token"] = [NSString stringWithUTF8String:to_send.c_str()];
 
-            if (llama->params.n_probs > 0) {
+            if (llama->params.sampling_params.n_probs > 0) {
                 const std::vector<llama_token> to_send_toks = llama_tokenize(llama->ctx, to_send, false);
                 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
                 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
diff --git a/llama.cpp b/llama.cpp
index 9f6ede19..1e0e873c 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 9f6ede19f3cfa50d4a51a5babb056c3f8a450b80
+Subproject commit 1e0e873c373c33989beb6bc64d83cd572ab7fe2b
diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh
index c8b4ccf9..5c7b98a6 100755
--- a/scripts/bootstrap.sh
+++ b/scripts/bootstrap.sh
@@ -27,6 +27,8 @@ cp ./llama.cpp/common/common.h ./cpp/common.h
 cp ./llama.cpp/common/common.cpp ./cpp/common.cpp
 cp ./llama.cpp/common/grammar-parser.h ./cpp/grammar-parser.h
 cp ./llama.cpp/common/grammar-parser.cpp ./cpp/grammar-parser.cpp
+cp ./llama.cpp/common/sampling.h ./cpp/sampling.h
+cp ./llama.cpp/common/sampling.cpp ./cpp/sampling.cpp
 
 # List of files to process
 files=(