
Commit

Add perplexity evaluation script
li-plus committed Dec 3, 2023
1 parent 8b17a67 commit 32ef183
Showing 8 changed files with 289 additions and 44 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -50,6 +50,13 @@ target_link_libraries(chatglm PUBLIC ggml sentencepiece-static)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(CHATGLM_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE chatglm ${CHATGLM_OPENMP_TARGET})

# GoogleTest
option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
if (CHATGLM_ENABLE_TESTING)
21 changes: 21 additions & 0 deletions README.md
@@ -628,6 +628,27 @@ InternLM-20B:
| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A |
| ms/token (CUDA @ V100 SXM2) | 21.6 | 23.2 | 25.0 | 25.9 | 33.4 | N/A |
## Model Quality
We measure model quality by evaluating the perplexity over the WikiText-2 test dataset, following the strided sliding window strategy in https://huggingface.co/docs/transformers/perplexity. Lower perplexity usually indicates a better model.
Download and unzip the dataset ([link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip)). Measure the perplexity with a stride of 512 and max input length of 2048:
```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
| [ChatGLM3-6B-Base][1] | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
| [Baichuan2-7B-Base][2]  |       |       |       |       |       |       |
| [Baichuan2-13B-Base][3] |       |       |       |       |       |       |
| [InternLM-7B][4]        |       |       |       |       |       |       |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
[2]: https://huggingface.co/baichuan-inc/Baichuan2-7B-Base
[3]: https://huggingface.co/baichuan-inc/Baichuan2-13B-Base
[4]: https://huggingface.co/internlm/internlm-7b
## Development
**Unit Test & Benchmark**
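The README hunk above references the strided sliding-window strategy from the Hugging Face perplexity guide. As a rough illustration of that strategy (not the contents of `tests/perplexity.cpp`, whose diff is not loaded on this page), the sketch below scores each token with as much left context as the window allows; the `LogitsFn` callback and its flattened `[window_size x vocab_size]` logits layout are assumptions made for this example.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical callback: given a window of token ids, return the logits for every
// position, flattened row-major as [window_size x vocab_size].
using LogitsFn = std::function<std::vector<float>(const std::vector<int> &)>;

// Strided sliding-window perplexity: slide a max_length window over the token
// stream with step `stride`, and in each window score only the tokens the previous
// window did not cover, so every scored token keeps maximal left context.
static double sliding_window_perplexity(const std::vector<int> &tokens, const LogitsFn &logits_fn,
                                        size_t max_length, size_t stride, size_t vocab_size) {
    double total_nll = 0.0;
    size_t num_scored = 0;
    size_t prev_end = 0;

    for (size_t begin = 0; begin < tokens.size(); begin += stride) {
        const size_t end = std::min(begin + max_length, tokens.size());
        const std::vector<int> window(tokens.begin() + begin, tokens.begin() + end);
        const std::vector<float> logits = logits_fn(window); // window.size() * vocab_size floats

        // Score tokens in [prev_end, end); the very first token has no context to predict it.
        size_t first_scored = window.size() - (end - prev_end);
        first_scored = std::max<size_t>(first_scored, 1);
        for (size_t i = first_scored; i < window.size(); i++) {
            const float *row = &logits[(i - 1) * vocab_size]; // logits that predict window[i]
            // negative log-softmax of the target token
            const float max_logit = *std::max_element(row, row + vocab_size);
            double sum_exp = 0.0;
            for (size_t v = 0; v < vocab_size; v++) {
                sum_exp += std::exp(row[v] - max_logit);
            }
            total_nll -= (row[window[i]] - max_logit) - std::log(sum_exp);
            num_scored++;
        }

        prev_end = end;
        if (end == tokens.size()) {
            break;
        }
    }
    return std::exp(total_nll / num_scored);
}
```

With `-s 512 -l 2048`, every scored token after the first window is conditioned on at least 1536 preceding tokens; a smaller stride raises the average context (and typically lowers the reported perplexity) at the cost of more forward passes.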
33 changes: 22 additions & 11 deletions chatglm.cpp
@@ -81,15 +81,20 @@ std::string to_string(ggml_tensor *tensor, bool with_data) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
auto ptr = (char *)tensor->data + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] +
i0 * tensor->nb[0];
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
oss << (i0 > 0 ? ", " : "");
if (tensor->type == GGML_TYPE_I32) {
oss << *(int *)ptr;
} else {
CHATGLM_THROW << "unimplemented";
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
} else {
CHATGLM_THROW << "unimplemented";
}
oss << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << (i0 > 0 ? ", " : "") << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << "]";
}
@@ -496,12 +501,11 @@ BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config, size_t mem_size,
#endif
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,
int n_threads, bool is_decoding) {
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
ctx_.gf = {};

int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -513,7 +517,7 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx);
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;

ggml_build_forward_expand(&ctx_.gf, lm_logits);
@@ -527,6 +531,13 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_graph_print(&ctx_.gf);
#endif

return lm_logits;
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);

int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

23 changes: 14 additions & 9 deletions chatglm.h
@@ -855,7 +855,11 @@ class BaseModelForCausalLM {
virtual ~BaseModelForCausalLM() = default;

virtual void load(ModelLoader &loader) = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const = 0;

ggml_tensor *forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx, int n_threads,
bool is_decoding);

std::vector<int> generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
BaseStreamer *streamer = nullptr);
@@ -896,10 +900,11 @@ class BasicModelForCausalLM : public BaseModelForCausalLM {
~BasicModelForCausalLM() { to_cpu(); }

public:
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
// NOTE: only compute next token logits for decoding
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1011,7 +1016,7 @@ class ChatGLMForCausalLM : public BasicModelForCausalLM<ChatGLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1024 * MB; // 2k context
};

@@ -1061,7 +1066,7 @@ class ChatGLM2ForCausalLM : public BasicModelForCausalLM<ChatGLM2Model> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
};

@@ -1161,7 +1166,7 @@ class Baichuan7BForCausalLM : public BasicModelForCausalLM<Baichuan7BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1187,7 +1192,7 @@ class Baichuan13BForCausalLM : public BasicModelForCausalLM<Baichuan13BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1248,7 +1253,7 @@ class InternLMForCausalLM : public BasicModelForCausalLM<InternLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

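The new `is_decoding` flag and `forward_graph_compute()` declared in this header are what make full-sequence scoring possible: when `is_decoding` is false, `BasicModelForCausalLM::forward()` skips the view that keeps only the last position, so logits for every input position survive. A hypothetical caller could look like the sketch below; the real `tests/perplexity.cpp` is not loaded on this page, and the argument choices (`n_past = 0`, `n_threads = 0` for the library default) plus the `ne[1]` interpretation are assumptions for illustration.

```cpp
#include "chatglm.h"

#include <vector>

// Sketch: run one window of tokens through the model and keep per-position logits.
// Assumes `model` has already been loaded from a converted GGML file.
void score_window(chatglm::BaseModelForCausalLM &model, const std::vector<int> &window) {
    // is_decoding = false -> logits for every input position are kept;
    // generate_next_token() passes true and only needs the last position's row.
    ggml_tensor *lm_logits = model.forward_graph_compute(window, /*n_past=*/0,
                                                         /*n_ctx=*/(int)window.size(),
                                                         /*n_threads=*/0, /*is_decoding=*/false);

    const int vocab_size = lm_logits->ne[0];    // same convention as generate_next_token()
    const int num_positions = lm_logits->ne[1]; // one row of logits per input token
    const float *logits = (const float *)lm_logits->data;

    // ... accumulate the negative log-likelihood of window[i] under row i - 1 of `logits` ...
    (void)vocab_size;
    (void)num_positions;
    (void)logits;
}
```

The MEM_SIZE bumps in the same header (512 MB to 1280 MB for several models) are consistent with this change: keeping logits for a full context window presumably needs a larger compute buffer than decoding one token at a time.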
7 changes: 5 additions & 2 deletions chatglm_pybind.cpp
@@ -27,8 +27,11 @@ class PyBaseModelForCausalLM : public BaseModelForCausalLM {
using BaseModelForCausalLM::BaseModelForCausalLM;

void load(ModelLoader &loader) override { PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load, loader); }
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx)

ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx,
is_decoding)
}
};

45 changes: 23 additions & 22 deletions main.cpp
@@ -40,28 +40,29 @@ struct Args {
};

static void usage(const std::string &prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH model path (default: chatglm-ggml.bin)\n"
<< " --mode inference mode chosen from {chat, generate} (default: chat)\n"
<< " --sync synchronized generation without streaming\n"
<< " -p, --prompt PROMPT prompt to start generation with (default: 你好)\n"
<< " --pp, --prompt_path path to the plain text file that stores the prompt\n"
<< " -s, --system SYSTEM system message to set the behavior of the assistant\n"
<< " --sp, --system_path path to the plain text file that stores the system message\n"
<< " -i, --interactive run in interactive mode\n"
<< " -l, --max_length N max total length including prompt and output (default: 2048)\n"
<< " --max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens\n"
<< " -c, --max_context_length N\n"
<< " max context length (default: 512)\n"
<< " --top_k N top-k sampling (default: 0)\n"
<< " --top_p N top-p sampling (default: 0.7)\n"
<< " --temp N temperature (default: 0.95)\n"
<< " --repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)\n"
<< " -t, --threads N number of threads for inference\n"
<< " -v, --verbose display verbose output including config/system/performance info\n";
std::cout << "Usage: " << prog << R"( [options]
options:
-h, --help show this help message and exit
-m, --model PATH model path (default: chatglm-ggml.bin)
--mode inference mode chosen from {chat, generate} (default: chat)
--sync synchronized generation without streaming
-p, --prompt PROMPT prompt to start generation with (default: 你好)
--pp, --prompt_path path to the plain text file that stores the prompt
-s, --system SYSTEM system message to set the behavior of the assistant
--sp, --system_path path to the plain text file that stores the system message
-i, --interactive run in interactive mode
-l, --max_length N max total length including prompt and output (default: 2048)
--max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens
-c, --max_context_length N
max context length (default: 512)
--top_k N top-k sampling (default: 0)
--top_p N top-p sampling (default: 0.7)
--temp N temperature (default: 0.95)
--repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
-t, --threads N number of threads for inference
-v, --verbose display verbose output including config/system/performance info
)";
}

static std::string read_text(std::string path) {
(Diffs for the remaining changed files, including tests/perplexity.cpp, were not loaded on this page.)
