Commit

Add perplexity example (#210)
li-plus authored Jan 20, 2024
1 parent 3286db5 commit 23442b0
Showing 11 changed files with 284 additions and 50 deletions.
18 changes: 15 additions & 3 deletions CMakeLists.txt
@@ -38,6 +38,8 @@ if (GGML_PERF)
add_compile_definitions(GGML_PERF)
endif ()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB CPP_SOURCES
${PROJECT_SOURCE_DIR}/*.h
${PROJECT_SOURCE_DIR}/*.cpp)
@@ -47,8 +49,19 @@ set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-e
add_library(chatglm STATIC chatglm.cpp)
target_link_libraries(chatglm PUBLIC ggml sentencepiece-static)

add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)
# c++ examples
option(CHATGLM_ENABLE_EXAMPLES "chatglm: enable c++ examples" ON)
if (CHATGLM_ENABLE_EXAMPLES)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(CHATGLM_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE chatglm ${CHATGLM_OPENMP_TARGET})
endif ()

# GoogleTest
option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
@@ -75,7 +88,6 @@ endif ()

option(CHATGLM_ENABLE_PYBIND "chatglm: enable python binding" OFF)
if (CHATGLM_ENABLE_PYBIND)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(chatglm ggml sentencepiece-static PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_subdirectory(third_party/pybind11)
pybind11_add_module(_C chatglm_pybind.cpp)
17 changes: 17 additions & 0 deletions README.md
@@ -272,6 +272,8 @@ cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="80" # for A100
cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4
```
To find out the CUDA architecture of your GPU device, see [Matching CUDA arch and CUDA gencode for various NVIDIA architectures](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/).
**Metal**
MPS (Metal Performance Shaders) allows computation to run on powerful Apple Silicon GPUs. Add the CMake flag `-DGGML_METAL=ON` to enable it.
@@ -628,6 +630,21 @@ InternLM-20B:
| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A |
| ms/token (CUDA @ V100 SXM2) | 21.6 | 23.2 | 25.0 | 25.9 | 33.4 | N/A |
## Model Quality
We measure model quality by evaluating perplexity on the WikiText-2 test dataset, following the strided sliding-window strategy described in https://huggingface.co/docs/transformers/perplexity. Lower perplexity usually indicates a better model; a minimal sketch of the evaluation loop is given after the results table below.
Download and unzip the dataset from [this link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip). Then measure the perplexity with a stride of 512 and a max input length of 2048:
```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
| [ChatGLM3-6B-Base][1] | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
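For reference, here is a minimal sketch of the strided sliding-window evaluation described above. It is not the code from `tests/perplexity.cpp` added by this commit (that file's diff is not shown on this page); the `eval_logits` callback and all names below are hypothetical stand-ins for any model that returns per-position logits.

```cpp
#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

// Hypothetical callback: given a token window, return one vocab-sized row of
// logits per position, where row i predicts window[i + 1].
using EvalLogits = std::function<std::vector<std::vector<float>>(const std::vector<int> &)>;

static double neg_log_prob(const std::vector<float> &logits, int target) {
    // -log softmax(logits)[target], computed in a numerically stable way
    float max_logit = *std::max_element(logits.begin(), logits.end());
    double sum_exp = 0.0;
    for (float v : logits) sum_exp += std::exp(v - max_logit);
    return -(logits[target] - max_logit - std::log(sum_exp));
}

double strided_perplexity(const std::vector<int> &tokens, size_t max_length, size_t stride,
                          const EvalLogits &eval_logits) {
    double total_nll = 0.0;
    size_t num_scored = 0;
    size_t prev_end = 0;
    for (size_t begin = 0; begin < tokens.size(); begin += stride) {
        size_t end = std::min(begin + max_length, tokens.size());
        // score only tokens not already scored by the previous window (and never the
        // first token of a window, which has no preceding context)
        size_t target_len = end - std::max(prev_end, begin + 1);
        std::vector<int> window(tokens.begin() + begin, tokens.begin() + end);
        auto logits = eval_logits(window);
        for (size_t i = window.size() - target_len - 1; i + 1 < window.size(); i++) {
            total_nll += neg_log_prob(logits[i], window[i + 1]);
            num_scored++;
        }
        prev_end = end;
        if (end == tokens.size()) break;
    }
    return std::exp(total_nll / num_scored); // perplexity = exp(mean NLL)
}
```

With `-s 512 -l 2048` as in the command above, each window after the first reuses 1536 tokens purely as context and contributes only its final 512 tokens to the score.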
## Development
**Unit Test & Benchmark**
33 changes: 22 additions & 11 deletions chatglm.cpp
@@ -81,15 +81,20 @@ std::string to_string(ggml_tensor *tensor, bool with_data) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
auto ptr = (char *)tensor->data + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] +
i0 * tensor->nb[0];
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
oss << (i0 > 0 ? ", " : "");
if (tensor->type == GGML_TYPE_I32) {
oss << *(int *)ptr;
} else {
CHATGLM_THROW << "unimplemented";
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
} else {
CHATGLM_THROW << "unimplemented";
}
oss << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << (i0 > 0 ? ", " : "") << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << "]";
}
@@ -496,12 +501,11 @@ BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config, size_t mem_size,
#endif
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,
int n_threads, bool is_decoding) {
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
ctx_.gf = {};

int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -513,7 +517,7 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx);
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;

ggml_build_forward_expand(&ctx_.gf, lm_logits);
@@ -527,6 +531,13 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_graph_print(&ctx_.gf);
#endif

return lm_logits;
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);

int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

23 changes: 14 additions & 9 deletions chatglm.h
@@ -855,7 +855,11 @@ class BaseModelForCausalLM {
virtual ~BaseModelForCausalLM() = default;

virtual void load(ModelLoader &loader) = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const = 0;

ggml_tensor *forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx, int n_threads,
bool is_decoding);

std::vector<int> generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
BaseStreamer *streamer = nullptr);
@@ -896,10 +900,11 @@ class BasicModelForCausalLM : public BaseModelForCausalLM {
~BasicModelForCausalLM() { to_cpu(); }

public:
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
// NOTE: only compute next token logits for decoding
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1011,7 +1016,7 @@ class ChatGLMForCausalLM : public BasicModelForCausalLM<ChatGLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1024 * MB; // 2k context
};

@@ -1061,7 +1066,7 @@ class ChatGLM2ForCausalLM : public BasicModelForCausalLM<ChatGLM2Model> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
};

@@ -1161,7 +1166,7 @@ class Baichuan7BForCausalLM : public BasicModelForCausalLM<Baichuan7BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1187,7 +1192,7 @@ class Baichuan13BForCausalLM : public BasicModelForCausalLM<Baichuan13BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1248,7 +1253,7 @@ class InternLMForCausalLM : public BasicModelForCausalLM<InternLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

2 changes: 1 addition & 1 deletion chatglm_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
import chatglm_cpp._C as _C
from chatglm_cpp._C import ChatMessage

__version__ = "0.3.1.dev"
__version__ = "0.3.1"


@dataclass
7 changes: 5 additions & 2 deletions chatglm_pybind.cpp
@@ -27,8 +27,11 @@ class PyBaseModelForCausalLM : public BaseModelForCausalLM {
using BaseModelForCausalLM::BaseModelForCausalLM;

void load(ModelLoader &loader) override { PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load, loader); }
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx)

ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx,
is_decoding)
}
};

45 changes: 23 additions & 22 deletions main.cpp
@@ -40,28 +40,29 @@ struct Args {
};

static void usage(const std::string &prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH model path (default: chatglm-ggml.bin)\n"
<< " --mode inference mode chosen from {chat, generate} (default: chat)\n"
<< " --sync synchronized generation without streaming\n"
<< " -p, --prompt PROMPT prompt to start generation with (default: 你好)\n"
<< " --pp, --prompt_path path to the plain text file that stores the prompt\n"
<< " -s, --system SYSTEM system message to set the behavior of the assistant\n"
<< " --sp, --system_path path to the plain text file that stores the system message\n"
<< " -i, --interactive run in interactive mode\n"
<< " -l, --max_length N max total length including prompt and output (default: 2048)\n"
<< " --max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens\n"
<< " -c, --max_context_length N\n"
<< " max context length (default: 512)\n"
<< " --top_k N top-k sampling (default: 0)\n"
<< " --top_p N top-p sampling (default: 0.7)\n"
<< " --temp N temperature (default: 0.95)\n"
<< " --repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)\n"
<< " -t, --threads N number of threads for inference\n"
<< " -v, --verbose display verbose output including config/system/performance info\n";
std::cout << "Usage: " << prog << R"( [options]
options:
-h, --help show this help message and exit
-m, --model PATH model path (default: chatglm-ggml.bin)
--mode inference mode chosen from {chat, generate} (default: chat)
--sync synchronized generation without streaming
-p, --prompt PROMPT prompt to start generation with (default: 你好)
--pp, --prompt_path path to the plain text file that stores the prompt
-s, --system SYSTEM system message to set the behavior of the assistant
--sp, --system_path path to the plain text file that stores the system message
-i, --interactive run in interactive mode
-l, --max_length N max total length including prompt and output (default: 2048)
--max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens
-c, --max_context_length N
max context length (default: 512)
--top_k N top-k sampling (default: 0)
--top_p N top-p sampling (default: 0.7)
--temp N temperature (default: 0.95)
--repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
-t, --threads N number of threads for inference
-v, --verbose display verbose output including config/system/performance info
)";
}

static std::string read_text(std::string path) {
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -13,10 +13,10 @@ authors = [
maintainers = [
{name = "Jiahao Li", email = "[email protected]"},
]
description = "C++ implementation of ChatGLM-6B & ChatGLM2-6B"
description = "C++ implementation of ChatGLM family models and more LLMs"
readme = "README.md"
requires-python = ">=3.7"
keywords = ["ChatGLM", "ChatGLM2", "Large Language Model"]
keywords = ["ChatGLM", "ChatGLM2", "ChatGLM3", "Large Language Model"]
license = {text = "MIT License"}
classifiers = [
"Development Status :: 3 - Alpha",
@@ -27,6 +27,7 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dynamic = ["version"]

1 change: 1 addition & 0 deletions setup.py
@@ -51,6 +51,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DCHATGLM_ENABLE_PYBIND=ON",
f"-DCHATGLM_ENABLE_EXAMPLES=OFF",
f"-DBUILD_SHARED_LIBS=OFF",
]
build_args = []