Support ChatGLM4 conversation mode (#303)
li-plus authored Jun 13, 2024
1 parent 5f584ce commit 598b38e
Showing 20 changed files with 826 additions and 106 deletions.
6 changes: 6 additions & 0 deletions .gitmodules
@@ -8,3 +8,9 @@
[submodule "third_party/ggml"]
path = third_party/ggml
url = https://github.com/ggerganov/ggml.git
[submodule "third_party/re2"]
path = third_party/re2
url = https://github.com/google/re2.git
[submodule "third_party/abseil-cpp"]
path = third_party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
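
The new re2 and abseil-cpp dependencies are plain git submodules, so a fresh clone needs them fetched before CMake can configure. A minimal sketch using standard git commands:

```sh
# fetch re2, abseil-cpp, and the other submodules declared in .gitmodules
git submodule update --init --recursive
```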
27 changes: 20 additions & 7 deletions CMakeLists.txt
@@ -13,8 +13,17 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()

option(CHATGLM_ENABLE_EXAMPLES "chatglm: enable c++ examples" ON)
option(CHATGLM_ENABLE_PYBIND "chatglm: enable python binding" OFF)
option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)

if (CHATGLM_ENABLE_PYBIND)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif ()

# third-party libraries
-add_compile_definitions(GGML_CUDA_MMV_Y=2) # for large vocab
+add_compile_definitions(GGML_CUDA_MMV_Y=4) # for large vocab
include_directories(third_party/ggml/include/ggml third_party/ggml/src)
add_subdirectory(third_party/ggml)

@@ -41,6 +50,14 @@ if (GGML_CUBLAS)
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES})
endif ()

include_directories(third_party/sentencepiece/third_party/protobuf-lite)

set(ABSL_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
add_subdirectory(third_party/abseil-cpp)

add_subdirectory(third_party/re2)

if (GGML_METAL)
add_compile_definitions(GGML_USE_METAL)
configure_file(third_party/ggml/src/ggml-metal.metal ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@@ -59,10 +76,9 @@ file(GLOB CPP_SOURCES
set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-errors")

add_library(chatglm STATIC chatglm.cpp)
-target_link_libraries(chatglm PUBLIC ggml sentencepiece-static)
+target_link_libraries(chatglm PUBLIC ggml sentencepiece-static re2)

# c++ examples
-option(CHATGLM_ENABLE_EXAMPLES "chatglm: enable c++ examples" ON)
if (CHATGLM_ENABLE_EXAMPLES)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)
@@ -76,7 +92,6 @@ endif ()
endif ()

# GoogleTest
-option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
if (CHATGLM_ENABLE_TESTING)
enable_testing()

@@ -98,9 +113,7 @@ if (CHATGLM_ENABLE_TESTING)
gtest_discover_tests(chatglm_test)
endif ()

-option(CHATGLM_ENABLE_PYBIND "chatglm: enable python binding" OFF)
if (CHATGLM_ENABLE_PYBIND)
-set_target_properties(chatglm ggml sentencepiece-static PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_subdirectory(third_party/pybind11)
pybind11_add_module(_C chatglm_pybind.cpp)
target_link_libraries(_C PRIVATE chatglm)
@@ -119,7 +132,7 @@ add_custom_target(lint
COMMAND black ${PY_SOURCES} --verbose)

# check all
-add_custom_target(check
+add_custom_target(check-all
COMMAND cmake --build build -j
COMMAND ./build/bin/chatglm_test
COMMAND python3 setup.py develop
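With the three options now declared near the top of CMakeLists.txt and the aggregate target renamed to `check-all`, a typical configure/build/check cycle might look like the sketch below; the flags come from the diff above, while generator choice and parallelism are assumptions:

```sh
# configure with examples and tests enabled; CHATGLM_ENABLE_PYBIND=ON would
# additionally force static, position-independent builds per the diff above
cmake -B build -DCHATGLM_ENABLE_EXAMPLES=ON -DCHATGLM_ENABLE_TESTING=ON
cmake --build build -j
# run the renamed aggregate target (formerly `check`)
cmake --build build --target check-all
```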
6 changes: 6 additions & 0 deletions MANIFEST.in
@@ -14,3 +14,9 @@ graft third_party/pybind11/tools
graft third_party/sentencepiece/src
graft third_party/sentencepiece/third_party
include third_party/sentencepiece/*

# re2
graft third_party/re2

# absl
graft third_party/abseil-cpp
40 changes: 36 additions & 4 deletions README.md
@@ -6,7 +6,7 @@
![Python](https://img.shields.io/pypi/pyversions/chatglm-cpp)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE)

-C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3-6B](https://github.com/THUDM/ChatGLM3) and more LLMs for real-time chatting on your MacBook.
+C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4) and more LLMs for real-time chatting on your MacBook.

![demo](docs/demo.gif)

@@ -22,7 +22,7 @@ Highlights:
Support Matrix:
* Hardware: x86/arm CPU, NVIDIA GPU, Apple Silicon GPU
* Platforms: Linux, macOS, Windows
-* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3-6B](https://github.com/THUDM/ChatGLM3), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2), [InternLM](https://github.com/InternLM/InternLM)
+* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2), [InternLM](https://github.com/InternLM/InternLM)

**NOTE**: Baichuan & InternLM model series are deprecated in favor of [llama.cpp](https://github.com/ggerganov/llama.cpp).

@@ -57,6 +57,7 @@ The original model (`-i <model_name_or_path>`) can be a Hugging Face model name
* ChatGLM-6B: `THUDM/chatglm-6b`, `THUDM/chatglm-6b-int8`, `THUDM/chatglm-6b-int4`
* ChatGLM2-6B: `THUDM/chatglm2-6b`, `THUDM/chatglm2-6b-int4`
* ChatGLM3-6B: `THUDM/chatglm3-6b`
* ChatGLM4-9B: `THUDM/glm-4-9b-chat`
* CodeGeeX2: `THUDM/codegeex2-6b`, `THUDM/codegeex2-6b-int4`
* Baichuan & Baichuan2: `baichuan-inc/Baichuan-13B-Chat`, `baichuan-inc/Baichuan2-7B-Chat`, `baichuan-inc/Baichuan2-13B-Chat`

@@ -176,6 +177,18 @@ $$

</details>

<details open>
<summary>ChatGLM4-9B</summary>

Chat mode:
```sh
python3 chatglm_cpp/convert.py -i THUDM/glm-4-9b-chat -t q4_0 -o models/chatglm4-ggml.bin
./build/bin/main -m models/chatglm4-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
# 你好👋!有什么可以帮助你的吗?
```

</details>

<details>
<summary>CodeGeeX2</summary>

@@ -390,6 +403,15 @@ streamlit run chatglm3_demo.py
</details>
<details open>
<summary>ChatGLM4-9B</summary>

Chat mode:
```sh
python3 cli_demo.py -m ../models/chatglm4-ggml.bin -p 你好 --temp 0.8 --top_p 0.8
```

</details>
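
For multi-turn chat from the terminal, the same script presumably supports interactive mode as in the other model sections; the `-i` flag below is an assumption carried over from those examples:

```sh
# interactive multi-turn chat with the converted ChatGLM4 model (assumed -i flag)
python3 cli_demo.py -m ../models/chatglm4-ggml.bin -i --temp 0.8 --top_p 0.8
```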
<details>
<summary>CodeGeeX2</summary>
@@ -607,6 +629,14 @@ ChatGLM2-6B / ChatGLM3-6B / CodeGeeX2:
| file size | 3.3G | 3.7G | 4.0G | 4.4G | 6.2G | 12G |
| mem usage | 3.4G | 3.8G | 4.1G | 4.5G | 6.2G | 12G |

ChatGLM4-9B:

| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|--------------------------------|-------|-------|-------|-------|-------|-------|
| ms/token (CPU @ Platinum 8260) | 105 | 105 | 122 | 134 | 158 | 279 |
| ms/token (CUDA @ V100 SXM2) | 12.1 | 12.5 | 13.8 | 13.9 | 17.7 | 27.7 |
| file size | 5.0G | 5.5G | 6.1G | 6.6G | 9.4G | 18G |

Baichuan-7B / Baichuan2-7B:

| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
@@ -647,14 +677,16 @@ We measure model quality by evaluating the perplexity over the WikiText-2 test dataset.
Download and unzip the dataset from [link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip). Measure the perplexity with a stride of 512 and max input length of 2048:
```sh
-./build/bin/perplexity -m models/chatglm3-ggml.bin -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
+./build/bin/perplexity -m models/chatglm3-base-ggml.bin -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
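
The ChatGLM4-9B-Base row below can presumably be reproduced the same way from the base (non-chat) GLM-4 checkpoint; a sketch, where the output filename `chatglm4-base-ggml.bin` is an assumption:

```sh
# convert the base checkpoint, then score it with the same stride and max length
python3 chatglm_cpp/convert.py -i THUDM/glm-4-9b -t q4_0 -o models/chatglm4-base-ggml.bin
./build/bin/perplexity -m models/chatglm4-base-ggml.bin -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```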
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
| [ChatGLM3-6B-Base][1] | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
| [ChatGLM4-9B-Base][2] | 6.851 | 6.793 | 6.652 | 6.635 | 6.582 | 6.586 |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
[2]: https://huggingface.co/THUDM/glm-4-9b
## Development
@@ -687,4 +719,4 @@ This will print timing for each graph operation when running the model.
## Acknowledgements
* This project is greatly inspired by [@ggerganov](https://github.com/ggerganov)'s [llama.cpp](https://github.com/ggerganov/llama.cpp) and is based on his NN library [ggml](https://github.com/ggerganov/ggml).
-* Thank [@THUDM](https://github.com/THUDM) for the amazing [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B) and [ChatGLM3-6B](https://github.com/THUDM/ChatGLM3) and for releasing the model sources and checkpoints.
+* Thank [@THUDM](https://github.com/THUDM) for the amazing [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3) and [GLM-4](https://github.com/THUDM/GLM-4) and for releasing the model sources and checkpoints.
