Commit

Add perplexity example (#210)
li-plus authored Jan 20, 2024
1 parent 3286db5 commit 23442b0
Showing 11 changed files with 284 additions and 50 deletions.
18 changes: 15 additions & 3 deletions CMakeLists.txt
@@ -38,6 +38,8 @@ if (GGML_PERF)
add_compile_definitions(GGML_PERF)
endif ()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB CPP_SOURCES
${PROJECT_SOURCE_DIR}/*.h
${PROJECT_SOURCE_DIR}/*.cpp)
@@ -47,8 +49,19 @@ set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-e
add_library(chatglm STATIC chatglm.cpp)
target_link_libraries(chatglm PUBLIC ggml sentencepiece-static)

add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)
# c++ examples
option(CHATGLM_ENABLE_EXAMPLES "chatglm: enable c++ examples" ON)
if (CHATGLM_ENABLE_EXAMPLES)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(CHATGLM_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE chatglm ${CHATGLM_OPENMP_TARGET})
endif ()

# GoogleTest
option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
@@ -75,7 +88,6 @@ endif ()

option(CHATGLM_ENABLE_PYBIND "chatglm: enable python binding" OFF)
if (CHATGLM_ENABLE_PYBIND)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(chatglm ggml sentencepiece-static PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_subdirectory(third_party/pybind11)
pybind11_add_module(_C chatglm_pybind.cpp)
17 changes: 17 additions & 0 deletions README.md
@@ -272,6 +272,8 @@ cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="80" # for A100
cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="70;75" # compatible with both V100 and T4
```
To find out the CUDA architecture of your GPU device, see [Matching CUDA arch and CUDA gencode for various NVIDIA architectures](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/).
**Metal**
MPS (Metal Performance Shaders) allows computation to run on powerful Apple Silicon GPUs. Add the CMake flag `-DGGML_METAL=ON` to enable it.
@@ -628,6 +630,21 @@ InternLM-20B:
| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A |
| ms/token (CUDA @ V100 SXM2) | 21.6 | 23.2 | 25.0 | 25.9 | 33.4 | N/A |
## Model Quality
We measure model quality by evaluating perplexity on the WikiText-2 test dataset, following the strided sliding-window strategy described in https://huggingface.co/docs/transformers/perplexity. Lower perplexity usually indicates a better model; a minimal sketch of the evaluation loop is given after the results table below.
Download and unzip the dataset from [this link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip). Then measure the perplexity with a stride of 512 and a max input length of 2048:
```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
| [ChatGLM3-6B-Base][1] | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
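For reference, here is a minimal sketch of the strided sliding-window evaluation described above. It is not the code from `tests/perplexity.cpp` added by this commit (that file's diff is not shown on this page); the `eval_logits` callback and all names below are hypothetical stand-ins for any model that returns per-position logits.

```cpp
#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

// Hypothetical callback: given a token window, return one vocab-sized row of
// logits per position, where row i predicts window[i + 1].
using EvalLogits = std::function<std::vector<std::vector<float>>(const std::vector<int> &)>;

static double neg_log_prob(const std::vector<float> &logits, int target) {
    // -log softmax(logits)[target], computed in a numerically stable way
    float max_logit = *std::max_element(logits.begin(), logits.end());
    double sum_exp = 0.0;
    for (float v : logits) sum_exp += std::exp(v - max_logit);
    return -(logits[target] - max_logit - std::log(sum_exp));
}

double strided_perplexity(const std::vector<int> &tokens, size_t max_length, size_t stride,
                          const EvalLogits &eval_logits) {
    double total_nll = 0.0;
    size_t num_scored = 0;
    size_t prev_end = 0;
    for (size_t begin = 0; begin < tokens.size(); begin += stride) {
        size_t end = std::min(begin + max_length, tokens.size());
        // score only tokens not already scored by the previous window (and never the
        // first token of a window, which has no preceding context)
        size_t target_len = end - std::max(prev_end, begin + 1);
        std::vector<int> window(tokens.begin() + begin, tokens.begin() + end);
        auto logits = eval_logits(window);
        for (size_t i = window.size() - target_len - 1; i + 1 < window.size(); i++) {
            total_nll += neg_log_prob(logits[i], window[i + 1]);
            num_scored++;
        }
        prev_end = end;
        if (end == tokens.size()) break;
    }
    return std::exp(total_nll / num_scored); // perplexity = exp(mean NLL)
}
```

With `-s 512 -l 2048` as in the command above, each window after the first reuses 1536 tokens purely as context and contributes only its final 512 tokens to the score.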
## Development
**Unit Test & Benchmark**
33 changes: 22 additions & 11 deletions chatglm.cpp
@@ -81,15 +81,20 @@ std::string to_string(ggml_tensor *tensor, bool with_data) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
auto ptr = (char *)tensor->data + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] +
i0 * tensor->nb[0];
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
oss << (i0 > 0 ? ", " : "");
if (tensor->type == GGML_TYPE_I32) {
oss << *(int *)ptr;
} else {
CHATGLM_THROW << "unimplemented";
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
} else {
CHATGLM_THROW << "unimplemented";
}
oss << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << (i0 > 0 ? ", " : "") << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << "]";
}
@@ -496,12 +501,11 @@ BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config, size_t mem_size,
#endif
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,
int n_threads, bool is_decoding) {
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
ctx_.gf = {};

int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -513,7 +517,7 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx);
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;

ggml_build_forward_expand(&ctx_.gf, lm_logits);
@@ -527,6 +531,13 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_graph_print(&ctx_.gf);
#endif

return lm_logits;
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);

int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

23 changes: 14 additions & 9 deletions chatglm.h
@@ -855,7 +855,11 @@ class BaseModelForCausalLM {
virtual ~BaseModelForCausalLM() = default;

virtual void load(ModelLoader &loader) = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const = 0;

ggml_tensor *forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx, int n_threads,
bool is_decoding);

std::vector<int> generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
BaseStreamer *streamer = nullptr);
@@ -896,10 +900,11 @@ class BasicModelForCausalLM : public BaseModelForCausalLM {
~BasicModelForCausalLM() { to_cpu(); }

public:
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
// NOTE: only compute next token logits for decoding
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1011,7 +1016,7 @@ class ChatGLMForCausalLM : public BasicModelForCausalLM<ChatGLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1024 * MB; // 2k context
};

@@ -1061,7 +1066,7 @@ class ChatGLM2ForCausalLM : public BasicModelForCausalLM<ChatGLM2Model> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
};

@@ -1161,7 +1166,7 @@ class Baichuan7BForCausalLM : public BasicModelForCausalLM<Baichuan7BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1187,7 +1192,7 @@ class Baichuan13BForCausalLM : public BasicModelForCausalLM<Baichuan13BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1248,7 +1253,7 @@ class InternLMForCausalLM : public BasicModelForCausalLM<InternLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

2 changes: 1 addition & 1 deletion chatglm_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
import chatglm_cpp._C as _C
from chatglm_cpp._C import ChatMessage

__version__ = "0.3.1.dev"
__version__ = "0.3.1"


@dataclass
7 changes: 5 additions & 2 deletions chatglm_pybind.cpp
@@ -27,8 +27,11 @@ class PyBaseModelForCausalLM : public BaseModelForCausalLM {
using BaseModelForCausalLM::BaseModelForCausalLM;

void load(ModelLoader &loader) override { PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load, loader); }
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx)

ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx,
is_decoding)
}
};

45 changes: 23 additions & 22 deletions main.cpp
@@ -40,28 +40,29 @@ struct Args {
};

static void usage(const std::string &prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH model path (default: chatglm-ggml.bin)\n"
<< " --mode inference mode chosen from {chat, generate} (default: chat)\n"
<< " --sync synchronized generation without streaming\n"
<< " -p, --prompt PROMPT prompt to start generation with (default: 你好)\n"
<< " --pp, --prompt_path path to the plain text file that stores the prompt\n"
<< " -s, --system SYSTEM system message to set the behavior of the assistant\n"
<< " --sp, --system_path path to the plain text file that stores the system message\n"
<< " -i, --interactive run in interactive mode\n"
<< " -l, --max_length N max total length including prompt and output (default: 2048)\n"
<< " --max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens\n"
<< " -c, --max_context_length N\n"
<< " max context length (default: 512)\n"
<< " --top_k N top-k sampling (default: 0)\n"
<< " --top_p N top-p sampling (default: 0.7)\n"
<< " --temp N temperature (default: 0.95)\n"
<< " --repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)\n"
<< " -t, --threads N number of threads for inference\n"
<< " -v, --verbose display verbose output including config/system/performance info\n";
std::cout << "Usage: " << prog << R"( [options]
options:
-h, --help show this help message and exit
-m, --model PATH model path (default: chatglm-ggml.bin)
--mode inference mode chosen from {chat, generate} (default: chat)
--sync synchronized generation without streaming
-p, --prompt PROMPT prompt to start generation with (default: 你好)
--pp, --prompt_path path to the plain text file that stores the prompt
-s, --system SYSTEM system message to set the behavior of the assistant
--sp, --system_path path to the plain text file that stores the system message
-i, --interactive run in interactive mode
-l, --max_length N max total length including prompt and output (default: 2048)
--max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens
-c, --max_context_length N
max context length (default: 512)
--top_k N top-k sampling (default: 0)
--top_p N top-p sampling (default: 0.7)
--temp N temperature (default: 0.95)
--repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
-t, --threads N number of threads for inference
-v, --verbose display verbose output including config/system/performance info
)";
}

static std::string read_text(std::string path) {
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -13,10 +13,10 @@ authors = [
maintainers = [
{name = "Jiahao Li", email = "[email protected]"},
]
description = "C++ implementation of ChatGLM-6B & ChatGLM2-6B"
description = "C++ implementation of ChatGLM family models and more LLMs"
readme = "README.md"
requires-python = ">=3.7"
keywords = ["ChatGLM", "ChatGLM2", "Large Language Model"]
keywords = ["ChatGLM", "ChatGLM2", "ChatGLM3", "Large Language Model"]
license = {text = "MIT License"}
classifiers = [
"Development Status :: 3 - Alpha",
@@ -27,6 +27,7 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dynamic = ["version"]

1 change: 1 addition & 0 deletions setup.py
@@ -51,6 +51,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DCHATGLM_ENABLE_PYBIND=ON",
f"-DCHATGLM_ENABLE_EXAMPLES=OFF",
f"-DBUILD_SHARED_LIBS=OFF",
]
build_args = []