
Commit

Add perplexity evaluation script
li-plus committed Dec 3, 2023
1 parent 8b17a67 commit 32ef183
Showing 8 changed files with 289 additions and 44 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -50,6 +50,13 @@ target_link_libraries(chatglm PUBLIC ggml sentencepiece-static)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE chatglm)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(CHATGLM_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE chatglm ${CHATGLM_OPENMP_TARGET})

# GoogleTest
option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
if (CHATGLM_ENABLE_TESTING)
21 changes: 21 additions & 0 deletions README.md
@@ -628,6 +628,27 @@ InternLM-20B:
| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A |
| ms/token (CUDA @ V100 SXM2) | 21.6 | 23.2 | 25.0 | 25.9 | 33.4 | N/A |
## Model Quality
We measure model quality by evaluating the perplexity over the WikiText-2 test dataset, following the strided sliding window strategy in https://huggingface.co/docs/transformers/perplexity. Lower perplexity usually indicates a better model.
Download and unzip the dataset ([link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip)). Measure the perplexity with a stride of 512 and max input length of 2048:
```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
| [ChatGLM3-6B-Base][1] | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
| [Baichuan2-7B-Base][2]  |       |       |       |       |       |       |
| [Baichuan2-13B-Base][3] |       |       |       |       |       |       |
| [InternLM-7B][4]        |       |       |       |       |       |       |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
[2]: https://huggingface.co/baichuan-inc/Baichuan2-7B-Base
[3]: https://huggingface.co/baichuan-inc/Baichuan2-13B-Base
[4]: https://huggingface.co/internlm/internlm-7b
## Development
**Unit Test & Benchmark**
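The README hunk above references the strided sliding-window strategy from the Hugging Face perplexity guide. As a rough illustration of that strategy (not the contents of `tests/perplexity.cpp`, whose diff is not loaded on this page), the sketch below scores each token with as much left context as the window allows; the `LogitsFn` callback and its flattened `[window_size x vocab_size]` logits layout are assumptions made for this example.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical callback: given a window of token ids, return the logits for every
// position, flattened row-major as [window_size x vocab_size].
using LogitsFn = std::function<std::vector<float>(const std::vector<int> &)>;

// Strided sliding-window perplexity: slide a max_length window over the token
// stream with step `stride`, and in each window score only the tokens the previous
// window did not cover, so every scored token keeps maximal left context.
static double sliding_window_perplexity(const std::vector<int> &tokens, const LogitsFn &logits_fn,
                                        size_t max_length, size_t stride, size_t vocab_size) {
    double total_nll = 0.0;
    size_t num_scored = 0;
    size_t prev_end = 0;

    for (size_t begin = 0; begin < tokens.size(); begin += stride) {
        const size_t end = std::min(begin + max_length, tokens.size());
        const std::vector<int> window(tokens.begin() + begin, tokens.begin() + end);
        const std::vector<float> logits = logits_fn(window); // window.size() * vocab_size floats

        // Score tokens in [prev_end, end); the very first token has no context to predict it.
        size_t first_scored = window.size() - (end - prev_end);
        first_scored = std::max<size_t>(first_scored, 1);
        for (size_t i = first_scored; i < window.size(); i++) {
            const float *row = &logits[(i - 1) * vocab_size]; // logits that predict window[i]
            // negative log-softmax of the target token
            const float max_logit = *std::max_element(row, row + vocab_size);
            double sum_exp = 0.0;
            for (size_t v = 0; v < vocab_size; v++) {
                sum_exp += std::exp(row[v] - max_logit);
            }
            total_nll -= (row[window[i]] - max_logit) - std::log(sum_exp);
            num_scored++;
        }

        prev_end = end;
        if (end == tokens.size()) {
            break;
        }
    }
    return std::exp(total_nll / num_scored);
}
```

With `-s 512 -l 2048`, every scored token after the first window is conditioned on at least 1536 preceding tokens; a smaller stride raises the average context (and typically lowers the reported perplexity) at the cost of more forward passes.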
33 changes: 22 additions & 11 deletions chatglm.cpp
@@ -81,15 +81,20 @@ std::string to_string(ggml_tensor *tensor, bool with_data) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
auto ptr = (char *)tensor->data + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] +
i0 * tensor->nb[0];
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
oss << (i0 > 0 ? ", " : "");
if (tensor->type == GGML_TYPE_I32) {
oss << *(int *)ptr;
} else {
CHATGLM_THROW << "unimplemented";
float val;
if (tensor->type == GGML_TYPE_F32) {
val = *(float *)ptr;
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *)ptr);
} else {
CHATGLM_THROW << "unimplemented";
}
oss << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << (i0 > 0 ? ", " : "") << std::setw(7) << std::fixed << std::setprecision(4) << val;
}
oss << "]";
}
@@ -496,12 +501,11 @@ BaseModelForCausalLM::BaseModelForCausalLM(ModelConfig config, size_t mem_size,
#endif
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *BaseModelForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,
int n_threads, bool is_decoding) {
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
ctx_.gf = {};

int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -513,7 +517,7 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx);
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;

ggml_build_forward_expand(&ctx_.gf, lm_logits);
@@ -527,6 +531,13 @@ int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids,
ggml_graph_print(&ctx_.gf);
#endif

return lm_logits;
}

int BaseModelForCausalLM::generate_next_token(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
int n_past, int n_ctx) {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);

int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

23 changes: 14 additions & 9 deletions chatglm.h
@@ -855,7 +855,11 @@ class BaseModelForCausalLM {
virtual ~BaseModelForCausalLM() = default;

virtual void load(ModelLoader &loader) = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const = 0;
virtual ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const = 0;

ggml_tensor *forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx, int n_threads,
bool is_decoding);

std::vector<int> generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
BaseStreamer *streamer = nullptr);
@@ -896,10 +900,11 @@ class BasicModelForCausalLM : public BaseModelForCausalLM {
~BasicModelForCausalLM() { to_cpu(); }

public:
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
// NOTE: only compute next token logits for decoding
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1011,7 +1016,7 @@ class ChatGLMForCausalLM : public BasicModelForCausalLM<ChatGLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1024 * MB; // 2k context
};

@@ -1061,7 +1066,7 @@ class ChatGLM2ForCausalLM : public BasicModelForCausalLM<ChatGLM2Model> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB; // 2k context
static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
};

@@ -1161,7 +1166,7 @@ class Baichuan7BForCausalLM : public BasicModelForCausalLM<Baichuan7BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1187,7 +1192,7 @@ class Baichuan13BForCausalLM : public BasicModelForCausalLM<Baichuan13BModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

@@ -1248,7 +1253,7 @@ class InternLMForCausalLM : public BasicModelForCausalLM<InternLMModel> {
StateDict state_dict() const;

public:
static constexpr size_t MEM_SIZE = 512 * MB;
static constexpr size_t MEM_SIZE = 1280 * MB;
static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

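The new `is_decoding` flag and `forward_graph_compute()` declared in this header are what make full-sequence scoring possible: when `is_decoding` is false, `BasicModelForCausalLM::forward()` skips the view that keeps only the last position, so logits for every input position survive. A hypothetical caller could look like the sketch below; the real `tests/perplexity.cpp` is not loaded on this page, and the argument choices (`n_past = 0`, `n_threads = 0` for the library default) plus the `ne[1]` interpretation are assumptions for illustration.

```cpp
#include "chatglm.h"

#include <vector>

// Sketch: run one window of tokens through the model and keep per-position logits.
// Assumes `model` has already been loaded from a converted GGML file.
void score_window(chatglm::BaseModelForCausalLM &model, const std::vector<int> &window) {
    // is_decoding = false -> logits for every input position are kept;
    // generate_next_token() passes true and only needs the last position's row.
    ggml_tensor *lm_logits = model.forward_graph_compute(window, /*n_past=*/0,
                                                         /*n_ctx=*/(int)window.size(),
                                                         /*n_threads=*/0, /*is_decoding=*/false);

    const int vocab_size = lm_logits->ne[0];    // same convention as generate_next_token()
    const int num_positions = lm_logits->ne[1]; // one row of logits per input token
    const float *logits = (const float *)lm_logits->data;

    // ... accumulate the negative log-likelihood of window[i] under row i - 1 of `logits` ...
    (void)vocab_size;
    (void)num_positions;
    (void)logits;
}
```

The MEM_SIZE bumps in the same header (512 MB to 1280 MB for several models) are consistent with this change: keeping logits for a full context window presumably needs a larger compute buffer than decoding one token at a time.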
7 changes: 5 additions & 2 deletions chatglm_pybind.cpp
@@ -27,8 +27,11 @@ class PyBaseModelForCausalLM : public BaseModelForCausalLM {
using BaseModelForCausalLM::BaseModelForCausalLM;

void load(ModelLoader &loader) override { PYBIND11_OVERLOAD_PURE(void, PyBaseModelForCausalLM, load, loader); }
ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx)

ggml_tensor *forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx,
bool is_decoding) const override {
PYBIND11_OVERLOAD_PURE(ggml_tensor *, PyBaseModelForCausalLM, forward, ctx, input_ids, n_past, n_ctx,
is_decoding)
}
};

45 changes: 23 additions & 22 deletions main.cpp
@@ -40,28 +40,29 @@ struct Args {
};

static void usage(const std::string &prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH model path (default: chatglm-ggml.bin)\n"
<< " --mode inference mode chosen from {chat, generate} (default: chat)\n"
<< " --sync synchronized generation without streaming\n"
<< " -p, --prompt PROMPT prompt to start generation with (default: 你好)\n"
<< " --pp, --prompt_path path to the plain text file that stores the prompt\n"
<< " -s, --system SYSTEM system message to set the behavior of the assistant\n"
<< " --sp, --system_path path to the plain text file that stores the system message\n"
<< " -i, --interactive run in interactive mode\n"
<< " -l, --max_length N max total length including prompt and output (default: 2048)\n"
<< " --max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens\n"
<< " -c, --max_context_length N\n"
<< " max context length (default: 512)\n"
<< " --top_k N top-k sampling (default: 0)\n"
<< " --top_p N top-p sampling (default: 0.7)\n"
<< " --temp N temperature (default: 0.95)\n"
<< " --repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)\n"
<< " -t, --threads N number of threads for inference\n"
<< " -v, --verbose display verbose output including config/system/performance info\n";
std::cout << "Usage: " << prog << R"( [options]
options:
-h, --help show this help message and exit
-m, --model PATH model path (default: chatglm-ggml.bin)
--mode inference mode chosen from {chat, generate} (default: chat)
--sync synchronized generation without streaming
-p, --prompt PROMPT prompt to start generation with (default: 你好)
--pp, --prompt_path path to the plain text file that stores the prompt
-s, --system SYSTEM system message to set the behavior of the assistant
--sp, --system_path path to the plain text file that stores the system message
-i, --interactive run in interactive mode
-l, --max_length N max total length including prompt and output (default: 2048)
--max_new_tokens N max number of tokens to generate, ignoring the number of prompt tokens
-c, --max_context_length N
max context length (default: 512)
--top_k N top-k sampling (default: 0)
--top_p N top-p sampling (default: 0.7)
--temp N temperature (default: 0.95)
--repeat_penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
-t, --threads N number of threads for inference
-v, --verbose display verbose output including config/system/performance info
)";
}

static std::string read_text(std::string path) {
(Diffs for the remaining changed files, including tests/perplexity.cpp, were not loaded on this page.)
