forked from ggerganov/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 2
/
test-barrier.cpp
94 lines (71 loc) · 2.63 KB
/
test-barrier.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include <chrono>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>
#define MAX_NARGS 2
int main(int argc, char *argv[]) {
int n_threads = 4;
int n_rounds = 100;
if (argc > 1) {
n_threads = std::atoi(argv[1]);
}
if (argc > 2) {
n_rounds = std::atoi(argv[2]);
}
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
// Create graph
struct ggml_cgraph * gf = ggml_new_graph(ctx);
// Lots of small, parallel ops where barriers in between will dominate
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
for (int i = 0; i < 1000; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
out = ggml_mul_mat(ctx, a, out);
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
out = ggml_mul_mat(ctx, d, out);
}
ggml_build_forward_expand(gf, out);
int n_nodes = ggml_graph_n_nodes(gf);
// Create threadpool
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
exit(1);
}
// Create compute plan
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size);
cplan.work_data = work_data.data();
std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads
<< "\n n_nodes: " << n_nodes
<< "\n n_rounds: " << n_rounds
<< "\n";
// ggml_graph_print(gf);
// Warmup
ggml_graph_compute(gf, &cplan);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i=0; i < n_rounds; i++) {
ggml_graph_compute(gf, &cplan);
}
auto t1 = std::chrono::high_resolution_clock::now();
auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
std::cerr << "graph-compute took " << usec << " usec "
<< "\n " << (float) usec / n_rounds << " usec per-iter"
<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
<< "\n";
ggml_threadpool_free(threadpool);
ggml_free(ctx);
return 0;
}