diff --git a/README.md b/README.md index 79b06f3..6d6b5ec 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Lily wanted to play with the ball, but it was too high up in the sky. She tried Lily found a stick and tried to hit the ball. But the stick was too short. She tried again and again, but she couldn't reach it. She felt sad. Suddenly, a kind man came by and saw Lily. He asked her what was wrong. Lily told him about the ball. The man smiled and said, "I have a useful idea!" He took out a long stick and used it to knock the ball down. Lily was so happy! She thanked the man and they played together in the sunshine. -achieved: 701.587 tok/s +achieved: 712.903 tok/s ``` ## Run Llama 2 from Hugging Face diff --git a/src/attention.zig b/src/attention.zig index 28871cf..32cbc26 100644 --- a/src/attention.zig +++ b/src/attention.zig @@ -67,7 +67,7 @@ pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint, sequence_lengt }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.input_buffer.deinit(); self.output_buffer.deinit(); self.query_buffer.deinit(); @@ -76,7 +76,7 @@ pub fn deinit(self: *const Self) void { self.allocator.free(self.scores); } -pub fn forward(self: *const Self, layer: usize, position: usize) void { +pub fn forward(self: Self, layer: usize, position: usize) void { const weights = self.checkpoint.weights; const query_matrix = weights.attention_query_matrices.slice(layer); const key_matrix = weights.attention_key_matrices.slice(layer); @@ -99,7 +99,7 @@ pub fn forward(self: *const Self, layer: usize, position: usize) void { } // Rotary positional embeddings: https://arxiv.org/abs/2104.09864 -fn computeRoPE(self: *const Self, position: usize, key_buffer: Tensor(2)) void { +fn computeRoPE(self: Self, position: usize, key_buffer: Tensor(2)) void { @setFloatMode(.Optimized); std.debug.assert(self.query_buffer.values.len % key_buffer.values.len == 0); @@ -133,7 +133,7 @@ fn computeRoPE(self: *const Self, position: usize, key_buffer: Tensor(2)) void { } // Grouped-query attention: https://arxiv.org/abs/2305.13245v1 -fn computeGQA(self: *const Self, layer: usize, current_position: usize, head: usize) void { +fn computeGQA(self: Self, layer: usize, current_position: usize, head: usize) void { @setFloatMode(.Optimized); const query_vector = self.query_buffer.slice(head); diff --git a/src/chat.zig b/src/chat.zig index 06f88e3..87923b9 100644 --- a/src/chat.zig +++ b/src/chat.zig @@ -38,7 +38,7 @@ pub fn init(allocator: std.mem.Allocator, args: ChatArgs) !Self { }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.transformer.deinit(); self.tokenizer.deinit(); self.sampler.deinit(); diff --git a/src/checkpoint.zig b/src/checkpoint.zig index 971802e..c8e0087 100644 --- a/src/checkpoint.zig +++ b/src/checkpoint.zig @@ -59,7 +59,7 @@ pub fn init(allocator: std.mem.Allocator, model_path: []const u8) !Self { } // https://github.com/karpathy/llama2.c/blob/d9862069e7ef665fe6309e3c17398ded2f121bf5/export.py#L132 -pub fn writeV1(self: *const Self, allocator: std.mem.Allocator, model_path: []const u8) !void { +pub fn writeV1(self: Self, allocator: std.mem.Allocator, model_path: []const u8) !void { const path = try std.fs.path.join( allocator, &[_][]const u8{ model_path, "checkpoint_v1.bin" }, @@ -403,7 +403,7 @@ fn readLegacy(allocator: std.mem.Allocator, file: std.fs.File) !Self { }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.weights.token_embedding_vectors.deinit(); self.weights.attention_norm_vectors.deinit(); self.weights.attention_query_matrices.deinit(); diff --git a/src/ffn.zig b/src/ffn.zig index c4ff36b..711740a 100644 --- a/src/ffn.zig +++ b/src/ffn.zig @@ -38,7 +38,7 @@ pub fn init(allocator: std.mem.Allocator, checkpoint: Checkpoint) !Self { }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.input_buffer.deinit(); self.gate_buffer.deinit(); self.hidden_buffer.deinit(); @@ -46,7 +46,7 @@ pub fn deinit(self: *const Self) void { } // SwiGLU activation function: https://arxiv.org/abs/2002.05202 -pub fn forward(self: *const Self, layer: usize) void { +pub fn forward(self: Self, layer: usize) void { @setFloatMode(.Optimized); const weights = self.checkpoint.weights; diff --git a/src/generator.zig b/src/generator.zig index 34b521f..dd07e63 100644 --- a/src/generator.zig +++ b/src/generator.zig @@ -40,7 +40,7 @@ pub fn init(allocator: std.mem.Allocator, args: GeneratorArgs) !Self { }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.transformer.deinit(); self.tokenizer.deinit(); self.sampler.deinit(); diff --git a/src/quantized_tensor.zig b/src/quantized_tensor.zig index 126966d..b41bd64 100644 --- a/src/quantized_tensor.zig +++ b/src/quantized_tensor.zig @@ -30,14 +30,14 @@ pub fn QuantizedTensor(comptime n_dims: comptime_int) type { }; } - pub fn deinit(self: *const Self) void { + pub fn deinit(self: Self) void { if (self.allocator) |allocator| { allocator.free(self.values); allocator.free(self.scaling_factors); } } - pub fn slice(self: *const Self, index: usize) !QuantizedTensor(n_dims - 1) { + pub fn slice(self: Self, index: usize) !QuantizedTensor(n_dims - 1) { comptime if (n_dims < 2) @compileError("n_dims < 2"); const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims)); @@ -58,7 +58,7 @@ pub fn QuantizedTensor(comptime n_dims: comptime_int) type { } pub fn computeMatrixVectorMultiplication( - self: *const Self, + self: Self, input: anytype, output: anytype, ) !void { @@ -67,7 +67,7 @@ pub fn QuantizedTensor(comptime n_dims: comptime_int) type { } } - fn computeScalarProduct(self: *const Self, other: anytype) !f32 { + fn computeScalarProduct(self: Self, other: anytype) !f32 { // https://github.com/karpathy/llama2.c/pull/312#issuecomment-1684140683 if (self.group_size == 32) { return _computeScalarProduct(32, self, other); diff --git a/src/sampler.zig b/src/sampler.zig index 5855fec..9dfa5c2 100644 --- a/src/sampler.zig +++ b/src/sampler.zig @@ -23,7 +23,7 @@ pub fn init(allocator: std.mem.Allocator, args: anytype, vocab_size: usize) !Sel }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.allocator.free(self.probability_index_pairs_buffer); } diff --git a/src/tensor.zig b/src/tensor.zig index af2304d..47d2a60 100644 --- a/src/tensor.zig +++ b/src/tensor.zig @@ -20,25 +20,25 @@ pub fn Tensor(comptime n_dims: comptime_int) type { }; } - pub fn deinit(self: *const Self) void { + pub fn deinit(self: Self) void { if (self.allocator) |allocator| { allocator.free(self.values); } } - pub fn read(self: *const Self, file: std.fs.File) !void { + pub fn read(self: Self, file: std.fs.File) !void { const values: [*]u8 = @ptrCast(self.values); try file.reader().readNoEof(values[0 .. self.values.len * @sizeOf(f32)]); } - pub fn write(self: *const Self, file: std.fs.File) !void { + pub fn write(self: Self, file: std.fs.File) !void { const values: [*]u8 = @ptrCast(self.values); try file.writer().writeAll(values[0 .. self.values.len * @sizeOf(f32)]); } - pub fn slice(self: *const Self, index: usize) Tensor(n_dims - 1) { + pub fn slice(self: Self, index: usize) Tensor(n_dims - 1) { comptime if (n_dims < 2) @compileError("n_dims < 2"); const n_sub_values = @reduce(.Mul, @as(@Vector(n_dims - 1, usize), self.sub_dims)); @@ -50,7 +50,7 @@ pub fn Tensor(comptime n_dims: comptime_int) type { }; } - pub fn add(self: *const Self, other: anytype) void { + pub fn add(self: Self, other: anytype) void { @setFloatMode(.Optimized); std.debug.assert(self.values.len == other.values.len); @@ -60,17 +60,13 @@ pub fn Tensor(comptime n_dims: comptime_int) type { } } - pub fn computeMatrixVectorMultiplication( - self: *const Self, - input: anytype, - output: anytype, - ) void { + pub fn computeMatrixVectorMultiplication(self: Self, input: anytype, output: anytype) void { for (output.values, 0..) |*value, index| { - value.* = self.slice(index).computeScalarProduct(&input); + value.* = self.slice(index).computeScalarProduct(input); } } - pub fn computeScalarProduct(self: *const Self, other: anytype) f32 { + pub fn computeScalarProduct(self: Self, other: anytype) f32 { if (self.values.len % 32 == 0) { return _computeScalarProduct(32, self, other); } @@ -87,7 +83,7 @@ pub fn Tensor(comptime n_dims: comptime_int) type { } // Pre-normalization using RMSNorm: https://arxiv.org/abs/1910.07467 - pub fn computeRMSNorm(self: *const Self, weight: anytype, output: anytype) void { + pub fn computeRMSNorm(self: Self, weight: anytype, output: anytype) void { @setFloatMode(.Optimized); std.debug.assert(output.values.len == self.values.len); diff --git a/src/tokenizer.zig b/src/tokenizer.zig index 723dd40..c742635 100644 --- a/src/tokenizer.zig +++ b/src/tokenizer.zig @@ -54,7 +54,7 @@ pub fn init(allocator: std.mem.Allocator, model_path: []const u8, vocab_size: us }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { for (self.vocab) |word| { self.allocator.free(word); } @@ -64,11 +64,7 @@ pub fn deinit(self: *const Self) void { self.allocator.free(self.sorted_vocab); } -pub fn encode( - self: *const Self, - allocator: std.mem.Allocator, - text: []const u8, -) ![]usize { +pub fn encode(self: Self, allocator: std.mem.Allocator, text: []const u8) ![]usize { var double_word_buffer = try allocator.alloc(u8, self.max_word_length * 2); defer allocator.free(double_word_buffer); @@ -90,14 +86,14 @@ pub fn encode( return merged_tokens_copy; } -pub fn decode(self: *const Self, token: usize, bos: bool) []const u8 { +pub fn decode(self: Self, token: usize, bos: bool) []const u8 { const word = self.vocab[token]; // https://github.com/karpathy/llama2.c/blob/7ac65cb2c2b169050747be92011b7bebdd1b4544/run.c#L425 return if (bos and std.ascii.isWhitespace(word[0])) word[1..] else word; } -fn encodeCodepoints(self: *const Self, allocator: std.mem.Allocator, text: []const u8) ![]usize { +fn encodeCodepoints(self: Self, allocator: std.mem.Allocator, text: []const u8) ![]usize { var tokens = std.ArrayList(usize).init(allocator); errdefer tokens.deinit(); @@ -125,7 +121,7 @@ fn encodeCodepoints(self: *const Self, allocator: std.mem.Allocator, text: []con return tokens.toOwnedSlice(); } -fn mergeBestWordPair(self: *const Self, tokens: []usize, double_word_buffer: []u8) bool { +fn mergeBestWordPair(self: Self, tokens: []usize, double_word_buffer: []u8) bool { if (tokens.len < 1) { return false; } @@ -168,7 +164,7 @@ fn mergeBestWordPair(self: *const Self, tokens: []usize, double_word_buffer: []u return false; } -fn lookupToken(self: *const Self, word: []const u8) ?usize { +fn lookupToken(self: Self, word: []const u8) ?usize { var left: usize = 0; var right = self.sorted_vocab.len; diff --git a/src/transformer.zig b/src/transformer.zig index 3895117..b86fc96 100644 --- a/src/transformer.zig +++ b/src/transformer.zig @@ -55,7 +55,7 @@ pub fn init( }; } -pub fn deinit(self: *const Self) void { +pub fn deinit(self: Self) void { self.checkpoint.deinit(); self.attention.deinit(); self.ffn.deinit(); @@ -63,7 +63,7 @@ pub fn deinit(self: *const Self) void { self.output_buffer.deinit(); } -pub fn forward(self: *const Self, token: usize, position: usize) void { +pub fn forward(self: Self, token: usize, position: usize) void { const weights = self.checkpoint.weights; @memcpy(self.hidden_buffer.values, weights.token_embedding_vectors.slice(token).values);